package com.bigdata.search;

import java.util.Arrays;
import java.util.Map;

import org.apache.log4j.Logger;

import com.bigdata.btree.ITupleSerializer;
import com.bigdata.btree.keys.KV;
import com.bigdata.btree.proc.LongAggregator;

/**
 * A buffer holding tokens extracted from one or more documents / fields.
 * Each entry in the buffer corresponds to the {@link TermFrequencyData}
 * extracted from a field of some document. When the buffer overflows it is
 * {@link #flush() flushed}, writing on the indices.
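 * <p>
 * A minimal usage sketch (hypothetical; it assumes {@code Long} document
 * identifiers and an existing {@link FullTextIndex} instance named
 * {@code textIndex}, and supplies tokens directly rather than via an
 * analyzer):
 *
 * <pre>
 * final TokenBuffer&lt;Long&gt; buf = new TokenBuffer&lt;Long&gt;(1000, textIndex);
 *
 * // all tokens for a given {docId,fieldId} must be added consecutively.
 * buf.add(12L, 0, "hello"); // fieldId = 0
 * buf.add(12L, 0, "world"); // same document, same field
 *
 * // write whatever remains in the buffer onto the indices.
 * buf.flush();
 * </pre>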
 * 
 * @param <V> The generic type of the document identifier.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class TokenBuffer<V extends Comparable<V>> {

    final private static Logger log = Logger.getLogger(TokenBuffer.class);

    /** The object on which the buffer will write when it overflows. */
    private FullTextIndex<V> textIndexer;
    
    /** The capacity of the {@link #buffer}. */
    private final int capacity;
    
    /** Each entry models a field of some document. */
    private final TermFrequencyData<V>[] buffer;
    
    /** #of entries in the {@link #buffer}. */
    private int count = 0;
    
    /** #of distinct {docId} values in the {@link #buffer}. */
    private int ndocs;
    
    /** #of distinct {docId,fieldId} values in the {@link #buffer}. */
    private int nfields;
    
    /** #of distinct {docId,fieldId,termText} tuples.*/
    private int nterms;
    
    /** The last observed docId and null if none observed. */
    private V lastDocId;

    /** The last observed fieldId and -1 if none observed. */
    private long lastFieldId;

    /**
     * Ctor.
     * @param capacity
     *            The #of distinct {document,field} tuples that can be held in
     *            the buffer before it will overflow. The buffer will NOT
     *            overflow until you exceed this capacity.
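     *            <p>
     *            A hypothetical illustration, assuming {@code Long} document
     *            identifiers and an existing {@link FullTextIndex} named
     *            {@code textIndex}:
     * 
     * <pre>
     * // capacity of two distinct {docId,fieldId} entries.
     * final TokenBuffer&lt;Long&gt; buf = new TokenBuffer&lt;Long&gt;(2, textIndex);
     * buf.add(1L, 0, "a"); // 1st distinct {docId,fieldId} entry.
     * buf.add(2L, 0, "b"); // 2nd distinct {docId,fieldId} entry.
     * buf.add(3L, 0, "c"); // exceeds the capacity: the buffer is flushed
     *                      // before the 3rd entry is started.
     * </pre>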
     * 
     * @param textIndexer
     *            The object on which the buffer will write when it overflows or
     *            is {@link #flush()}ed.
     */
    @SuppressWarnings("unchecked")
    public TokenBuffer(final int capacity, final FullTextIndex<V> textIndexer) {
        
        if (capacity <= 0)
            throw new IllegalArgumentException();

        if (textIndexer == null)
            throw new IllegalArgumentException();
        
        this.capacity = capacity;
        
        this.textIndexer = textIndexer;
        
        buffer = new TermFrequencyData[capacity];
        
        reset();
        
    }
    
    /**
     * Discards all data in the buffer and resets it to a clean state.
     */
    public void reset() {
        
        for (int i = 0; i < count; i++) {

            buffer[i] = null;

        }
        
        count = 0;
        
        ndocs = 0;
        
        nfields = 0;
        
        nterms = 0;

        lastDocId = null;
        
        lastFieldId = -1;
        
    }
    
    /**
     * The #of entries in the buffer.
     */
    public int size() {
        
        return count;
        
    }

    /**
     * Return the {@link TermFrequencyData} for the specified index.
     * 
     * @param index
     *            The index in [0:count).
     *            
     * @return The {@link TermFrequencyData} at that index.
     * 
     * @throws IndexOutOfBoundsException
     */
    public TermFrequencyData<V> get(final int index) {
        
        if (index < 0 || index >= count)
            throw new IndexOutOfBoundsException(); 
        
        return buffer[index];
        
    }
    
    /**
     * Adds another token to the current field of the current document. If
     * either the field or the document identifier changes, then begins a
     * new field and possibly a new document. If the buffer is full then it
     * will be {@link #flush()}ed before beginning a new field.
     * <p>
     * Note: This method is NOT thread-safe.
     * <p>
     * Note: There is an assumption that the caller will process all tokens
     * for a given field in the same document at once. Failure to do this
     * will lead to only part of the term-frequency distribution for the
     * field being captured by the indices.
     * 
     * @param docId
     *            The document identifier.
     * @param fieldId
     *            The field identifier.
     * @param token
     *            The token.
     */
    public void add(final V docId, final int fieldId, final String token) {

        if (log.isDebugEnabled()) {

            log.debug("docId=" + docId + ", fieldId=" + fieldId + ", token="
                    + token);

        }

        final boolean newField;

        if (count == 0) {

            // first tuple in a clean buffer.
            ndocs++;

            nfields++;

            lastDocId = docId;

            lastFieldId = fieldId;

            newField = true;

        } else {

            if (lastDocId != docId) {

                // start of a new document.
                ndocs++;

                // also the start of a new field.
                nfields++;

                newField = true;

                // normalize the last term-frequency distribution.
                buffer[count - 1].normalize();

            } else if (lastFieldId != fieldId) {

                // start of a new field in the same document.
                nfields++;

                newField = true;

                // normalize the last term-frequency distribution.
                buffer[count - 1].normalize();

            } else {

                newField = false;

            }

        }

        if (newField && count == capacity) {

            // the buffer is full, so flush it before beginning the new field.
            flush();

        }

        if (newField) {

            // begin a new entry for this {docId,fieldId}.
            buffer[count++] = new TermFrequencyData<V>(docId, fieldId, token);

            nterms++;

        } else {

            // add the token to the current {docId,fieldId} entry.
            if (buffer[count - 1].add(token)) {

                nterms++;

            }

        }

        lastDocId = docId;

        lastFieldId = fieldId;

    }

    /**
     * Write any buffered data on the indices.
     * <p>
     * Note: The writes on the terms index are scattered since the key for the
     * index is {term, docId, fieldId}. This method will batch up and then
     * apply a set of updates, but the total operation is not atomic.
     * Therefore search results which are concurrent with indexing may not
     * have access to the full data for concurrently indexed documents. This
     * issue may be resolved by allowing the indexer to write ahead and using
     * a historical commit time for the search.
     * <p>
     * Note: If a document is pre-existing, then the existing data for that
     * document MUST be removed unless you know that the fields to be found in
     * the document will not have changed (they may have different contents,
     * but the same fields exist in the old and new versions of the document).
     */
    public void flush() {

        if (nterms == 0) {

            reset();

            return;

        }

        if (log.isInfoEnabled())
            log.info("count=" + count + ", ndocs=" + ndocs + ", nfields="
                    + nfields + ", nterms=" + nterms);

        // Normalize the last document/field in the buffer.
        buffer[count - 1].normalize();

        /*
         * Generate keys[] and vals[].
         */

        // array of correlated key/value tuples.
        final KV[] a = new KV[nterms];

        // Knows how to encode and decode the keys and values.
        @SuppressWarnings("unchecked")
        final ITupleSerializer<ITermDocKey<V>, ITermDocVal> tupleSer = textIndexer
                .getIndex().getIndexMetadata().getTupleSerializer();

        // #of {token,docId,fieldId} tuples generated.
        int n = 0;

        // for each document in the buffer.
        for (int i = 0; i < count; i++) {

            final TermFrequencyData<V> termFreq = buffer[i];

            final V docId = termFreq.docId;

            final int fieldId = termFreq.fieldId;

            // emit {token,docId,fieldId} tuples.
            for (Map.Entry<String, ITermMetadata> e : termFreq.terms.entrySet()) {

                final String termText = e.getKey();

                final ITermMetadata termMetadata = e.getValue();

//                final byte[] key = recordBuilder.getKey(keyBuilder, termText,
//                        false/* successor */, docId, fieldId);

                /*
                 * Note: This wraps both sides of the record together and
                 * passes them into the tupleSerializer so it can examine both
                 * pieces when making its decision on how to encode the
                 * information into the key/val of the index.
                 */
                final ITermDocRecord<V> rec = new ReadOnlyTermDocRecord<V>(
                        termText, docId, fieldId,
                        /* termMetadata.termFreq(), */
                        termMetadata.getLocalTermWeight());

                final byte[] key = tupleSer.serializeKey(rec);

//                final byte[] val = recordBuilder.getValue(buf, termMetadata);
                final byte[] val = tupleSer.serializeVal(rec);

                if (log.isDebugEnabled()) {

                    log.debug("{" + termText + "," + docId + "," + fieldId
                            + "}: #occurrences=" + termMetadata.termFreq()
                            + ", termWeight="
                            + termMetadata.getLocalTermWeight());

                }

                // save in the correlated array.
                a[n++] = new KV(key, val);

            }

        }

        assert n == nterms : "nterms=" + nterms + ", n=" + n;

        // Sort {term,docId,fieldId}:{value} tuples into total index order.
        Arrays.sort(a);

        /*
         * Copy the correlated key:val data into the keys[] and vals[] arrays.
         */

        final byte[][] keys = new byte[nterms][];
        final byte[][] vals = new byte[nterms][];

        for (int i = 0; i < nterms; i++) {

            keys[i] = a[i].key;

            vals[i] = a[i].val;

        }

        // Batch write on the index.
        writeOnIndex(n, keys, vals);

        // Clear the buffer.
        reset();

    }

//    static private class ReadOnlyTermDocKey<V extends Comparable<V>> implements
//            ITermDocKey<V> {
//
//        private final String token;
//
//        private final V docId;
//
//        private final int fieldId;
//
//        private ReadOnlyTermDocKey(final String token, final V docId,
//                final int fieldId) {
//            this.token = token;
//            this.docId = docId;
//            this.fieldId = fieldId;
//        }
//
//        public String getToken() {
//            return token;
//        }
//
//        public V getDocId() {
//            return docId;
//        }
//
//        public int getFieldId() {
//            return fieldId;
//        }
//
//    }

    /**
     * Writes on the index.
     * 
     * @param n
     * @param keys
     * @param vals
     * 
     * @return The #of pre-existing records that were updated.
     */
    protected long writeOnIndex(final int n, final byte[][] keys,
            final byte[][] vals) {

        final LongAggregator resultHandler = new LongAggregator();

        textIndexer.getIndex().submit(0, // fromIndex
                n, // toIndex
                keys, //
                vals, //
                (textIndexer.isOverwrite() //
                ? TextIndexWriteProc.IndexWriteProcConstructor.OVERWRITE
                        : TextIndexWriteProc.IndexWriteProcConstructor.NO_OVERWRITE //
                ), //
                resultHandler //
                );

        return resultHandler.getResult();

    }

    /**
     * Deletes the given tuples from the index (not yet implemented).
     * 
     * @param n
     * @param keys
     * @param vals
     * 
     * @return The #of records that were deleted.
     */
    protected long deleteFromIndex(final int n, final byte[][] keys,
            final byte[][] vals) {

        throw new RuntimeException("implement me");

    }

}




