/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jan 23, 2008
*/
package com.bigdata.search;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexTypeEnum;
import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.btree.raba.codec.EmptyRabaValueCoder;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IResourceLock;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.rdf.lexicon.ITextIndexer.FullTextQuery;
import com.bigdata.relation.AbstractRelation;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.concurrent.ExecutionHelper;
/**
* Full text indexing and search support.
*
* The basic data model consists of documents, fields in documents, and tokens
* extracted by an analyzer from those fields.
*
* The frequency distributions may be normalized to account for a variety of
* effects producing "term weights". For example, normalizing for document
* length or relative frequency of a term in the overall collection. Therefore
* the logical model is:
*
*
*
* token : {docId, freq?, weight?}+
*
*
*
* (For RDF, docId is the term identifier as assigned by the term:id index.)
*
* The freq and weight are optional values that are representative of the kinds
* of statistical data that are kept on a per-token-document basis. The freq is
* the token frequency (the frequency of occurrence of the token in the
* document). The weight is generally a normalized token frequency weight for
* the token in that document in the context of the overall collection.
*
* In fact, we actually represent the data as follows:
*
*
*
* {sortKey(token), weight, docId, fldId} : {freq?, sorted(pos)+}
*
*
*
* That is, there is a distinct entry in the full text B+Tree for each field in
* each document in which a given token was recognized. The text of the token is
* not stored in the key, just the Unicode sort key generated from the token
* text. The value associated with the B+Tree entry is optional - it is simply
* not used unless we are storing statistics for the token-document pair. The
* advantages of this approach are: (a) it reuses the existing B+Tree data
* structures efficiently; (b) we are never faced with the possibility of overflow
* when a token is used in a large number of documents. The entries for the
* token will simply be spread across several leaves in the B+Tree; (c) leading
* key compression makes the resulting B+Tree very efficient; and (d) in a
* scale-out range partitioned index we can load balance the resulting index
* partitions by choosing the partition based on an even token boundary.
*
* A field is any pre-identified text container within a document. Field
* identifiers are integers, so there are 2^32 distinct possible
* field identifiers. It is possible to manage the field identifiers through a
* secondary index, but that has no direct bearing on the structure of the full
* text index itself. Field identifiers appear after the token in the key so that
* queries may be expressed that will be matched against any field in the
* document. Likewise, field identifiers occur before the document identifier in
* the key since we always search across documents (in a search key, the
* document identifier is always {@link Long#MIN_VALUE} and the field identifier
* is always {@link Integer#MIN_VALUE}). There are many applications for fields:
* for example, distinct fields may be used for the title, abstract, and full
* text of a document or for the CDATA section of each distinct element in
* documents corresponding to some DTD. The application is responsible for
* recognizing the fields in the document and producing the appropriate token
* stream, each of which must be tagged by the field.
*
* A query is tokenized, producing a (possibly normalized) token-frequency
* vector. The relevance of documents to the query is generally taken as the
* cosine between the query's and each document's (possibly normalized)
* token-frequency vectors. The main effort of search is assembling a token
* frequency vector for just those documents with which there is an overlap with
* the query. This is done using a key range scan for each token in the query
* against the full text index.
*
*
* fromKey := token, Long.MIN_VALUE
* toKey := successor(token), Long.MIN_VALUE
*
*
* and extracting the appropriate token frequency, normalized token weight, or
* other statistic. When no value is associated with the entry we follow the
* convention of assuming a token frequency of ONE (1) for each document in
* which the token appears.
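*
* For example (illustrative), a query containing the two distinct tokens
* big and data is evaluated as two such key-range scans, one per token:
*
* fromKey := sortKey(big), Long.MIN_VALUE
* toKey := successor(sortKey(big)), Long.MIN_VALUE
* fromKey := sortKey(data), Long.MIN_VALUE
* toKey := successor(sortKey(data)), Long.MIN_VALUE
*
* The per-document statistics read by each scan are aggregated into a single
* hit per document.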
*
* Tokenization is informed by the language code (when declared) and by the
* configured {@link Locale} for the database otherwise. An appropriate
* {@link Analyzer} is chosen based on the language code or {@link Locale} and
* the "document" is broken into a token-frequency distribution (alternatively a
* set of tokens). The same process is used to tokenize queries, and the API
* allows the caller to specify the language code used to select the
* {@link Analyzer} to tokenize the query.
*
* Once the tokens are formed the language code / {@link Locale} used to produce
* the token is discarded (it is not represented in the index). The reason for
* this is that we never utilize the total ordering of the full text index,
* merely the manner in which it groups tokens that map onto the same Unicode
* sort key together. Further, we use only a single Unicode collator
* configuration regardless of the language family in which the token was
* originally expressed. Unlike the collator used by the terms index (which
* often is set at IDENTICAL strength), the collator used by the full text
* index should be chosen such that it makes relatively few distinctions in
* order to increase recall (e.g., set at PRIMARY strength). Since a total order
* over the full text index is not critical from the perspective of its IR
* application, the {@link Locale} for the collator is likewise not critical and
* PRIMARY strength will produce significantly shorter Unicode sort keys.
*
* The term frequency within that literal is an optional property associated
* with each term identifier, as is the computed weight for the token in the
* term.
*
* Note: Documents should be tokenized using an {@link Analyzer} appropriate for
* their declared language code (if any). However, once tokenized, the language
* code is discarded and we perform search purely on the Unicode sort keys
* resulting from the extracted tokens.
*
* Scale-out
*
* Because the first component in the key is the token, both updates (when
* indexing document) and queries (reading against different tokens) will be
* scattered across shards. Therefore it is not necessary to register a split
* handler for the full text index.
*
* @todo The key for the terms index is {term,docId,fieldId}. Since the data are
* not pre-aggregated by {docId,fieldId} we can not easily remove only
* those tuples corresponding to some document (or some field of some
* document).
*
* In order to remove the fields for a document we need to know either
* which fields were indexed for the document and the tokens found in
* those fields and then scatter the removal request (additional space
* requirements) or we need to flood a delete procedure across the terms
* index (expensive).
*
* @todo provide M/R alternatives for indexing or computing/updating global
* weights.
*
* @todo Consider model in which fields are declared and then a "Document" is
* indexed. This lets us encapsulate the "driver" for indexing. The
* "field" can be a String or a Reader, etc.
*
* Note that lucene handles declaration of the data that will be stored
* for a field on a per Document basis {none, character offsets, character
* offsets + token positions}. There is also an option to store the term
* vector itself. Finally, there are options to store, compress+store, or
* not store the field value. You can also choose {None, IndexTokenized,
* IndexUntokenized} and an option dealing with norms.
*
* @todo lucene {@link Analyzer}s may be problematic. For example, it is
* difficult to tokenize numbers. consider replacing the lucene
* analyzer/tokenizer with our own stuff. this might help with
* tokenization of numbers, etc. and with tokenization of native html or
* xml with intact offsets.
*
* @todo lucene analyzers will strip stopwords by default. There should be a
* configuration option to strip out stopwords and another to enable
* stemming. how we do that should depend on the language family.
* Likewise, there should be support for language family specific stopword
* lists and language family specific exclusions.
*
* @todo support more term weighting schemes and make them easy to configure.
*
* @param <V>
* The generic type of the document identifier.
* @author Bryan Thompson
* @version $Id$
*/
public class FullTextIndex<V extends Comparable<V>> extends AbstractRelation {
final private static transient Logger log = Logger
.getLogger(FullTextIndex.class);
/**
* The backing index.
*/
volatile private IIndex ndx;
/**
* The index used to associate term identifiers with tokens parsed from
* documents.
*/
public IIndex getIndex() {
if(ndx == null) {
synchronized (this) {
ndx = getIndex(getNamespace() + "." + NAME_SEARCH);
if (ndx == null)
throw new IllegalStateException();
}
}
return ndx;
}
/**
* Options understood by the {@link FullTextIndex}.
*
* @author Bryan Thompson
*/
public interface Options {
/**
* indexer.overwrite - boolean option (default true) controls the behavior
* when a write is requested on the index and the {term,doc,field} tuple
* which forms the key is already present in the index. When true, the new
* value will be written on the index. When false, the existing value will
* be retained. This option is an optimization which makes sense when the
* corpus (a) only grows; and (b) the content of the documents in the
* corpus never changes. For example, this is true for an RDF database
* since the set of terms only grows and each term is immutable.
*/
String OVERWRITE = FullTextIndex.class.getName() + ".overwrite";
String DEFAULT_OVERWRITE = "true";
/**
* Specify the collator {@link StrengthEnum strength} for the full-text
* index (default {@value StrengthEnum#Primary}).
*
* Note: {@link StrengthEnum#Primary} is generally what you want for a
* full text index as search will consider tokens which differ in case
* and other subtle features to be the same token (a 'match').
*
* @see KeyBuilder.Options#STRENGTH
*/
String INDEXER_COLLATOR_STRENGTH = FullTextIndex.class.getName()
+ ".collator.strength";
String DEFAULT_INDEXER_COLLATOR_STRENGTH = StrengthEnum.Primary.toString();
/**
* The maximum time in milliseconds that the search engine will await
* completion of the tasks reading on each of the query terms (default
* {@value #DEFAULT_INDEXER_TIMEOUT}). A value of ZERO (0) means NO
* timeout and is equivalent to a value of {@link Long#MAX_VALUE}. If
* the timeout expires before all tasks complete then the search results
* will only reflect partial information.
*/
String INDEXER_TIMEOUT = FullTextIndex.class.getName() + ".timeout";
String DEFAULT_INDEXER_TIMEOUT = "0";
/**
* When true, the fieldId is stored as part of the key (default
* {@value #DEFAULT_FIELDS_ENABLED}). When false, each key will be four
* bytes shorter. Applications which do not use fieldId should disable it
* when creating the {@link FullTextIndex}.
*/
String FIELDS_ENABLED = FullTextIndex.class.getName()
+ ".fieldsEnabled";
String DEFAULT_FIELDS_ENABLED = "false";
// /**
// * When true, the localTermWeight is stored using double-precision.
// * When false, it is stored using single-precision.
// */
// String DOUBLE_PRECISION = FullTextIndex.class.getName()
// + ".doublePrecision";
//
// String DEFAULT_DOUBLE_PRECISION = "false";
/**
* The name of the {@link IAnalyzerFactory} class which will be used to
* obtain analyzers when tokenizing documents and queries (default
* {@value #DEFAULT_ANALYZER_FACTORY_CLASS}). The specified class MUST
* implement {@link IAnalyzerFactory} and MUST have a constructor with
* the following signature:
*
* public MyAnalyzerFactory(FullTextIndex indexer)
*
*/
String ANALYZER_FACTORY_CLASS = FullTextIndex.class.getName()
+ ".analyzerFactoryClass";
String DEFAULT_ANALYZER_FACTORY_CLASS = DefaultAnalyzerFactory.class.getName();
/**
* We keep a small hit cache based on search parameters: search string +
* prefixMatch + matchAllTerms. This defines the size of that cache.
* The value should remain small.
*/
String HIT_CACHE_SIZE = FullTextIndex.class.getName()
+ ".hitCacheSize";
String DEFAULT_HIT_CACHE_SIZE = "10";
/**
* We keep a small hit cache based on search parameters: search string +
* prefixMatch + matchAllTerms. This defines the timeout for values in
* that cache (in milliseconds). The value should remain small.
*/
String HIT_CACHE_TIMEOUT_MILLIS = FullTextIndex.class.getName()
+ ".hitCacheTimeoutMillis";
/**
* Default is 1 minute.
*/
String DEFAULT_HIT_CACHE_TIMEOUT_MILLIS =
String.valueOf(TimeUnit.MINUTES.toMillis(1));
}
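/*
 * Example (sketch, not part of the original source): supplying the options
 * above via Properties. Any option which is not set falls back to its
 * DEFAULT_* value; the specific values shown here are illustrative only.
 *
 * final Properties p = new Properties();
 * p.setProperty(FullTextIndex.Options.FIELDS_ENABLED, "false");
 * p.setProperty(FullTextIndex.Options.INDEXER_COLLATOR_STRENGTH,
 *         StrengthEnum.Primary.toString());
 * p.setProperty(FullTextIndex.Options.HIT_CACHE_SIZE, "10");
 */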
/**
* @see Options#OVERWRITE
*/
private final boolean overwrite;
/**
* Return the value configured by the {@link Options#OVERWRITE} property.
*/
public boolean isOverwrite() {
return overwrite;
}
/**
* @see Options#INDEXER_TIMEOUT
*/
private final long timeout;
// /**
// * @see Options#FIELDS_ENABLED
// */
// private final boolean fieldsEnabled;
//
// /**
// * @see Options#DOUBLE_PRECISION
// */
// private final boolean doublePrecision;
//
// /**
// * Return the value configured by the {@link Options#FIELDS_ENABLED}
// * property.
// */
// public boolean isFieldsEnabled() {
//
// return fieldsEnabled;
//
// }
/**
* @see Options#ANALYZER_FACTORY_CLASS
*/
private final IAnalyzerFactory analyzerFactory;
/**
* See {@link Options#HIT_CACHE_SIZE}.
*/
private final int hitCacheSize;
/**
* See {@link Options#HIT_CACHE_TIMEOUT_MILLIS}.
*/
private final long hitCacheTimeoutMillis;
/**
* See {@link Options#HIT_CACHE_SIZE}.
*/
private final ConcurrentWeakValueCacheWithTimeout<FullTextQuery, Hit<V>[]> cache;
// /**
// * @see Options#DOCID_FACTORY_CLASS
// */
// private final IKeyBuilderExtension docIdFactory;
// /**
// * The concrete {@link IRecordBuilder} instance.
// */
// private final IRecordBuilder recordBuilder;
//
// /**
// * Return the object responsible for encoding and decoding the tuples
// * in the full text index.
// */
// public final IRecordBuilder getRecordBuilder() {
//
// return recordBuilder;
//
// }
/**
* The basename of the search index.
*/
public static final transient String NAME_SEARCH = "search";
/**
* true unless {@link #getTimestamp()} is {@link ITx#UNISOLATED}.
*/
final public boolean isReadOnly() {
return TimestampUtility.isReadOnly(getTimestamp());
}
// protected void assertWritable() {
//
// if(isReadOnly()) {
//
// throw new IllegalStateException("READ_ONLY");
//
// }
//
// }
/**
* Ctor specified by {@link DefaultResourceLocator}.
*
* Configuration information is obtained from the supplied properties.
*
* @see Options
*/
public FullTextIndex(final IIndexManager indexManager,
final String namespace, final Long timestamp,
final Properties properties) {
super(indexManager, namespace, timestamp, properties);
{
overwrite = Boolean.parseBoolean(properties.getProperty(
Options.OVERWRITE, Options.DEFAULT_OVERWRITE));
if (log.isInfoEnabled())
log.info(Options.OVERWRITE + "=" + overwrite);
}
{
timeout = Long.parseLong(properties.getProperty(
Options.INDEXER_TIMEOUT, Options.DEFAULT_INDEXER_TIMEOUT));
if (log.isInfoEnabled())
log.info(Options.INDEXER_TIMEOUT + "=" + timeout);
}
// {
//
// fieldsEnabled = Boolean.parseBoolean(properties.getProperty(
// Options.FIELDS_ENABLED, Options.DEFAULT_FIELDS_ENABLED));
//
// if (log.isInfoEnabled())
// log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled);
//
// }
//
// {
//
// doublePrecision = Boolean
// .parseBoolean(properties.getProperty(
// Options.DOUBLE_PRECISION,
// Options.DEFAULT_DOUBLE_PRECISION));
//
// if (log.isInfoEnabled())
// log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision);
//
// }
{
hitCacheSize = Integer.parseInt(properties.getProperty(
Options.HIT_CACHE_SIZE, Options.DEFAULT_HIT_CACHE_SIZE));
if (log.isInfoEnabled())
log.info(Options.HIT_CACHE_SIZE + "=" + hitCacheSize);
}
{
hitCacheTimeoutMillis = Long.parseLong(properties.getProperty(
Options.HIT_CACHE_TIMEOUT_MILLIS,
Options.DEFAULT_HIT_CACHE_TIMEOUT_MILLIS));
if (log.isInfoEnabled())
log.info(Options.HIT_CACHE_TIMEOUT_MILLIS + "=" + hitCacheTimeoutMillis);
}
this.cache =
new ConcurrentWeakValueCacheWithTimeout<FullTextQuery, Hit<V>[]>(
hitCacheSize, hitCacheTimeoutMillis);
{
final String className = getProperty(
Options.ANALYZER_FACTORY_CLASS,
Options.DEFAULT_ANALYZER_FACTORY_CLASS);
if (log.isInfoEnabled())
log.info(Options.ANALYZER_FACTORY_CLASS + "=" + className);
final Class<? extends IAnalyzerFactory> cls;
try {
cls = (Class<? extends IAnalyzerFactory>) Class.forName(className);
} catch (ClassNotFoundException e) {
throw new RuntimeException("Bad option: "
+ Options.ANALYZER_FACTORY_CLASS, e);
}
if (!IAnalyzerFactory.class.isAssignableFrom(cls)) {
throw new RuntimeException(Options.ANALYZER_FACTORY_CLASS
+ ": Must extend: " + IAnalyzerFactory.class.getName());
}
try {
final Constructor<? extends IAnalyzerFactory> ctor = cls
.getConstructor(new Class[] { FullTextIndex.class });
// save reference.
analyzerFactory = ctor.newInstance(new Object[] { this });
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
/*
* Note: defer resolution of the index.
*/
// // resolve index (might not exist, in which case this will be null).
// ndx = getIndex(getNamespace()+"."+NAME_SEARCH);
}
/**
* Conditionally registers the necessary index(s).
*
* @throws IllegalStateException
* if the client does not have write access.
*
* @todo this is not using {@link #acquireExclusiveLock()} since I generally
* allocate the text index inside of another relation and
* {@link #acquireExclusiveLock()} is not reentrant for zookeeper.
*/
/*
* Note: BigdataRDFFullTextIndex overrides this method to setup IV support.
*/
@Override
public void create() {
assertWritable();
final String name = getNamespace() + "."+NAME_SEARCH;
final IIndexManager indexManager = getIndexManager();
// final IResourceLock resourceLock = acquireExclusiveLock();
//
// try {
/*
* Register a tuple serializer that knows how to unpack the values and
* how to extract the bytes corresponding to the encoded text (they can
* not be decoded) from the key and how to extract the document and field
* identifiers from the key.
*/
final Properties p = getProperties();
final IndexMetadata indexMetadata = new IndexMetadata(indexManager,
p, name, UUID.randomUUID(), IndexTypeEnum.BTree);
/*
* Override the collator strength property to use the configured
* value or the default for the text indexer rather than the
* standard default. This is done because you typically want to
* recognize only Primary differences for text search while you
* often want to recognize more differences when generating keys for
* a B+Tree.
*
* Note: The choice of the language and country for the collator
* should not matter much for this purpose since the total ordering
* is not used except to scan all entries for a given term, so the
* relative ordering between terms does not matter.
*/
final IKeyBuilderFactory keyBuilderFactory;
{
final Properties tmp = new Properties(p);
tmp.setProperty(KeyBuilder.Options.STRENGTH, p.getProperty(
Options.INDEXER_COLLATOR_STRENGTH,
Options.DEFAULT_INDEXER_COLLATOR_STRENGTH));
keyBuilderFactory = new DefaultKeyBuilderFactory(tmp);
}
final boolean fieldsEnabled = Boolean.parseBoolean(p
.getProperty(Options.FIELDS_ENABLED,
Options.DEFAULT_FIELDS_ENABLED));
if (log.isInfoEnabled())
log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled);
// final boolean doublePrecision = Boolean.parseBoolean(p
// .getProperty(Options.DOUBLE_PRECISION,
// Options.DEFAULT_DOUBLE_PRECISION));
//
// if (log.isInfoEnabled())
// log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision);
indexMetadata.setTupleSerializer(new FullTextIndexTupleSerializer(
keyBuilderFactory,//
DefaultTupleSerializer.getDefaultLeafKeysCoder(),//
EmptyRabaValueCoder.INSTANCE,//
fieldsEnabled//
));
indexManager.registerIndex(indexMetadata);
if (log.isInfoEnabled())
log.info("Registered new text index: name=" + name);
/*
* Note: defer resolution of the index.
*/
// ndx = getIndex(name);
// } finally {
//
// unlock(resourceLock);
//
// }
}
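/*
 * Example (sketch): explicit lifecycle against an unisolated view. The
 * namespace and properties are illustrative; normally the relation is
 * resolved through the DefaultResourceLocator rather than constructed
 * directly.
 *
 * final FullTextIndex<Long> ndx = new FullTextIndex<Long>(indexManager,
 *         "myNamespace", ITx.UNISOLATED, properties);
 * ndx.create();  // registers the backing "search" B+Tree.
 * ...
 * ndx.destroy(); // drops the backing index.
 */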
public void destroy() {
if (log.isInfoEnabled())
log.info("");
assertWritable();
final IIndexManager indexManager = getIndexManager();
final IResourceLock resourceLock = acquireExclusiveLock();
try {
indexManager.dropIndex(getNamespace() +"."+ NAME_SEARCH);
} finally {
unlock(resourceLock);
}
}
/**
* Return the token analyzer to be used for the given language code.
*
* @param languageCode
* The language code or null to use the default
* {@link Locale}.
*
* @return The token analyzer best suited to the indicated language family.
*/
protected Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {
return analyzerFactory.getAnalyzer(languageCode, filterStopwords);
}
/**
* Return a {@link ThreadLocal} {@link IKeyBuilder} instance configured to
* support full text indexing and search.
*
* @see Options#INDEXER_COLLATOR_STRENGTH
*/
protected final IKeyBuilder getKeyBuilder() {
return getIndex().getIndexMetadata().getKeyBuilder();
}
/**
* See {@link #index(TokenBuffer, long, int, String, Reader, boolean)}.
*
* Uses a default filterStopwords value of true.
*/
public void index(final TokenBuffer buffer, final V docId,
final int fieldId, final String languageCode, final Reader r) {
index(buffer, docId, fieldId, languageCode, r, true/* filterStopwords */);
}
/**
* Index a field in a document.
*
* Note: This method does NOT force a write on the indices. If the buffer
* overflows, then there will be an index write. Once the caller is done
* indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
* remaining in their buffer to the indices.
*
* Note: If a document is pre-existing, then the existing data for that
* document MUST be removed unless you know that the fields to be found in
* the new version will not have changed (they may have different contents, but the same
* fields exist in the old and new versions of the document).
*
* @param buffer
* Used to buffer writes onto the text index.
* @param docId
* The document identifier.
* @param fieldId
* The field identifier.
* @param languageCode
* The language code -or- null to use the default
* {@link Locale}.
* @param r
* A reader on the text to be indexed.
* @param filterStopwords
* if true, filter stopwords from the token stream
*
* @see TokenBuffer#flush()
*/
public void index(final TokenBuffer buffer, final V docId,
final int fieldId, final String languageCode, final Reader r,
final boolean filterStopwords) {
/*
* Note: You can invoke this on a read-only index. It is only overflow
* of the TokenBuffer that requires a writable index. Overflow itself
* will only occur on {document,field} tuple boundaries, so it will
* never overflow when indexing a search query.
*/
// assertWritable();
int n = 0;
// tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
final TokenStream tokenStream = getTokenStream(languageCode, r,
filterStopwords);
try {
tokenStream.reset();
while (tokenStream.incrementToken()) {
final CharTermAttribute term = tokenStream
.getAttribute(CharTermAttribute.class);
buffer.add(docId, fieldId, term.toString());
n++;
}
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
if (log.isInfoEnabled())
log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId="
+ fieldId);
}
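/*
 * Example (sketch): indexing two fields of one document and then flushing the
 * buffer. The docId, field identifiers and language code are illustrative;
 * TokenBuffer#flush() MUST be invoked once indexing is complete (see the
 * javadoc above).
 *
 * final TokenBuffer buffer = new TokenBuffer(1, this);
 * index(buffer, docId, 0, "en", new StringReader(titleText)); // field 0: title
 * index(buffer, docId, 1, "en", new StringReader(bodyText));  // field 1: body
 * buffer.flush();
 */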
/**
* Tokenize text using an {@link Analyzer} that is appropriate to the
* specified language family.
*
* @param languageCode
* The language code -or- null to use the default
* {@link Locale}.
*
* @param r
* A reader on the text to be indexed.
*
* @param filterStopwords
* if true, filter stopwords from the token stream
*
* @return The extracted token stream.
*/
protected TokenStream getTokenStream(final String languageCode,
final Reader r, final boolean filterStopwords) {
/*
* Note: This is stripping out stopwords by default.
*
* @todo is it using a language family specific stopword list?
*/
final Analyzer a = getAnalyzer(languageCode, filterStopwords);
TokenStream tokenStream;
tokenStream = a.tokenStream(null/* @todo field? */, r);
// force to lower case.
tokenStream = new LowerCaseFilter(tokenStream);
return tokenStream;
}
/**
* Performs a full text search against indexed documents returning a hit
* list.
*
* The basic algorithm computes cosine between the term-frequency vector of
* the query and the indexed "documents". The cosine may be directly
* interpreted as the "relevance" of a "document" to the query. The query
* and document term-frequency vectors are normalized, so the cosine values
* are bounded in [0.0:1.0]. The higher the cosine the more relevant the
* document is to the query. A cosine of less than .4 is rarely of any
* interest.
*
* The implementation creates and runs a set of parallel tasks, one for each
* distinct token found in the query, and waits for those tasks to complete
* or for a timeout to occur. Each task uses a key-range scan on the terms
* index, collecting metadata for the matching "documents" and aggregating
* it on a "hit" for that document. Since the tasks run concurrently, there
* are concurrent writers on the "hits". On a timeout, the remaining tasks
* are interrupted.
*
* The collection of hits is scored and hits that fail a threshold are
* discarded. The remaining hits are placed into a total order and the
* caller is returned an iterator which can read from that order. If the
* operation is interrupted, then only those {@link IHit}s that have already
* been computed will be returned.
*
* @param query
* The query (it will be parsed into tokens).
* @param languageCode
* The language code that should be used when tokenizing the
* query -or- null to use the default {@link Locale}.
* @param minCosine
* The minimum cosine that will be returned.
* @param maxCosine
* The maximum cosine that will be returned.
* @param minRank
* The min rank of the search result.
* @param maxRank
* The max rank of the search result.
* @param prefixMatch
* When true, the matches will be on tokens which include the
* query tokens as a prefix. This includes exact matches as a
* special case when the prefix is the entire token, but it also
* allows longer matches. For example, free will be an exact
* match on free but a partial match on freedom. When
* false, only exact matches will be made.
* @param matchAllTerms
* if true, return only hits that match all search terms
* @param timeout
* The timeout -or- ZERO (0) for NO timeout (this is equivalent
* to using {@link Long#MAX_VALUE}).
* @param unit
* The unit in which the timeout is expressed.
*
* @return The hit list.
*
* @todo Allow search within field(s). This will be a filter on the range
* iterator that is sent to the data service such that the search
* terms are visited only when they occur in the matching field(s).
*/
public Hiterator<Hit<V>> search(final FullTextQuery query) {
final Hit[] a = _search(query);
return new Hiterator<Hit<V>>(a);
}
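/*
 * Example (sketch): issuing a query and iterating the hits. The FullTextQuery
 * constructor arguments shown are an assumption (see FullTextQuery for the
 * actual constructors), as is iteration via hasNext()/next() on the returned
 * Hiterator.
 *
 * final Hiterator<Hit<V>> itr = search(new FullTextQuery(
 *         "alice", "en", false)); // query, languageCode, prefixMatch
 * while (itr.hasNext()) {
 *     final Hit<V> hit = itr.next();
 *     // each hit exposes the matched document and its cosine relevance.
 * }
 */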
/**
* Perform a range count on a full text query.
*/
public int count(final FullTextQuery query) {
if (cache.containsKey(query)) {
if (log.isInfoEnabled())
log.info("found hits in cache");
return cache.get(query).length;
} else {
if (log.isInfoEnabled())
log.info("did not find hits in cache");
}
// tokenize the query.
final TermFrequencyData qdata = tokenize(query);
// No terms after stopword extraction
if (qdata == null) {
cache.put(query, new Hit[] {});
return 0;
}
/*
* We can run an optimized version of this (just a quick range count)
* but only if the caller does not care about exact match and has
* not specified a regex.
*/
if (qdata.distinctTermCount() == 1 &&
!query.isMatchExact() && query.getMatchRegex() == null) {
final boolean prefixMatch = query.isPrefixMatch();
final Map.Entry e = qdata.getSingletonEntry();
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
final CountIndexTask task1 = new CountIndexTask(termText, 0, 1,
prefixMatch, md.getLocalTermWeight(), this);
return (int) task1.getRangeCount();
} else {
final Hit[] a = _search(query);
return a.length;
}
}
protected TermFrequencyData tokenize(final FullTextQuery query) {
final String q = query.getQuery();
final String languageCode = query.getLanguageCode();
final boolean prefixMatch = query.isPrefixMatch();
// tokenize the query.
final TermFrequencyData qdata;
{
final TokenBuffer buffer = new TokenBuffer(1, this);
/*
* If we are using prefix match ('*' operator) then we don't want to
* filter stopwords from the search query.
*/
final boolean filterStopwords = !prefixMatch;
index(buffer, //
null, // docId // was Long.MIN_VALUE
Integer.MIN_VALUE, // fieldId
languageCode,//
new StringReader(q), //
filterStopwords//
);
if (buffer.size() == 0) {
/*
* There were no terms after stopword extraction.
*/
log.warn("No terms after stopword extraction: query=" + query);
return null;
}
qdata = buffer.get(0);
qdata.normalize();
}
return qdata;
}
public Hit[] _search(final FullTextQuery query) {
final String queryStr = query.getQuery();
final String languageCode = query.getLanguageCode();
final boolean prefixMatch = query.isPrefixMatch();
final double minCosine = query.getMinCosine();
final double maxCosine = query.getMaxCosine();
final int minRank = query.getMinRank();
final int maxRank = query.getMaxRank();
final boolean matchAllTerms = query.isMatchAllTerms();
final boolean matchExact = query.isMatchExact();
final String regex = query.getMatchRegex();
long timeout = query.getTimeout();
final TimeUnit unit = query.getTimeUnit();
final long begin = System.currentTimeMillis();
// if (languageCode == null)
// throw new IllegalArgumentException();
if (queryStr == null)
throw new IllegalArgumentException();
if (minCosine < 0d || minCosine > 1d)
throw new IllegalArgumentException();
if (minRank <= 0 || maxRank <= 0)
throw new IllegalArgumentException();
if (minRank > maxRank)
throw new IllegalArgumentException();
if (timeout < 0L)
throw new IllegalArgumentException();
if (unit == null)
throw new IllegalArgumentException();
if (log.isInfoEnabled())
log.info("languageCode=[" + languageCode + "], text=[" + queryStr
+ "], minCosine=" + minCosine
+ ", maxCosine=" + maxCosine
+ ", minRank=" + minRank
+ ", maxRank=" + maxRank
+ ", matchAllTerms=" + matchAllTerms
+ ", prefixMatch=" + prefixMatch
+ ", timeout=" + timeout + ", unit=" + unit);
if (timeout == 0L) {
// treat ZERO as equivalent to MAX_LONG.
timeout = Long.MAX_VALUE;
}
final FullTextQuery cacheKey = query;
Hit[] a;
if (cache.containsKey(cacheKey)) {
if (log.isInfoEnabled())
log.info("found hits in cache");
a = cache.get(cacheKey);
} else {
if (log.isInfoEnabled())
log.info("did not find hits in cache");
// tokenize the query.
final TermFrequencyData qdata = tokenize(query);
// No terms after stopword extraction
if (qdata == null) {
cache.put(cacheKey, a = new Hit[] {});
return a;
}
a = executeQuery(qdata, prefixMatch, timeout, unit);
if (a.length == 0) {
log.info("No hits: languageCode=[" + languageCode + "], query=["
+ queryStr + "]");
cache.put(cacheKey, a);
return a;
}
/*
* If match all is specified, remove any hits with a term count less
* than the number of search tokens. It's also an optimization to
* run the pruning if we're going to do matchExact.
*/
if ((matchAllTerms || matchExact) && qdata.distinctTermCount() > 1) {
final int nterms = qdata.terms.size();
if (log.isInfoEnabled()) {
log.info("matchAll=true, nterms=" + nterms);
log.info("size before: " + a.length);
}
final Hit[] tmp = new Hit[a.length];
int i = 0;
for (Hit hit : a) {
if (hit.getTermCount() == nterms) {
tmp[i++] = hit;
}
}
if (log.isDebugEnabled()) {
log.debug(i);
}
if (i < a.length) {
a = new Hit[i];
System.arraycopy(tmp, 0, a, 0, i);
}
}
/*
* Delegate match exact to subclasses.
*/
if (matchExact) {
a = matchExact(a, queryStr);
}
if (a.length == 0) {
log.warn("No hits after matchAllTerms pruning: languageCode=[" + languageCode + "], query=["
+ queryStr + "]");
cache.put(cacheKey, a);
return a;
}
/*
* Do regex matching.
*/
if (regex != null) {
final Pattern pattern = Pattern.compile(regex);//, Pattern.CASE_INSENSITIVE);
if (log.isDebugEnabled()) {
log.debug("hits before regex: " + a.length);
}
a = applyRegex(a, pattern);
if (log.isDebugEnabled()) {
log.debug("hits after regex: " + a.length);
}
}
if (a.length == 0) {
log.warn("No hits after regex pruning: languageCode=[" + languageCode + "], query=["
+ queryStr + "], regex=[" + regex + "]");
cache.put(cacheKey, a);
return a;
}
/*
* Rank order the hits by relevance.
*
* @todo consider moving documents through a succession of N pools where
* N is the #of distinct terms in the query. The read tasks would halt
* if the size of the pool for N terms reached maxRank. This might (or
* might not) help with triage since we could process hits by pool and
* only compute the cosines for one pool at a time until we had enough
* hits.
*/
if (log.isInfoEnabled())
log.info("Rank ordering "+a.length+" hits by relevance");
final long start = System.currentTimeMillis();
Arrays.sort(a);
if (log.isInfoEnabled()) {
final long sortTime = System.currentTimeMillis() - start;
log.info("sort time: " + sortTime);
}
for (int i = 0; i < a.length; i++) {
a[i].setRank(i+1);
}
cache.put(cacheKey, a);
}
/*
* Take a slice of the hits based on min/max cosine and min/max rank.
*/
a = slice(query, a);
final long elapsed = System.currentTimeMillis() - begin;
if (log.isInfoEnabled())
log.info("Done: " + a.length + " hits in " + elapsed + "ms");
return a;
}
protected Hit[] slice(final FullTextQuery query, Hit[] a) {
final double minCosine = query.getMinCosine();
final double maxCosine = query.getMaxCosine();
final int minRank = query.getMinRank();
final int maxRank = query.getMaxRank();
// if (log.isDebugEnabled()) {
// log.debug("before min/max cosine/rank pruning:");
// for (Hit h : a)
// log.debug(h);
// }
/*
* If maxCosine is specified, prune the hits that are above the max
*/
if (maxCosine < 1.0d) {
// find the first occurrence of a hit that is <= maxCosine
int i = 0;
for (Hit h : a) {
if (h.getCosine() <= maxCosine)
break;
i++;
}
// no hits with relevance less than maxCosine
if (i == a.length) {
return new Hit[] {};
} else {
// copy the hits from that first occurrence to the end
final Hit[] tmp = new Hit[a.length - i];
System.arraycopy(a, i, tmp, 0, tmp.length);
a = tmp;
}
}
/*
* If minCosine is specified, prune the hits that are below the min
*/
if (minCosine > 0.0d) {
// find the first occurrence of a hit that is < minCosine
int i = 0;
for (Hit h : a) {
if (h.getCosine() < minCosine)
break;
i++;
}
// no hits with relevance greater than minCosine
if (i == 0) {
return new Hit[] {};
} else if (i < a.length) {
// copy the hits from 0 up to that first occurrence
final Hit[] tmp = new Hit[i];
System.arraycopy(a, 0, tmp, 0, tmp.length);
a = tmp;
}
}
// exactly one hit
if (minRank > 0 && minRank == maxRank) {
if (minRank > a.length) {
// out of range
return new Hit[] {};
} else {
// in range
return new Hit[] { a[minRank-1] };
}
}
/*
* If minRank is specified, prune the hits that rank higher than the min
*/
if (minRank > 1) {
// no hits above minRank
if (minRank > a.length) {
return new Hit[] {};
} else {
// copy the hits from the minRank to the end
final Hit[] tmp = new Hit[a.length - (minRank-1)];
System.arraycopy(a, minRank-1, tmp, 0, tmp.length);
a = tmp;
}
}
final int newMax = maxRank-minRank+1;
if (log.isDebugEnabled())
log.debug("new max rank: " + newMax);
/*
* If maxRank is specified, prune the hits that rank lower than the max
*/
if (newMax < a.length) {
// copy the hits from the start up to the new max rank
final Hit[] tmp = new Hit[newMax];
System.arraycopy(a, 0, tmp, 0, tmp.length);
a = tmp;
}
return a;
}
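/*
 * Worked example of the slice logic above (illustrative): given 10 hits in
 * rank order (after any cosine pruning) with minRank=3 and maxRank=5, the
 * minRank branch keeps the hits at ranks 3..10, newMax = 5 - 3 + 1 = 3, and
 * the maxRank branch then truncates to the first 3 of those, i.e. the hits
 * originally ranked 3, 4, and 5.
 */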
protected Hit[] executeQuery(final TermFrequencyData qdata,
final boolean prefixMatch, final long timeout, final TimeUnit unit) {
final IHitCollector hits;
if (qdata.distinctTermCount() == 1) {
final Map.Entry e = qdata.getSingletonEntry();
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
final CountIndexTask task1 = new CountIndexTask(termText, 0, 1,
prefixMatch, md.getLocalTermWeight(), this);
hits = new SingleTokenHitCollector(task1);
} else {
final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(
qdata.distinctTermCount());
int i = 0;
for (Map.Entry e : qdata.terms.entrySet()) {
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
tasks.add(new CountIndexTask(termText, i++, qdata.terms.size(),
prefixMatch, md.getLocalTermWeight(), this));
}
hits = new MultiTokenHitCollector(tasks);
}
// run the queries.
{
final List<ReadIndexTask<V>> tasks = new ArrayList<ReadIndexTask<V>>(
qdata.distinctTermCount());
int i = 0;
for (Map.Entry e : qdata.terms.entrySet()) {
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
tasks.add(new ReadIndexTask(termText, i++, qdata.terms.size(),
prefixMatch, md.getLocalTermWeight(), this, hits));
}
final ExecutionHelper