/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jan 23, 2008
*/
package com.bigdata.search;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexTypeEnum;
import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.btree.raba.codec.EmptyRabaValueCoder;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IResourceLock;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.rdf.lexicon.ITextIndexer.FullTextQuery;
import com.bigdata.relation.AbstractRelation;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.concurrent.ExecutionHelper;
/**
* Full text indexing and search support.
*
* The basic data model consists of documents, fields in documents, and tokens
* extracted by an analyzer from those fields.
*
* The frequency distributions may be normalized to account for a variety of
* effects producing "term weights". For example, normalizing for document
* length or relative frequency of a term in the overall collection. Therefore
* the logical model is:
*
*
*
* token : {docId, freq?, weight?}+
*
*
*
* (For RDF, docId is the term identifier as assigned by the term:id index.)
*
* The freq and weight are optional values that are representative of the kinds
* of statistical data that are kept on a per-token-document basis. The freq is
* the token frequency (the frequency of occurrence of the token in the
* document). The weight is generally a normalized token frequency weight for
* the token in that document in the context of the overall collection.
*
* In fact, we actually represent the data as follows:
*
*
*
* {sortKey(token), weight, docId, fldId} : {freq?, sorted(pos)+}
*
*
*
* That is, there is a distinct entry in the full text B+Tree for each field in
* each document in which a given token was recognized. The text of the token is
* not stored in the key, just the Unicode sort key generated from the token
* text. The value associated with the B+Tree entry is optional - it is simply
* not used unless we are storing statistics for the token-document pair. The
* advantages of this approach are: (a) it reuses the existing B+Tree data
* structures efficiently; (b) we are never faced with the possibility of overflow
* when a token is used in a large number of documents. The entries for the
* token will simply be spread across several leaves in the B+Tree; (c) leading
* key compression makes the resulting B+Tree very efficient; and (d) in a
* scale-out range partitioned index we can load balance the resulting index
* partitions by choosing the partition based on an even token boundary.
*
* A field is any pre-identified text container within a document. Field
* identifiers are integers, so there are 2^32 distinct possible
* field identifiers. It is possible to manage the field identifiers through a
* secondary index, but that has no direct bearing on the structure of the full
* text index itself. Field identifiers appear after the token in the key so that
* queries may be expressed that will be matched against any field in the
* document. Likewise, field identifiers occur before the document identifier in
* the key since we always search across documents (in a search key, the
* document identifier is always {@link Long#MIN_VALUE} and the field identifier
* is always {@link Integer#MIN_VALUE}). There are many applications for fields:
* for example, distinct fields may be used for the title, abstract, and full
* text of a document or for the CDATA section of each distinct element in
* documents corresponding to some DTD. The application is responsible for
* recognizing the fields in the document and producing the appropriate token
* stream, each of which must be tagged by the field.
*
* A query is tokenized, producing a (possibly normalized) token-frequency
* vector. The relevance of documents to the query is generally taken as the
* cosine between the query's and each document's (possibly normalized)
* token-frequency vectors. The main effort of search is assembling a token
* frequency vector for just those documents with which there is an overlap with
* the query. This is done using a key range scan for each token in the query
* against the full text index.
*
*
* fromKey := token, Long.MIN_VALUE
* toKey := successor(token), Long.MIN_VALUE
*
*
* and extracting the appropriate token frequency, normalized token weight, or
* other statistic. When no value is associated with the entry we follow the
* convention of assuming a token frequency of ONE (1) for each document in
* which the token appears.
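*
* For example (illustrative), a query containing the two distinct tokens
* big and data is evaluated as two such key-range scans, one per token:
*
* fromKey := sortKey(big), Long.MIN_VALUE
* toKey := successor(sortKey(big)), Long.MIN_VALUE
* fromKey := sortKey(data), Long.MIN_VALUE
* toKey := successor(sortKey(data)), Long.MIN_VALUE
*
* The per-document statistics read by each scan are aggregated into a single
* hit per document.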
*
* Tokenization is informed by the language code (when declared) and by the
* configured {@link Locale} for the database otherwise. An appropriate
* {@link Analyzer} is chosen based on the language code or {@link Locale} and
* the "document" is broken into a token-frequency distribution (alternatively a
* set of tokens). The same process is used to tokenize queries, and the API
* allows the caller to specify the language code used to select the
* {@link Analyzer} to tokenize the query.
*
* Once the tokens are formed the language code / {@link Locale} used to produce
* the token is discarded (it is not represented in the index). The reason for
* this is that we never utilize the total ordering of the full text index,
* merely the manner in which it groups tokens that map onto the same Unicode
* sort key together. Further, we use only a single Unicode collator
* configuration regardless of the language family in which the token was
* originally expressed. Unlike the collator used by the terms index (which
* often is set at IDENTICAL strength), the collator used by the full text
* index should be chosen such that it makes relatively few distinctions in
* order to increase recall (e.g., set at PRIMARY strength). Since a total order
* over the full text index is not critical from the perspective of its IR
* application, the {@link Locale} for the collator is likewise not critical and
* PRIMARY strength will produce significantly shorter Unicode sort keys.
*
* The term frequency within that literal is an optional property associated
* with each term identifier, as is the computed weight for the token in the
* term.
*
* Note: Documents should be tokenized using an {@link Analyzer} appropriate for
* their declared language code (if any). However, once tokenized, the language
* code is discarded and we perform search purely on the Unicode sort keys
* resulting from the extracted tokens.
*
* Scale-out
*
* Because the first component in the key is the token, both updates (when
* indexing document) and queries (reading against different tokens) will be
* scattered across shards. Therefore it is not necessary to register a split
* handler for the full text index.
*
* @todo The key for the terms index is {term,docId,fieldId}. Since the data are
* not pre-aggregated by {docId,fieldId} we can not easily remove only
* those tuples corresponding to some document (or some field of some
* document).
*
* In order to remove the fields for a document we need to know either
* which fields were indexed for the document and the tokens found in
* those fields and then scatter the removal request (additional space
* requirements) or we need to flood a delete procedure across the terms
* index (expensive).
*
* @todo provide M/R alternatives for indexing or computing/updating global
* weights.
*
* @todo Consider model in which fields are declared and then a "Document" is
* indexed. This lets us encapsulate the "driver" for indexing. The
* "field" can be a String or a Reader, etc.
*
* Note that lucene handles declaration of the data that will be stored
* for a field on a per Document basis {none, character offsets, character
* offsets + token positions}. There is also an option to store the term
* vector itself. Finally, there are options to store, compress+store, or
* not store the field value. You can also choose {None, IndexTokenized,
* IndexUntokenized} and an option dealing with norms.
*
* @todo lucene {@link Analyzer}s may be problematic. For example, it is
* difficult to tokenize numbers. consider replacing the lucene
* analyzer/tokenizer with our own stuff. this might help with
* tokenization of numbers, etc. and with tokenization of native html or
* xml with intact offsets.
*
* @todo lucene analyzers will strip stopwords by default. There should be a
* configuration option to strip out stopwords and another to enable
* stemming. how we do that should depend on the language family.
* Likewise, there should be support for language family specific stopword
* lists and language family specific exclusions.
*
* @todo support more term weighting schemes and make them easy to configure.
*
* @param <V>
* The generic type of the document identifier.
* @author Bryan Thompson
* @version $Id$
*/
public class FullTextIndex<V extends Comparable<V>> extends AbstractRelation {
final private static transient Logger log = Logger
.getLogger(FullTextIndex.class);
/**
* The backing index.
*/
volatile private IIndex ndx;
/**
* The index used to associate term identifiers with tokens parsed from
* documents.
*/
public IIndex getIndex() {
if(ndx == null) {
synchronized (this) {
ndx = getIndex(getNamespace() + "." + NAME_SEARCH);
if (ndx == null)
throw new IllegalStateException();
}
}
return ndx;
}
/**
* Options understood by the {@link FullTextIndex}.
*
* @author Bryan Thompson
*/
public interface Options {
/**
* indexer.overwrite - boolean option (default true) controls the behavior
* when a write is requested on the index and the {term,doc,field} tuple
* which forms the key is already present in the index. When true, the new
* value will be written on the index. When false, the existing value will
* be retained. This option is an optimization which makes sense when the
* corpus (a) only grows; and (b) the content of the documents in the
* corpus never changes. For example, this is true for an RDF database
* since the set of terms only grows and each term is immutable.
*/
String OVERWRITE = FullTextIndex.class.getName() + ".overwrite";
String DEFAULT_OVERWRITE = "true";
/**
* Specify the collator {@link StrengthEnum strength} for the full-text
* index (default {@value StrengthEnum#Primary}).
*
* Note: {@link StrengthEnum#Primary} is generally what you want for a
* full text index as search will consider tokens which differ in case
* and other subtle features to be the same token (a 'match').
*
* @see KeyBuilder.Options#STRENGTH
*/
String INDEXER_COLLATOR_STRENGTH = FullTextIndex.class.getName()
+ ".collator.strength";
String DEFAULT_INDEXER_COLLATOR_STRENGTH = StrengthEnum.Primary.toString();
/**
* The maximum time in milliseconds that the search engine will await
* completion of the tasks reading on each of the query terms (default
* {@value #DEFAULT_INDEXER_TIMEOUT}). A value of ZERO (0) means NO
* timeout and is equivalent to a value of {@link Long#MAX_VALUE}. If
* the timeout expires before all tasks complete then the search results
* will only reflect partial information.
*/
String INDEXER_TIMEOUT = FullTextIndex.class.getName() + ".timeout";
String DEFAULT_INDEXER_TIMEOUT = "0";
/**
* When true, the fieldId is stored as part of the key (default
* {@value #DEFAULT_FIELDS_ENABLED}). When false, each key will be four
* bytes shorter. Applications which do not use fieldId should disable it
* when creating the {@link FullTextIndex}.
*/
String FIELDS_ENABLED = FullTextIndex.class.getName()
+ ".fieldsEnabled";
String DEFAULT_FIELDS_ENABLED = "false";
// /**
// * When true, the localTermWeight is stored using double-precision.
// * When false, it is stored using single-precision.
// */
// String DOUBLE_PRECISION = FullTextIndex.class.getName()
// + ".doublePrecision";
//
// String DEFAULT_DOUBLE_PRECISION = "false";
/**
* The name of the {@link IAnalyzerFactory} class which will be used to
* obtain analyzers when tokenizing documents and queries (default
* {@value #DEFAULT_ANALYZER_FACTORY_CLASS}). The specified class MUST
* implement {@link IAnalyzerFactory} and MUST have a constructor with
* the following signature:
*
* public MyAnalyzerFactory(FullTextIndex indexer)
*
*/
String ANALYZER_FACTORY_CLASS = FullTextIndex.class.getName()
+ ".analyzerFactoryClass";
String DEFAULT_ANALYZER_FACTORY_CLASS = DefaultAnalyzerFactory.class.getName();
/**
* We keep a small hit cache based on search parameters: search string +
* prefixMatch + matchAllTerms. This defines the size of that cache.
* The value should remain small.
*/
String HIT_CACHE_SIZE = FullTextIndex.class.getName()
+ ".hitCacheSize";
String DEFAULT_HIT_CACHE_SIZE = "10";
/**
* We keep a small hit cache based on search parameters: search string +
* prefixMatch + matchAllTerms. This defines the timeout for values in
* that cache (in milliseconds). The value should remain small.
*/
String HIT_CACHE_TIMEOUT_MILLIS = FullTextIndex.class.getName()
+ ".hitCacheTimeoutMillis";
/**
* Default is 1 minute.
*/
String DEFAULT_HIT_CACHE_TIMEOUT_MILLIS =
String.valueOf(TimeUnit.MINUTES.toMillis(1));
}
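/*
 * Example (sketch, not part of the original source): supplying the options
 * above via Properties. Any option which is not set falls back to its
 * DEFAULT_* value; the specific values shown here are illustrative only.
 *
 * final Properties p = new Properties();
 * p.setProperty(FullTextIndex.Options.FIELDS_ENABLED, "false");
 * p.setProperty(FullTextIndex.Options.INDEXER_COLLATOR_STRENGTH,
 *         StrengthEnum.Primary.toString());
 * p.setProperty(FullTextIndex.Options.HIT_CACHE_SIZE, "10");
 */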
/**
* @see Options#OVERWRITE
*/
private final boolean overwrite;
/**
* Return the value configured by the {@link Options#OVERWRITE} property.
*/
public boolean isOverwrite() {
return overwrite;
}
/**
* @see Options#INDEXER_TIMEOUT
*/
private final long timeout;
// /**
// * @see Options#FIELDS_ENABLED
// */
// private final boolean fieldsEnabled;
//
// /**
// * @see Options#DOUBLE_PRECISION
// */
// private final boolean doublePrecision;
//
// /**
// * Return the value configured by the {@link Options#FIELDS_ENABLED}
// * property.
// */
// public boolean isFieldsEnabled() {
//
// return fieldsEnabled;
//
// }
/**
* @see Options#ANALYZER_FACTORY_CLASS
*/
private final IAnalyzerFactory analyzerFactory;
/**
* See {@link Options#HIT_CACHE_SIZE}.
*/
private final int hitCacheSize;
/**
* See {@link Options#HIT_CACHE_TIMEOUT_MILLIS}.
*/
private final long hitCacheTimeoutMillis;
/**
* See {@link Options#HIT_CACHE_SIZE}.
*/
private final ConcurrentWeakValueCacheWithTimeout<FullTextQuery, Hit<V>[]> cache;
// /**
// * @see Options#DOCID_FACTORY_CLASS
// */
// private final IKeyBuilderExtension docIdFactory;
// /**
// * The concrete {@link IRecordBuilder} instance.
// */
// private final IRecordBuilder recordBuilder;
//
// /**
// * Return the object responsible for encoding and decoding the tuples
// * in the full text index.
// */
// public final IRecordBuilder getRecordBuilder() {
//
// return recordBuilder;
//
// }
/**
* The basename of the search index.
*/
public static final transient String NAME_SEARCH = "search";
/**
* true unless {@link #getTimestamp()} is {@link ITx#UNISOLATED}.
*/
final public boolean isReadOnly() {
return TimestampUtility.isReadOnly(getTimestamp());
}
// protected void assertWritable() {
//
// if(isReadOnly()) {
//
// throw new IllegalStateException("READ_ONLY");
//
// }
//
// }
/**
* Ctor specified by {@link DefaultResourceLocator}.
*
* Configuration information is obtained from the supplied properties.
*
* @see Options
*/
public FullTextIndex(final IIndexManager indexManager,
final String namespace, final Long timestamp,
final Properties properties) {
super(indexManager, namespace, timestamp, properties);
{
overwrite = Boolean.parseBoolean(properties.getProperty(
Options.OVERWRITE, Options.DEFAULT_OVERWRITE));
if (log.isInfoEnabled())
log.info(Options.OVERWRITE + "=" + overwrite);
}
{
timeout = Long.parseLong(properties.getProperty(
Options.INDEXER_TIMEOUT, Options.DEFAULT_INDEXER_TIMEOUT));
if (log.isInfoEnabled())
log.info(Options.INDEXER_TIMEOUT + "=" + timeout);
}
// {
//
// fieldsEnabled = Boolean.parseBoolean(properties.getProperty(
// Options.FIELDS_ENABLED, Options.DEFAULT_FIELDS_ENABLED));
//
// if (log.isInfoEnabled())
// log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled);
//
// }
//
// {
//
// doublePrecision = Boolean
// .parseBoolean(properties.getProperty(
// Options.DOUBLE_PRECISION,
// Options.DEFAULT_DOUBLE_PRECISION));
//
// if (log.isInfoEnabled())
// log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision);
//
// }
{
hitCacheSize = Integer.parseInt(properties.getProperty(
Options.HIT_CACHE_SIZE, Options.DEFAULT_HIT_CACHE_SIZE));
if (log.isInfoEnabled())
log.info(Options.HIT_CACHE_SIZE + "=" + hitCacheSize);
}
{
hitCacheTimeoutMillis = Long.parseLong(properties.getProperty(
Options.HIT_CACHE_TIMEOUT_MILLIS,
Options.DEFAULT_HIT_CACHE_TIMEOUT_MILLIS));
if (log.isInfoEnabled())
log.info(Options.HIT_CACHE_TIMEOUT_MILLIS + "=" + hitCacheTimeoutMillis);
}
this.cache =
new ConcurrentWeakValueCacheWithTimeout<FullTextQuery, Hit<V>[]>(
hitCacheSize, hitCacheTimeoutMillis);
{
final String className = getProperty(
Options.ANALYZER_FACTORY_CLASS,
Options.DEFAULT_ANALYZER_FACTORY_CLASS);
if (log.isInfoEnabled())
log.info(Options.ANALYZER_FACTORY_CLASS + "=" + className);
final Class<? extends IAnalyzerFactory> cls;
try {
cls = (Class<? extends IAnalyzerFactory>) Class.forName(className);
} catch (ClassNotFoundException e) {
throw new RuntimeException("Bad option: "
+ Options.ANALYZER_FACTORY_CLASS, e);
}
if (!IAnalyzerFactory.class.isAssignableFrom(cls)) {
throw new RuntimeException(Options.ANALYZER_FACTORY_CLASS
+ ": Must extend: " + IAnalyzerFactory.class.getName());
}
try {
final Constructor<? extends IAnalyzerFactory> ctor = cls
.getConstructor(new Class[] { FullTextIndex.class });
// save reference.
analyzerFactory = ctor.newInstance(new Object[] { this });
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
/*
* Note: defer resolution of the index.
*/
// // resolve index (might not exist, in which case this will be null).
// ndx = getIndex(getNamespace()+"."+NAME_SEARCH);
}
/**
* Conditionally registers the necessary index(s).
*
* @throws IllegalStateException
* if the client does not have write access.
*
* @todo this is not using {@link #acquireExclusiveLock()} since I generally
* allocate the text index inside of another relation and
* {@link #acquireExclusiveLock()} is not reentrant for zookeeper.
*/
/*
* Note: BigdataRDFFullTextIndex overrides this method to setup IV support.
*/
@Override
public void create() {
assertWritable();
final String name = getNamespace() + "."+NAME_SEARCH;
final IIndexManager indexManager = getIndexManager();
// final IResourceLock resourceLock = acquireExclusiveLock();
//
// try {
/*
* Register a tuple serializer that knows how to unpack the values and
* how to extract the bytes corresponding to the encoded text (they can
* not be decoded) from the key and how to extract the document and field
* identifiers from the key.
*/
final Properties p = getProperties();
final IndexMetadata indexMetadata = new IndexMetadata(indexManager,
p, name, UUID.randomUUID(), IndexTypeEnum.BTree);
/*
* Override the collator strength property to use the configured
* value or the default for the text indexer rather than the
* standard default. This is done because you typically want to
* recognize only Primary differences for text search while you
* often want to recognize more differences when generating keys for
* a B+Tree.
*
* Note: The choice of the language and country for the collator
* should not matter much for this purpose since the total ordering
* is not used except to scan all entries for a given term, so the
* relative ordering between terms does not matter.
*/
final IKeyBuilderFactory keyBuilderFactory;
{
final Properties tmp = new Properties(p);
tmp.setProperty(KeyBuilder.Options.STRENGTH, p.getProperty(
Options.INDEXER_COLLATOR_STRENGTH,
Options.DEFAULT_INDEXER_COLLATOR_STRENGTH));
keyBuilderFactory = new DefaultKeyBuilderFactory(tmp);
}
final boolean fieldsEnabled = Boolean.parseBoolean(p
.getProperty(Options.FIELDS_ENABLED,
Options.DEFAULT_FIELDS_ENABLED));
if (log.isInfoEnabled())
log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled);
// final boolean doublePrecision = Boolean.parseBoolean(p
// .getProperty(Options.DOUBLE_PRECISION,
// Options.DEFAULT_DOUBLE_PRECISION));
//
// if (log.isInfoEnabled())
// log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision);
indexMetadata.setTupleSerializer(new FullTextIndexTupleSerializer(
keyBuilderFactory,//
DefaultTupleSerializer.getDefaultLeafKeysCoder(),//
EmptyRabaValueCoder.INSTANCE,//
fieldsEnabled//
));
indexManager.registerIndex(indexMetadata);
if (log.isInfoEnabled())
log.info("Registered new text index: name=" + name);
/*
* Note: defer resolution of the index.
*/
// ndx = getIndex(name);
// } finally {
//
// unlock(resourceLock);
//
// }
}
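/*
 * Example (sketch): explicit lifecycle against an unisolated view. The
 * namespace and properties are illustrative; normally the relation is
 * resolved through the DefaultResourceLocator rather than constructed
 * directly.
 *
 * final FullTextIndex<Long> ndx = new FullTextIndex<Long>(indexManager,
 *         "myNamespace", ITx.UNISOLATED, properties);
 * ndx.create();  // registers the backing "search" B+Tree.
 * ...
 * ndx.destroy(); // drops the backing index.
 */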
public void destroy() {
if (log.isInfoEnabled())
log.info("");
assertWritable();
final IIndexManager indexManager = getIndexManager();
final IResourceLock resourceLock = acquireExclusiveLock();
try {
indexManager.dropIndex(getNamespace() +"."+ NAME_SEARCH);
} finally {
unlock(resourceLock);
}
}
/**
* Return the token analyzer to be used for the given language code.
*
* @param languageCode
* The language code or null to use the default
* {@link Locale}.
*
* @return The token analyzer best suited to the indicated language family.
*/
protected Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {
return analyzerFactory.getAnalyzer(languageCode, filterStopwords);
}
/**
* Return a {@link ThreadLocal} {@link IKeyBuilder} instance configured to
* support full text indexing and search.
*
* @see Options#INDEXER_COLLATOR_STRENGTH
*/
protected final IKeyBuilder getKeyBuilder() {
return getIndex().getIndexMetadata().getKeyBuilder();
}
/**
* See {@link #index(TokenBuffer, long, int, String, Reader, boolean)}.
*
* Uses a default filterStopwords value of true.
*/
public void index(final TokenBuffer buffer, final V docId,
final int fieldId, final String languageCode, final Reader r) {
index(buffer, docId, fieldId, languageCode, r, true/* filterStopwords */);
}
/**
* Index a field in a document.
*
* Note: This method does NOT force a write on the indices. If the buffer
* overflows, then there will be an index write. Once the caller is done
* indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
* remaining in their buffer to the indices.
*
* Note: If a document is pre-existing, then the existing data for that
* document MUST be removed unless you know that the fields to be found in
* the new version will not have changed (they may have different contents, but the same
* fields exist in the old and new versions of the document).
*
* @param buffer
* Used to buffer writes onto the text index.
* @param docId
* The document identifier.
* @param fieldId
* The field identifier.
* @param languageCode
* The language code -or- null to use the default
* {@link Locale}.
* @param r
* A reader on the text to be indexed.
* @param filterStopwords
* if true, filter stopwords from the token stream
*
* @see TokenBuffer#flush()
*/
public void index(final TokenBuffer buffer, final V docId,
final int fieldId, final String languageCode, final Reader r,
final boolean filterStopwords) {
/*
* Note: You can invoke this on a read-only index. It is only overflow
* of the TokenBuffer that requires a writable index. Overflow itself
* will only occur on {document,field} tuple boundaries, so it will
* never overflow when indexing a search query.
*/
// assertWritable();
int n = 0;
// tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
final TokenStream tokenStream = getTokenStream(languageCode, r,
filterStopwords);
try {
tokenStream.reset();
while (tokenStream.incrementToken()) {
final CharTermAttribute term = tokenStream
.getAttribute(CharTermAttribute.class);
buffer.add(docId, fieldId, term.toString());
n++;
}
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
if (log.isInfoEnabled())
log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId="
+ fieldId);
}
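/*
 * Example (sketch): indexing two fields of one document and then flushing the
 * buffer. The docId, field identifiers and language code are illustrative;
 * TokenBuffer#flush() MUST be invoked once indexing is complete (see the
 * javadoc above).
 *
 * final TokenBuffer buffer = new TokenBuffer(1, this);
 * index(buffer, docId, 0, "en", new StringReader(titleText)); // field 0: title
 * index(buffer, docId, 1, "en", new StringReader(bodyText));  // field 1: body
 * buffer.flush();
 */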
/**
* Tokenize text using an {@link Analyzer} that is appropriate to the
* specified language family.
*
* @param languageCode
* The language code -or- null to use the default
* {@link Locale}.
*
* @param r
* A reader on the text to be indexed.
*
* @param filterStopwords
* if true, filter stopwords from the token stream
*
* @return The extracted token stream.
*/
protected TokenStream getTokenStream(final String languageCode,
final Reader r, final boolean filterStopwords) {
/*
* Note: This is stripping out stopwords by default.
*
* @todo is it using a language family specific stopword list?
*/
final Analyzer a = getAnalyzer(languageCode, filterStopwords);
TokenStream tokenStream;
tokenStream = a.tokenStream(null/* @todo field? */, r);
// force to lower case.
tokenStream = new LowerCaseFilter(tokenStream);
return tokenStream;
}
/**
* Performs a full text search against indexed documents returning a hit
* list.
*
* The basic algorithm computes cosine between the term-frequency vector of
* the query and the indexed "documents". The cosine may be directly
* interpreted as the "relevance" of a "document" to the query. The query
* and document term-frequency vectors are normalized, so the cosine values
* are bounded in [0.0:1.0]. The higher the cosine the more relevant the
* document is to the query. A cosine of less than .4 is rarely of any
* interest.
*
* The implementation creates and runs a set of parallel tasks, one for each
* distinct token found in the query, and waits for those tasks to complete
* or for a timeout to occur. Each task uses a key-range scan on the terms
* index, collecting metadata for the matching "documents" and aggregating
* it on a "hit" for that document. Since the tasks run concurrently, there
* are concurrent writers on the "hits". On a timeout, the remaining tasks
* are interrupted.
*
* The collection of hits is scored and hits that fail a threshold are
* discarded. The remaining hits are placed into a total order and the
* caller is returned an iterator which can read from that order. If the
* operation is interrupted, then only those {@link IHit}s that have already
* been computed will be returned.
*
* @param query
* The query (it will be parsed into tokens).
* @param languageCode
* The language code that should be used when tokenizing the
* query -or- null to use the default {@link Locale}.
* @param minCosine
* The minimum cosine that will be returned.
* @param maxCosine
* The maximum cosine that will be returned.
* @param minRank
* The min rank of the search result.
* @param maxRank
* The max rank of the search result.
* @param prefixMatch
* When true, the matches will be on tokens which include the
* query tokens as a prefix. This includes exact matches as a
* special case when the prefix is the entire token, but it also
* allows longer matches. For example, free will be an exact
* match on free but a partial match on freedom. When
* false, only exact matches will be made.
* @param matchAllTerms
* if true, return only hits that match all search terms
* @param timeout
* The timeout -or- ZERO (0) for NO timeout (this is equivalent
* to using {@link Long#MAX_VALUE}).
* @param unit
* The unit in which the timeout is expressed.
*
* @return The hit list.
*
* @todo Allow search within field(s). This will be a filter on the range
* iterator that is sent to the data service such that the search
* terms are visited only when they occur in the matching field(s).
*/
public Hiterator<Hit<V>> search(final FullTextQuery query) {
final Hit[] a = _search(query);
return new Hiterator<Hit<V>>(a);
}
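/*
 * Example (sketch): issuing a query and iterating the hits. The FullTextQuery
 * constructor arguments shown are an assumption (see FullTextQuery for the
 * actual constructors), as is iteration via hasNext()/next() on the returned
 * Hiterator.
 *
 * final Hiterator<Hit<V>> itr = search(new FullTextQuery(
 *         "alice", "en", false)); // query, languageCode, prefixMatch
 * while (itr.hasNext()) {
 *     final Hit<V> hit = itr.next();
 *     // each hit exposes the matched document and its cosine relevance.
 * }
 */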
/**
* Perform a range count on a full text query.
*/
public int count(final FullTextQuery query) {
if (cache.containsKey(query)) {
if (log.isInfoEnabled())
log.info("found hits in cache");
return cache.get(query).length;
} else {
if (log.isInfoEnabled())
log.info("did not find hits in cache");
}
// tokenize the query.
final TermFrequencyData qdata = tokenize(query);
// No terms after stopword extraction
if (qdata == null) {
cache.put(query, new Hit[] {});
return 0;
}
/*
* We can run an optimized version of this (just a quick range count)
* but only if the caller does not care about exact match and has
* not specified a regex.
*/
if (qdata.distinctTermCount() == 1 &&
!query.isMatchExact() && query.getMatchRegex() == null) {
final boolean prefixMatch = query.isPrefixMatch();
final Map.Entry e = qdata.getSingletonEntry();
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
final CountIndexTask task1 = new CountIndexTask(termText, 0, 1,
prefixMatch, md.getLocalTermWeight(), this);
return (int) task1.getRangeCount();
} else {
final Hit[] a = _search(query);
return a.length;
}
}
protected TermFrequencyData tokenize(final FullTextQuery query) {
final String q = query.getQuery();
final String languageCode = query.getLanguageCode();
final boolean prefixMatch = query.isPrefixMatch();
// tokenize the query.
final TermFrequencyData qdata;
{
final TokenBuffer buffer = new TokenBuffer(1, this);
/*
* If we are using prefix match ('*' operator) then we don't want to
* filter stopwords from the search query.
*/
final boolean filterStopwords = !prefixMatch;
index(buffer, //
null, // docId // was Long.MIN_VALUE
Integer.MIN_VALUE, // fieldId
languageCode,//
new StringReader(q), //
filterStopwords//
);
if (buffer.size() == 0) {
/*
* There were no terms after stopword extraction.
*/
log.warn("No terms after stopword extraction: query=" + query);
return null;
}
qdata = buffer.get(0);
qdata.normalize();
}
return qdata;
}
public Hit[] _search(final FullTextQuery query) {
final String queryStr = query.getQuery();
final String languageCode = query.getLanguageCode();
final boolean prefixMatch = query.isPrefixMatch();
final double minCosine = query.getMinCosine();
final double maxCosine = query.getMaxCosine();
final int minRank = query.getMinRank();
final int maxRank = query.getMaxRank();
final boolean matchAllTerms = query.isMatchAllTerms();
final boolean matchExact = query.isMatchExact();
final String regex = query.getMatchRegex();
long timeout = query.getTimeout();
final TimeUnit unit = query.getTimeUnit();
final long begin = System.currentTimeMillis();
// if (languageCode == null)
// throw new IllegalArgumentException();
if (queryStr == null)
throw new IllegalArgumentException();
if (minCosine < 0d || minCosine > 1d)
throw new IllegalArgumentException();
if (minRank <= 0 || maxRank <= 0)
throw new IllegalArgumentException();
if (minRank > maxRank)
throw new IllegalArgumentException();
if (timeout < 0L)
throw new IllegalArgumentException();
if (unit == null)
throw new IllegalArgumentException();
if (log.isInfoEnabled())
log.info("languageCode=[" + languageCode + "], text=[" + queryStr
+ "], minCosine=" + minCosine
+ ", maxCosine=" + maxCosine
+ ", minRank=" + minRank
+ ", maxRank=" + maxRank
+ ", matchAllTerms=" + matchAllTerms
+ ", prefixMatch=" + prefixMatch
+ ", timeout=" + timeout + ", unit=" + unit);
if (timeout == 0L) {
// treat ZERO as equivalent to MAX_LONG.
timeout = Long.MAX_VALUE;
}
final FullTextQuery cacheKey = query;
Hit[] a;
if (cache.containsKey(cacheKey)) {
if (log.isInfoEnabled())
log.info("found hits in cache");
a = cache.get(cacheKey);
} else {
if (log.isInfoEnabled())
log.info("did not find hits in cache");
// tokenize the query.
final TermFrequencyData qdata = tokenize(query);
// No terms after stopword extraction
if (qdata == null) {
cache.put(cacheKey, a = new Hit[] {});
return a;
}
a = executeQuery(qdata, prefixMatch, timeout, unit);
if (a.length == 0) {
log.info("No hits: languageCode=[" + languageCode + "], query=["
+ queryStr + "]");
cache.put(cacheKey, a);
return a;
}
/*
* If match all is specified, remove any hits with a term count less
* than the number of search tokens. It's also an optimization to
* run the pruning if we're going to do matchExact.
*/
if ((matchAllTerms || matchExact) && qdata.distinctTermCount() > 1) {
final int nterms = qdata.terms.size();
if (log.isInfoEnabled()) {
log.info("matchAll=true, nterms=" + nterms);
log.info("size before: " + a.length);
}
final Hit[] tmp = new Hit[a.length];
int i = 0;
for (Hit hit : a) {
if (hit.getTermCount() == nterms) {
tmp[i++] = hit;
}
}
if (log.isDebugEnabled()) {
log.debug(i);
}
if (i < a.length) {
a = new Hit[i];
System.arraycopy(tmp, 0, a, 0, i);
}
}
/*
* Delegate match exact to subclasses.
*/
if (matchExact) {
a = matchExact(a, queryStr);
}
if (a.length == 0) {
log.warn("No hits after matchAllTerms pruning: languageCode=[" + languageCode + "], query=["
+ queryStr + "]");
cache.put(cacheKey, a);
return a;
}
/*
* Do regex matching.
*/
if (regex != null) {
final Pattern pattern = Pattern.compile(regex);//, Pattern.CASE_INSENSITIVE);
if (log.isDebugEnabled()) {
log.debug("hits before regex: " + a.length);
}
a = applyRegex(a, pattern);
if (log.isDebugEnabled()) {
log.debug("hits after regex: " + a.length);
}
}
if (a.length == 0) {
log.warn("No hits after regex pruning: languageCode=[" + languageCode + "], query=["
+ queryStr + "], regex=[" + regex + "]");
cache.put(cacheKey, a);
return a;
}
/*
* Rank order the hits by relevance.
*
* @todo consider moving documents through a succession of N pools where
* N is the #of distinct terms in the query. The read tasks would halt
* if the size of the pool for N terms reached maxRank. This might (or
* might not) help with triage since we could process hits by pool and
* only compute the cosines for one pool at a time until we had enough
* hits.
*/
if (log.isInfoEnabled())
log.info("Rank ordering "+a.length+" hits by relevance");
final long start = System.currentTimeMillis();
Arrays.sort(a);
if (log.isInfoEnabled()) {
final long sortTime = System.currentTimeMillis() - start;
log.info("sort time: " + sortTime);
}
for (int i = 0; i < a.length; i++) {
a[i].setRank(i+1);
}
cache.put(cacheKey, a);
}
/*
* Take a slice of the hits based on min/max cosine and min/max rank.
*/
a = slice(query, a);
final long elapsed = System.currentTimeMillis() - begin;
if (log.isInfoEnabled())
log.info("Done: " + a.length + " hits in " + elapsed + "ms");
return a;
}
protected Hit[] slice(final FullTextQuery query, Hit[] a) {
final double minCosine = query.getMinCosine();
final double maxCosine = query.getMaxCosine();
final int minRank = query.getMinRank();
final int maxRank = query.getMaxRank();
// if (log.isDebugEnabled()) {
// log.debug("before min/max cosine/rank pruning:");
// for (Hit h : a)
// log.debug(h);
// }
/*
* If maxCosine is specified, prune the hits that are above the max
*/
if (maxCosine < 1.0d) {
// find the first occurrence of a hit that is <= maxCosine
int i = 0;
for (Hit h : a) {
if (h.getCosine() <= maxCosine)
break;
i++;
}
// no hits with relevance less than maxCosine
if (i == a.length) {
return new Hit[] {};
} else {
// copy the hits from that first occurrence to the end
final Hit[] tmp = new Hit[a.length - i];
System.arraycopy(a, i, tmp, 0, tmp.length);
a = tmp;
}
}
/*
* If minCosine is specified, prune the hits that are below the min
*/
if (minCosine > 0.0d) {
// find the first occurrence of a hit that is < minCosine
int i = 0;
for (Hit h : a) {
if (h.getCosine() < minCosine)
break;
i++;
}
// no hits with relevance greater than minCosine
if (i == 0) {
return new Hit[] {};
} else if (i < a.length) {
// copy the hits from 0 up to that first occurrence
final Hit[] tmp = new Hit[i];
System.arraycopy(a, 0, tmp, 0, tmp.length);
a = tmp;
}
}
// exactly one hit
if (minRank > 0 && minRank == maxRank) {
if (minRank > a.length) {
// out of range
return new Hit[] {};
} else {
// in range
return new Hit[] { a[minRank-1] };
}
}
/*
* If minRank is specified, prune the hits that rank higher than the min
*/
if (minRank > 1) {
// no hits above minRank
if (minRank > a.length) {
return new Hit[] {};
} else {
// copy the hits from the minRank to the end
final Hit[] tmp = new Hit[a.length - (minRank-1)];
System.arraycopy(a, minRank-1, tmp, 0, tmp.length);
a = tmp;
}
}
final int newMax = maxRank-minRank+1;
if (log.isDebugEnabled())
log.debug("new max rank: " + newMax);
/*
* If maxRank is specified, prune the hits that rank lower than the max
*/
if (newMax < a.length) {
// copy the hits from the start up to the new max rank
final Hit[] tmp = new Hit[newMax];
System.arraycopy(a, 0, tmp, 0, tmp.length);
a = tmp;
}
return a;
}
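/*
 * Worked example of the slice logic above (illustrative): given 10 hits in
 * rank order (after any cosine pruning) with minRank=3 and maxRank=5, the
 * minRank branch keeps the hits at ranks 3..10, newMax = 5 - 3 + 1 = 3, and
 * the maxRank branch then truncates to the first 3 of those, i.e. the hits
 * originally ranked 3, 4, and 5.
 */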
protected Hit[] executeQuery(final TermFrequencyData qdata,
final boolean prefixMatch, final long timeout, final TimeUnit unit) {
final IHitCollector hits;
if (qdata.distinctTermCount() == 1) {
final Map.Entry e = qdata.getSingletonEntry();
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
final CountIndexTask task1 = new CountIndexTask(termText, 0, 1,
prefixMatch, md.getLocalTermWeight(), this);
hits = new SingleTokenHitCollector(task1);
} else {
final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(
qdata.distinctTermCount());
int i = 0;
for (Map.Entry e : qdata.terms.entrySet()) {
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
tasks.add(new CountIndexTask(termText, i++, qdata.terms.size(),
prefixMatch, md.getLocalTermWeight(), this));
}
hits = new MultiTokenHitCollector(tasks);
}
// run the queries.
{
final List<ReadIndexTask<V>> tasks = new ArrayList<ReadIndexTask<V>>(
qdata.distinctTermCount());
int i = 0;
for (Map.Entry e : qdata.terms.entrySet()) {
final String termText = e.getKey();
final ITermMetadata md = e.getValue();
tasks.add(new ReadIndexTask(termText, i++, qdata.terms.size(),
prefixMatch, md.getLocalTermWeight(), this, hits));
}
final ExecutionHelper