// org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex (from aem-sdk-api)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.jackrabbit.guava.common.collect.AbstractIterator;
import org.apache.jackrabbit.guava.common.collect.Iterables;
import org.apache.jackrabbit.guava.common.collect.Queues;
import org.apache.jackrabbit.guava.common.collect.Sets;
import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.oak.api.PropertyValue;
import org.apache.jackrabbit.oak.api.Result.SizePrecision;
import org.apache.jackrabbit.oak.plugins.index.cursor.Cursors;
import org.apache.jackrabbit.oak.plugins.index.cursor.PathCursor;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.PathStoredFieldVisitor;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.SpellcheckHelper;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition.IndexingRule;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.SizeEstimator;
import org.apache.jackrabbit.oak.plugins.memory.PropertyValues;
import org.apache.jackrabbit.oak.spi.query.Cursor;
import org.apache.jackrabbit.oak.spi.query.Filter;
import org.apache.jackrabbit.oak.spi.query.Filter.PropertyRestriction;
import org.apache.jackrabbit.oak.spi.query.IndexRow;
import org.apache.jackrabbit.oak.spi.query.QueryConstants;
import org.apache.jackrabbit.oak.spi.query.QueryIndex;
import org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvanceFulltextQueryIndex;
import org.apache.jackrabbit.oak.spi.query.QueryLimits;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.jackrabbit.guava.common.base.Preconditions.checkState;
import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
import static org.apache.jackrabbit.oak.api.Type.STRING;
import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
import static org.apache.jackrabbit.oak.commons.PathUtils.getAncestorPath;
import static org.apache.jackrabbit.oak.commons.PathUtils.getDepth;
import static org.apache.jackrabbit.oak.commons.PathUtils.getName;
import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.VERSION;
import static org.apache.jackrabbit.oak.plugins.index.lucene.TermFactory.newFulltextTerm;
import static org.apache.jackrabbit.oak.plugins.index.lucene.TermFactory.newPathTerm;
import static org.apache.jackrabbit.oak.plugins.index.search.util.IndexHelper.skipTokenization;
import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_PATH;
import static org.apache.lucene.search.BooleanClause.Occur.MUST;
import static org.apache.lucene.search.BooleanClause.Occur.MUST_NOT;
import static org.apache.lucene.search.BooleanClause.Occur.SHOULD;
/**
 * Used to query old (compatVersion 1) Lucene indexes.
 * <p>
 * Provides a QueryIndex that does lookups against a Lucene-based index.
 * <p>
 * To define a lucene index on a subtree you have to add an {@code oak:index} node.
 * <p>
 * Under it follows the index definition node that:
 * <ul>
 * <li>must be of type {@code oak:QueryIndexDefinition}</li>
 * <li>must have the {@code type} property set to <b>{@code lucene}</b></li>
 * <li>must have the {@code async} property set to <b>{@code async}</b></li>
 * </ul>
 * <p>
 * Optionally you can add
 * <ul>
 * <li>what subset of property types to be included in the index via the
 * {@code includePropertyTypes} property</li>
 * <li>a blacklist of property names: what property to be excluded from the
 * index via the {@code excludePropertyNames} property</li>
 * <li>the {@code reindex} flag which when set to {@code true}, triggers a full
 * content re-index.</li>
 * </ul>
 *
 * <pre>{@code
 * {
 *     NodeBuilder index = root.child("oak:index");
 *     index.child("lucene")
 *         .setProperty("jcr:primaryType", "oak:QueryIndexDefinition", Type.NAME)
 *         .setProperty("type", "lucene")
 *         .setProperty("async", "async")
 *         .setProperty("reindex", "true");
 * }
 * }</pre>
 *
 * @see QueryIndex
 */
public class LuceneIndex implements AdvanceFulltextQueryIndex {
    private static final Logger LOG = LoggerFactory.getLogger(LuceneIndex.class);
    // Pseudo property name through which a native Lucene query string is passed.
    public static final String NATIVE_QUERY_FUNCTION = "native*lucene";
    // Fixed lower bound reported by getMinimumCost().
    private static final double MIN_COST = 2.2;
    /**
     * IndexPlan attribute name which refers to the path of the Lucene index to
     * be used to perform the query.
     */
    static final String ATTR_INDEX_PATH = "oak.lucene.indexPath";
    /**
     * Batch size for fetching results from Lucene queries.
     */
    static final int LUCENE_QUERY_BATCH_SIZE = 50;
    // Whether path restrictions are pushed down to Lucene (system property toggle).
    static final boolean USE_PATH_RESTRICTION = Boolean.getBoolean("oak.luceneUsePath");
    // Max number of query restarts tolerated after index version changes
    // (enforced in checkForIndexVersionChange).
    static final int MAX_RELOAD_COUNT = Integer.getInteger("oak.luceneMaxReloadCount", 16);
    protected final IndexTracker tracker;
    // NOTE(review): aggregator is stored but not referenced in the visible part
    // of this class — presumably used by subclasses or aggregation logic elsewhere.
    private final NodeAggregator aggregator;
    // Shared highlighter for rep:excerpt generation; empty pre/post tags, HTML-encoded.
    private final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("", ""),
        new SimpleHTMLEncoder(), null);
    /**
     * Creates a new LuceneIndex.
     *
     * @param tracker provides access to the Lucene index nodes
     * @param aggregator node aggregator
     */
    public LuceneIndex(IndexTracker tracker, NodeAggregator aggregator) {
        this.tracker = tracker;
        this.aggregator = aggregator;
    }
/**
 * Returns the fixed minimum cost estimate for this index.
 */
@Override
public double getMinimumCost() {
    return MIN_COST;
}
/**
 * Returns the type name of this index ("lucene").
 */
@Override
public String getIndexName() {
    return "lucene";
}
/**
 * Returns at most one index plan for the given filter: only queries with a
 * full-text constraint and exactly one relative parent are served; the plan
 * carries the resolved index path in the {@link #ATTR_INDEX_PATH} attribute.
 *
 * Fix: the original chained {@code setDeprecated(defn.isDeprecated())} twice
 * on the same builder; the redundant second call is removed.
 *
 * @param filter the filter to plan for
 * @param sortOrder requested sort order (not used by this index)
 * @param rootState the root node state used to look up the index definition
 * @return a single-element list with the plan, or an empty list if this
 *         index cannot serve the query
 */
@Override
public List getPlans(Filter filter, List sortOrder, NodeState rootState) {
    FullTextExpression ft = filter.getFullTextConstraint();
    if (ft == null) {
        // no full-text condition: don't use this index,
        // as there might be a better one
        return Collections.emptyList();
    }
    String indexPath = LuceneIndexLookupUtil.getOldFullTextIndexPath(rootState, filter, tracker);
    if (indexPath == null) { // unusable index
        return Collections.emptyList();
    }
    Set relPaths = getRelativePaths(ft);
    if (relPaths.size() > 1) {
        LOG.warn("More than one relative parent for query {}", filter.getQueryStatement());
        // there are multiple "parents", as in
        // "contains(a/x, 'hello') and contains(b/x, 'world')"
        return Collections.emptyList();
    }
    LuceneIndexNode node = tracker.acquireIndexNode(indexPath);
    try {
        if (node != null) {
            IndexDefinition defn = node.getDefinition();
            LuceneIndexStatistics stats = node.getIndexStatistics();
            if (stats != null) {
                return Collections.singletonList(planBuilder(filter)
                        .setEstimatedEntryCount(defn.getFulltextEntryCount(stats.numDocs()))
                        .setCostPerExecution(defn.getCostPerExecution())
                        .setCostPerEntry(defn.getCostPerEntry())
                        .setDeprecated(defn.isDeprecated())
                        .setAttribute(ATTR_INDEX_PATH, indexPath)
                        .build());
            }
        }
        //No index node then no plan possible
        return Collections.emptyList();
    } finally {
        if (node != null) {
            node.release();
        }
    }
}
/**
 * Not supported: cost estimation is done via {@code getPlans} because this
 * class implements the advanced query index interface.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public double getCost(Filter filter, NodeState root) {
    throw new UnsupportedOperationException("Not supported as implementing AdvancedQueryIndex");
}
/**
 * Not supported: plan description is provided via
 * {@code getPlanDescription(IndexPlan, NodeState)} instead.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public String getPlan(Filter filter, NodeState root) {
    throw new UnsupportedOperationException("Not supported as implementing AdvancedQueryIndex");
}
/**
 * Builds a human-readable description of the Lucene request that would be
 * executed for the given plan (used for query explanation output). The
 * index node referenced by the plan's {@link #ATTR_INDEX_PATH} attribute is
 * acquired for the duration of the call and released in the finally block.
 */
@Override
public String getPlanDescription(IndexPlan plan, NodeState root) {
    Filter filter = plan.getFilter();
    LuceneIndexNode index = tracker.acquireIndexNode((String) plan.getAttribute(ATTR_INDEX_PATH));
    checkState(index != null, "The Lucene index is not available");
    try {
        FullTextExpression ft = filter.getFullTextConstraint();
        Set relPaths = getRelativePaths(ft);
        if (relPaths.size() > 1) {
            // multiple relative parents: delegate to the multi-index wrapper
            return new MultiLuceneIndex(filter, root, relPaths).getPlan();
        }
        String parent = relPaths.size() == 0 ? "" : relPaths.iterator().next();
        // we only restrict non-full-text conditions if there is
        // no relative property in the full-text constraint
        boolean nonFullTextConstraints = parent.isEmpty();
        // reader is null here: only the textual form of the request is needed
        String planDesc = getLuceneRequest(filter, null, nonFullTextConstraints, index.getDefinition()) + " ft:(" + ft + ")";
        if (!parent.isEmpty()) {
            planDesc += " parent:" + parent;
        }
        return planDesc;
    } finally {
        index.release();
    }
}
/**
 * Not supported: queries are executed via
 * {@code query(IndexPlan, NodeState)} instead.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public Cursor query(final Filter filter, final NodeState root) {
    throw new UnsupportedOperationException("Not supported as implementing AdvancedQueryIndex");
}
/**
 * Executes the given index plan and returns a cursor over the matching
 * paths. Regular full-text queries are fetched from Lucene in batches of
 * growing size; native "spellcheck?"/"suggest?" requests instead produce a
 * single synthetic result row with the (ACL-filtered) suggested words.
 *
 * @param plan the plan produced by {@link #getPlans}; its
 *            {@link #ATTR_INDEX_PATH} attribute names the index to use
 * @param rootState the root node state
 * @return a cursor over the results
 */
@Override
public Cursor query(final IndexPlan plan, NodeState rootState) {
    if (plan.isDeprecated()) {
        LOG.warn("This index is deprecated: {}; it is used for query {}. " +
            "Please change the query or the index definitions.", plan.getPlanName(), plan.getFilter());
    }
    final Filter filter = plan.getFilter();
    FullTextExpression ft = filter.getFullTextConstraint();
    final Set relPaths = getRelativePaths(ft);
    if (relPaths.size() > 1) {
        // multiple relative parents: delegate to the multi-index wrapper
        return new MultiLuceneIndex(filter, rootState, relPaths).query();
    }
    final String parent = relPaths.size() == 0 ? "" : relPaths.iterator().next();
    // we only restrict non-full-text conditions if there is
    // no relative property in the full-text constraint
    final boolean nonFullTextConstraints = parent.isEmpty();
    final int parentDepth = getDepth(parent);
    QueryLimits settings = filter.getQueryLimits();
    LuceneResultRowIterator itr = new LuceneResultRowIterator() {
        // rows fetched from Lucene but not yet handed out by computeNext()
        private final Deque queue = Queues.newArrayDeque();
        // ancestor paths already returned, to suppress duplicates when a
        // relative parent is in play
        private final Set seenPaths = new HashSet<>();
        // last Lucene hit seen; used as the searchAfter() cursor
        private ScoreDoc lastDoc;
        // batch size doubles after each fetch, capped at 100000
        private int nextBatchSize = LUCENE_QUERY_BATCH_SIZE;
        // set to true once a spellcheck/suggest request has been answered
        private boolean noDocs = false;
        private long lastSearchIndexerVersion;
        private int reloadCount;
        @Override
        protected LuceneResultRow computeNext() {
            while (!queue.isEmpty() || loadDocs()) {
                return queue.remove();
            }
            return endOfData();
        }
        @Override
        public int rewoundCount() {
            return reloadCount;
        }
        /**
         * Converts a Lucene hit into a result row, resolving the stored path
         * and — when a relative parent is used — mapping it to the ancestor
         * at the parent depth while suppressing duplicates.
         */
        private LuceneResultRow convertToRow(ScoreDoc doc, IndexSearcher searcher, String excerpt) throws IOException {
            IndexReader reader = searcher.getIndexReader();
            PathStoredFieldVisitor visitor = new PathStoredFieldVisitor();
            reader.document(doc.doc, visitor);
            String path = visitor.getPath();
            if (path != null) {
                if ("".equals(path)) {
                    path = "/";
                }
                if (!parent.isEmpty()) {
                    // TODO OAK-828 this breaks node aggregation
                    // get the base path
                    // ensure the path ends with the given
                    // relative path
                    // if (!path.endsWith("/" + parent)) {
                    // continue;
                    // }
                    path = getAncestorPath(path, parentDepth);
                    // avoid duplicate entries
                    if (seenPaths.contains(path)) {
                        return null;
                    }
                    seenPaths.add(path);
                }
                return new LuceneResultRow(path, doc.score, excerpt);
            }
            return null;
        }
        /**
         * Loads the lucene documents in batches
         * @return true if any document is loaded
         */
        private boolean loadDocs() {
            if (noDocs) {
                return false;
            }
            ScoreDoc lastDocToRecord = null;
            LuceneIndexNode indexNode = tracker.acquireIndexNode((String) plan.getAttribute(ATTR_INDEX_PATH));
            checkState(indexNode != null);
            try {
                IndexSearcher searcher = indexNode.getSearcher();
                LuceneRequestFacade luceneRequestFacade = getLuceneRequest(filter, searcher.getIndexReader(),
                    nonFullTextConstraints, indexNode.getDefinition());
                if (luceneRequestFacade.getLuceneRequest() instanceof Query) {
                    Query query = (Query) luceneRequestFacade.getLuceneRequest();
                    TopDocs docs;
                    long time = System.currentTimeMillis();
                    checkForIndexVersionChange(searcher);
                    // keep fetching until at least one row survives
                    // convertToRow() filtering, or Lucene returns no more hits
                    while (true) {
                        if (lastDoc != null) {
                            LOG.debug("loading the next {} entries for query {}", nextBatchSize, query);
                            docs = searcher.searchAfter(lastDoc, query, nextBatchSize);
                        } else {
                            LOG.debug("loading the first {} entries for query {}", nextBatchSize, query);
                            docs = searcher.search(query, nextBatchSize);
                        }
                        time = System.currentTimeMillis() - time;
                        LOG.debug("... took {} ms", time);
                        nextBatchSize = (int) Math.min(nextBatchSize * 2L, 100000);
                        PropertyRestriction restriction = filter.getPropertyRestriction(QueryConstants.REP_EXCERPT);
                        boolean addExcerpt = restriction != null && restriction.isNotNullRestriction();
                        Analyzer analyzer = indexNode.getDefinition().getAnalyzer();
                        if (addExcerpt) {
                            // setup highlighter
                            QueryScorer scorer = new QueryScorer(query);
                            scorer.setExpandMultiTermQuery(true);
                            highlighter.setFragmentScorer(scorer);
                        }
                        for (ScoreDoc doc : docs.scoreDocs) {
                            String excerpt = null;
                            if (addExcerpt) {
                                excerpt = getExcerpt(analyzer, searcher, doc);
                            }
                            LuceneResultRow row = convertToRow(doc, searcher, excerpt);
                            if (row != null) {
                                queue.add(row);
                            }
                            lastDocToRecord = doc;
                        }
                        if (queue.isEmpty() && docs.scoreDocs.length > 0) {
                            // all hits were filtered out: advance and retry
                            lastDoc = lastDocToRecord;
                        } else {
                            break;
                        }
                    }
                } else if (luceneRequestFacade.getLuceneRequest() instanceof SpellcheckHelper.SpellcheckQuery) {
                    SpellcheckHelper.SpellcheckQuery spellcheckQuery = (SpellcheckHelper.SpellcheckQuery) luceneRequestFacade.getLuceneRequest();
                    noDocs = true;
                    SuggestWord[] suggestWords = SpellcheckHelper.getSpellcheck(spellcheckQuery);
                    // ACL filter spellchecks
                    Collection suggestedWords = new ArrayList(suggestWords.length);
                    QueryParser qp = new QueryParser(Version.LUCENE_47, FieldNames.SUGGEST, indexNode.getDefinition().getAnalyzer());
                    for (SuggestWord suggestion : suggestWords) {
                        Query query = qp.createPhraseQuery(FieldNames.SUGGEST, suggestion.string);
                        TopDocs topDocs = searcher.search(query, 100);
                        if (topDocs.totalHits > 0) {
                            for (ScoreDoc doc : topDocs.scoreDocs) {
                                Document retrievedDoc = searcher.doc(doc.doc);
                                // keep the word only if at least one document
                                // containing it is readable by the session
                                if (filter.isAccessible(retrievedDoc.get(FieldNames.PATH))) {
                                    suggestedWords.add(suggestion.string);
                                    break;
                                }
                            }
                        }
                    }
                    queue.add(new LuceneResultRow(suggestedWords));
                } else if (luceneRequestFacade.getLuceneRequest() instanceof SuggestHelper.SuggestQuery) {
                    SuggestHelper.SuggestQuery suggestQuery = (SuggestHelper.SuggestQuery) luceneRequestFacade.getLuceneRequest();
                    noDocs = true;
                    List lookupResults = SuggestHelper.getSuggestions(indexNode.getLookup(), suggestQuery);
                    // ACL filter suggestions
                    Collection suggestedWords = new ArrayList(lookupResults.size());
                    QueryParser qp = new QueryParser(Version.LUCENE_47, FieldNames.FULLTEXT, indexNode.getDefinition().getAnalyzer());
                    for (Lookup.LookupResult suggestion : lookupResults) {
                        Query query = qp.createPhraseQuery(FieldNames.FULLTEXT, suggestion.key.toString());
                        TopDocs topDocs = searcher.search(query, 100);
                        if (topDocs.totalHits > 0) {
                            for (ScoreDoc doc : topDocs.scoreDocs) {
                                Document retrievedDoc = searcher.doc(doc.doc);
                                // keep the suggestion only if at least one
                                // matching document is readable by the session
                                if (filter.isAccessible(retrievedDoc.get(FieldNames.PATH))) {
                                    suggestedWords.add("{term=" + suggestion.key + ",weight=" + suggestion.value + "}");
                                    break;
                                }
                            }
                        }
                    }
                    queue.add(new LuceneResultRow(suggestedWords));
                }
            } catch (IOException e) {
                LOG.warn("query via {} failed.", LuceneIndex.this, e);
            } finally {
                indexNode.release();
            }
            if (lastDocToRecord != null) {
                this.lastDoc = lastDocToRecord;
            }
            return !queue.isEmpty();
        }
        /**
         * Detects a change of the underlying index version between batches.
         * On change, the searchAfter cursor is reset (the query restarts from
         * the beginning); more than MAX_RELOAD_COUNT restarts abort the query.
         */
        private void checkForIndexVersionChange(IndexSearcher searcher) {
            long currentVersion = LucenePropertyIndex.getVersion(searcher);
            if (currentVersion != lastSearchIndexerVersion && lastDoc != null) {
                reloadCount++;
                if (reloadCount > MAX_RELOAD_COUNT) {
                    LOG.error("More than {} index version changes detected for query {}",
                        MAX_RELOAD_COUNT,
                        plan);
                    throw new IllegalStateException("Too many version changes");
                }
                lastDoc = null;
                LOG.info("Change in index version detected {} => {}. Query would be performed without " +
                    "offset; reload {}", currentVersion, lastSearchIndexerVersion, reloadCount);
            }
            this.lastSearchIndexerVersion = currentVersion;
        }
    };
    // estimates the result size by counting all hits of the query
    SizeEstimator sizeEstimator = new SizeEstimator() {
        @Override
        public long getSize() {
            LuceneIndexNode indexNode = tracker.acquireIndexNode((String) plan.getAttribute(ATTR_INDEX_PATH));
            checkState(indexNode != null);
            try {
                IndexSearcher searcher = indexNode.getSearcher();
                LuceneRequestFacade luceneRequestFacade = getLuceneRequest(filter, searcher.getIndexReader(),
                    nonFullTextConstraints, indexNode.getDefinition());
                if (luceneRequestFacade.getLuceneRequest() instanceof Query) {
                    Query query = (Query) luceneRequestFacade.getLuceneRequest();
                    TotalHitCountCollector collector = new TotalHitCountCollector();
                    searcher.search(query, collector);
                    int totalHits = collector.getTotalHits();
                    LOG.debug("Estimated size for query {} is {}", query, totalHits);
                    return totalHits;
                }
                LOG.debug("Estimated size: not a Query: {}", luceneRequestFacade.getLuceneRequest());
            } catch (IOException e) {
                LOG.warn("query via {} failed.", LuceneIndex.this, e);
            } finally {
                indexNode.release();
            }
            // -1 signals "size unknown"
            return -1;
        }
    };
    return new LucenePathCursor(itr, settings, sizeEstimator, filter);
}
/**
 * Builds a highlighted excerpt for the given hit by running the shared
 * highlighter over the document's full-text / analyzed fields. Fragments
 * are joined with "..."; only the first field yielding fragments is used.
 *
 * Fix: corrected the typo in the error log message ("higlighting").
 *
 * @param analyzer analyzer used to re-tokenize the stored field text
 * @param searcher searcher the hit originates from
 * @param doc the hit to build the excerpt for
 * @return the excerpt text, possibly empty
 * @throws IOException if reading the document fails
 */
private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException {
    StringBuilder excerpt = new StringBuilder();
    for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
        String name = field.name();
        // only full text or analyzed fields
        if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
            String text = field.stringValue();
            // NOTE(review): assumes the highlighter consumes and closes the
            // token stream (Lucene Highlighter behavior) — confirm for this version
            TokenStream tokenStream = analyzer.tokenStream(name, text);
            try {
                TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 2);
                if (textFragments != null && textFragments.length > 0) {
                    for (TextFragment fragment : textFragments) {
                        if (excerpt.length() > 0) {
                            excerpt.append("...");
                        }
                        excerpt.append(fragment.toString());
                    }
                    break;
                }
            } catch (InvalidTokenOffsetsException e) {
                LOG.error("highlighting failed", e);
            }
        }
    }
    return excerpt.toString();
}
/**
 * Creates a plan builder pre-populated with the defaults shared by all plans
 * of this index: local execution, per-entry cost of 1, full-text, delayed
 * (async), and no node data included.
 */
protected static IndexPlan.Builder planBuilder(Filter filter) {
    IndexPlan.Builder builder = new IndexPlan.Builder();
    builder.setCostPerExecution(0); // we're local. Low-cost
    builder.setCostPerEntry(1);
    builder.setFilter(filter);
    builder.setFulltextIndex(true);
    builder.setEstimatedEntryCount(0); // TODO Fake it to provide constant cost for now
    builder.setIncludesNodeData(false); // we should not include node data
    builder.setDelayed(true); // Lucene is always async
    return builder;
}
/**
 * Get the set of relative paths of a full-text condition. For example, for
 * the condition "contains(a/b, 'hello') and contains(c/d, 'world'), the set
 * { "a", "c" } is returned. If there are no relative properties, then one
 * entry is returned (the empty string). If there is no expression, then an
 * empty set is returned.
 *
 * @param ft the full-text expression
 * @return the set of relative paths (possibly empty)
 */
private static Set getRelativePaths(FullTextExpression ft) {
    if (ft == null) {
        // there might be no full-text constraint when using the
        // LowCostLuceneIndexProvider which is used for testing
        // TODO if the LowCostLuceneIndexProvider is removed, we should do
        // the following instead:
        // throw new
        // IllegalStateException("Lucene index is used even when no full-text conditions are used for filter "
        // + filter);
        return Collections.emptySet();
    }
    final HashSet relPaths = new HashSet();
    // the visitor collects one entry per term: "" for non-relative
    // properties, the parent path for relative ones
    ft.accept(new FullTextVisitor.FullTextVisitorBase() {
        @Override
        public boolean visit(FullTextTerm term) {
            String p = term.getPropertyName();
            if (p == null) {
                relPaths.add("");
            } else if (p.startsWith("../") || p.startsWith("./")) {
                throw new IllegalArgumentException("Relative parent is not supported:" + p);
            } else if (getDepth(p) > 1) {
                // relative property such as "a/x": remember parent "a"
                String parent = getParentPath(p);
                relPaths.add(parent);
            } else {
                relPaths.add("");
            }
            return true;
        }
    });
    return relPaths;
}
/**
 * Get the Lucene request for the given filter: either a regular Lucene
 * {@code Query} (full-text constraint plus, optionally, non-full-text
 * constraints), or a spellcheck/suggest request wrapped in a
 * {@code LuceneRequestFacade} when the native query function is used.
 *
 * @param filter the filter, including full-text constraint
 * @param reader the Lucene reader (may be null, e.g. for plan description)
 * @param nonFullTextConstraints whether non-full-text constraints (such as
 *            path, node type, and so on) should be added to the Lucene
 *            query
 * @param indexDefinition the index definition
 * @return the Lucene request
 */
private static LuceneRequestFacade getLuceneRequest(Filter filter, IndexReader reader, boolean nonFullTextConstraints,
    LuceneIndexDefinition indexDefinition) {
    List qs = new ArrayList();
    Analyzer analyzer = indexDefinition.getAnalyzer();
    FullTextExpression ft = filter.getFullTextConstraint();
    if (ft == null) {
        // there might be no full-text constraint
        // when using the LowCostLuceneIndexProvider
        // which is used for testing
    } else {
        qs.add(getFullTextQuery(ft, analyzer, reader));
    }
    PropertyRestriction pr = filter.getPropertyRestriction(NATIVE_QUERY_FUNCTION);
    if (pr != null) {
        // native query: the raw query string is the restriction's value
        String query = String.valueOf(pr.first.getValue(pr.first.getType()));
        QueryParser queryParser = new QueryParser(VERSION, "", indexDefinition.getAnalyzer());
        if (query.startsWith("mlt?")) {
            String mltQueryString = query.replace("mlt?", "");
            if (reader != null) {
                Query moreLikeThis = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
                if (moreLikeThis != null) {
                    qs.add(moreLikeThis);
                }
            }
        }
        // NOTE(review): an "mlt?" query matches neither branch below and so
        // also reaches the final else, where the raw string is additionally
        // parsed by the query parser — confirm this is intended
        if (query.startsWith("spellcheck?")) {
            String spellcheckQueryString = query.replace("spellcheck?", "");
            if (reader != null) {
                return new LuceneRequestFacade(SpellcheckHelper.getSpellcheckQuery(spellcheckQueryString, reader));
            }
        } else if (query.startsWith("suggest?")) {
            String suggestQueryString = query.replace("suggest?", "");
            if (reader != null) {
                return new LuceneRequestFacade(SuggestHelper.getSuggestQuery(suggestQueryString));
            }
        } else {
            try {
                qs.add(queryParser.parse(query));
            } catch (ParseException e) {
                throw new RuntimeException(e);
            }
        }
    } else if (nonFullTextConstraints) {
        addNonFullTextConstraints(qs, filter, reader, analyzer,
            indexDefinition);
    }
    if (qs.size() == 0) {
        // no constraints at all: match everything
        return new LuceneRequestFacade(new MatchAllDocsQuery());
    }
    return LucenePropertyIndex.performAdditionalWraps(qs);
}
/**
 * Adds Lucene queries for the filter's non-full-text constraints (node
 * type, path restriction, and property restrictions) to the given list.
 * Each added query is ANDed with the others by the caller.
 *
 * @param qs the list of queries to append to
 * @param filter the filter to translate
 * @param reader the Lucene reader (may be null during plan description)
 * @param analyzer analyzer used to tokenize property values
 * @param indexDefinition the index definition (nt:base rule is used)
 */
private static void addNonFullTextConstraints(List qs,
    Filter filter, IndexReader reader, Analyzer analyzer, IndexDefinition indexDefinition) {
    if (!filter.matchesAllTypes()) {
        addNodeTypeConstraints(qs, filter);
    }
    String path = filter.getPath();
    switch (filter.getPathRestriction()) {
    case ALL_CHILDREN:
        // descendant restriction: prefix query on the path (optional feature)
        if (USE_PATH_RESTRICTION) {
            if ("/".equals(path)) {
                break;
            }
            if (!path.endsWith("/")) {
                path += "/";
            }
            qs.add(new PrefixQuery(newPathTerm(path)));
        }
        break;
    case DIRECT_CHILDREN:
        // NOTE(review): same prefix query as ALL_CHILDREN — matches all
        // descendants, not only direct children; presumably post-filtered
        // by the query engine
        if (USE_PATH_RESTRICTION) {
            if (!path.endsWith("/")) {
                path += "/";
            }
            qs.add(new PrefixQuery(newPathTerm(path)));
        }
        break;
    case EXACT:
        qs.add(new TermQuery(newPathTerm(path)));
        break;
    case PARENT:
        if (denotesRoot(path)) {
            // there's no parent of the root node
            // we add a path that can not possibly occur because there
            // is no way to say "match no documents" in Lucene
            qs.add(new TermQuery(new Term(FieldNames.PATH, "///")));
        } else {
            qs.add(new TermQuery(newPathTerm(getParentPath(path))));
        }
        break;
    case NO_RESTRICTION:
        break;
    }
    //Fulltext index definition used by LuceneIndex only works with old format
    //which is not nodeType based. So just use the nt:base index
    IndexingRule rule = indexDefinition.getApplicableIndexingRule(JcrConstants.NT_BASE);
    for (PropertyRestriction pr : filter.getPropertyRestrictions()) {
        if (pr.first == null && pr.last == null) {
            // we only support equality or range queries,
            // but not "in", "is null", "is not null"
            // queries (OAK-1208)
            continue;
        }
        // check excluded properties and types
        if (isExcludedProperty(pr, rule)) {
            continue;
        }
        String name = pr.propertyName;
        // pseudo-properties handled elsewhere or not applicable here
        if (QueryConstants.REP_EXCERPT.equals(name) || QueryConstants.OAK_SCORE_EXPLANATION.equals(name) || QueryConstants.REP_FACET.equals(name)) {
            continue;
        }
        if (JCR_PRIMARYTYPE.equals(name)) {
            continue;
        }
        if (QueryConstants.RESTRICTION_LOCAL_NAME.equals(name)) {
            continue;
        }
        if (skipTokenization(name)) {
            // property indexed verbatim: exact term query, no analysis
            qs.add(new TermQuery(new Term(name, pr.first
                .getValue(STRING))));
            continue;
        }
        String first = null;
        String last = null;
        boolean isLike = pr.isLike;
        // TODO what to do with escaped tokens?
        if (pr.first != null) {
            first = pr.first.getValue(STRING);
            first = first.replace("\\", "");
        }
        if (pr.last != null) {
            last = pr.last.getValue(STRING);
            last = last.replace("\\", "");
        }
        if (isLike) {
            // Note: the code below has two problems:
            // - Does not deal with escaped wildcard characters (OAK-9885).
            // - Does not apply the prefix query optimization: the first block of the if condition below is
            //   never executed because the guard condition is always false (OAK-9881).
            // The correct logic is in LucenePropertyIndex#createLikeQuery.
            // Leaving the code as it is, because this class is deprecated, as it is just for compatVersion=1
            first = first.replace('%', WildcardQuery.WILDCARD_STRING);
            first = first.replace('_', WildcardQuery.WILDCARD_CHAR);
            int indexOfWS = first.indexOf(WildcardQuery.WILDCARD_STRING);
            int indexOfWC = first.indexOf(WildcardQuery.WILDCARD_CHAR);
            int len = first.length();
            if (indexOfWS == len || indexOfWC == len) {
                // remove trailing "*" for prefix query
                first = first.substring(0, first.length() - 1);
                if (JCR_PATH.equals(name)) {
                    qs.add(new PrefixQuery(newPathTerm(first)));
                } else {
                    qs.add(new PrefixQuery(new Term(name, first)));
                }
            } else {
                if (JCR_PATH.equals(name)) {
                    qs.add(new WildcardQuery(newPathTerm(first)));
                } else {
                    qs.add(new WildcardQuery(new Term(name, first)));
                }
            }
            continue;
        }
        if (first != null && first.equals(last) && pr.firstIncluding
            && pr.lastIncluding) {
            // equality restriction
            if (JCR_PATH.equals(name)) {
                qs.add(new TermQuery(newPathTerm(first)));
            } else {
                if ("*".equals(name)) {
                    // jcr:contains-style reference lookup across all fields
                    addReferenceConstraint(first, qs, reader);
                } else {
                    for (String t : tokenize(first, analyzer)) {
                        qs.add(new TermQuery(new Term(name, t)));
                    }
                }
            }
            continue;
        }
        // range restriction: use the first token of each bound
        first = tokenizeAndPoll(first, analyzer);
        last = tokenizeAndPoll(last, analyzer);
        qs.add(TermRangeQuery.newStringRange(name, first, last,
            pr.firstIncluding, pr.lastIncluding));
    }
}
/**
 * Analyzes the given text and returns its first token; returns the input
 * unchanged when it is null or the analyzer produces no tokens.
 */
private static String tokenizeAndPoll(String token, Analyzer analyzer) {
    if (token == null) {
        return null;
    }
    List parts = tokenize(token, analyzer);
    return parts.isEmpty() ? token : (String) parts.get(0);
}
/**
 * Decides whether a property restriction cannot be handled by this index:
 * relative (child-level) properties, properties that are unknown to or not
 * indexed by the rule, and values whose type is not in the rule's included
 * property types.
 *
 * @return true if the restriction must be skipped
 */
private static boolean isExcludedProperty(PropertyRestriction pr,
    IndexingRule rule) {
    String propName = pr.propertyName;
    // lucene cannot handle child-level property restrictions
    if (propName.contains("/")) {
        return true;
    }
    // the property must be known to the rule and marked as indexed
    PropertyDefinition config = rule.getConfig(propName);
    if (config == null || !config.index) {
        return true;
    }
    // derive the restriction's value type, if any value is available
    Integer typeTag = null;
    if (pr.first != null) {
        typeTag = pr.first.getType().tag();
    } else if (pr.last != null) {
        typeTag = pr.last.getType().tag();
    } else if (pr.list != null && !pr.list.isEmpty()) {
        typeTag = pr.list.get(0).getType().tag();
    }
    return typeTag != null && !includePropertyType(typeTag, rule);
}
/**
 * Checks whether the given JCR property type tag is part of the rule's
 * included property-type bitmask; a negative mask means "include none".
 */
private static boolean includePropertyType(int type, IndexingRule rule) {
    return rule.propertyTypes >= 0
        && (rule.propertyTypes & (1 << type)) != 0;
}
/**
 * Adds a reference lookup for the given uuid: an OR over all indexed fields
 * when a reader is available, or a placeholder term query during plan
 * computation (reader == null).
 */
private static void addReferenceConstraint(String uuid, List qs,
    IndexReader reader) {
    if (reader == null) {
        // getPlan call
        qs.add(new TermQuery(new Term("*", uuid)));
        return;
    }
    // reference query: match the uuid in any indexed field
    BooleanQuery referenceQuery = new BooleanQuery();
    for (Object fieldName : MultiFields.getIndexedFields(reader)) {
        referenceQuery.add(new TermQuery(new Term((String) fieldName, uuid)), SHOULD);
    }
    qs.add(referenceQuery);
}
/**
 * Adds a node-type constraint: an OR over all primary and mixin types the
 * filter allows.
 */
private static void addNodeTypeConstraints(List qs, Filter filter) {
    BooleanQuery typeQuery = new BooleanQuery();
    for (String primaryType : filter.getPrimaryTypes()) {
        typeQuery.add(new TermQuery(new Term(JCR_PRIMARYTYPE, primaryType)), SHOULD);
    }
    for (String mixinType : filter.getMixinTypes()) {
        typeQuery.add(new TermQuery(new Term(JCR_MIXINTYPES, mixinType)), SHOULD);
    }
    qs.add(typeQuery);
}
/**
 * Translates a parsed full-text expression into an equivalent Lucene query:
 * OR/AND nodes become BooleanQueries, terms become term/wildcard/phrase
 * queries via {@link #tokenToQuery}.
 *
 * @param ft the full-text expression to translate
 * @param analyzer analyzer used to tokenize terms
 * @param reader the Lucene reader (passed through to tokenToQuery)
 * @return the Lucene query, or null if a term produced no query
 */
static Query getFullTextQuery(FullTextExpression ft, final Analyzer analyzer, final IndexReader reader) {
    // a reference to the query, so it can be set in the visitor
    // (a "non-local return")
    final AtomicReference result = new AtomicReference();
    ft.accept(new FullTextVisitor() {
        @Override
        public boolean visit(FullTextContains contains) {
            return contains.getBase().accept(this);
        }
        @Override
        public boolean visit(FullTextOr or) {
            BooleanQuery q = new BooleanQuery();
            for (FullTextExpression e : or.list) {
                Query x = getFullTextQuery(e, analyzer, reader);
                q.add(x, SHOULD);
            }
            result.set(q);
            return true;
        }
        @Override
        public boolean visit(FullTextAnd and) {
            BooleanQuery q = new BooleanQuery();
            for (FullTextExpression e : and.list) {
                Query x = getFullTextQuery(e, analyzer, reader);
                /* Only unwrap the clause if MUST_NOT(x) */
                boolean hasMustNot = false;
                if (x instanceof BooleanQuery) {
                    BooleanQuery bq = (BooleanQuery) x;
                    if ((bq.getClauses().length == 1) &&
                        (bq.getClauses()[0].getOccur() == Occur.MUST_NOT)) {
                        // flatten a single MUST_NOT clause into this query
                        hasMustNot = true;
                        q.add(bq.getClauses()[0]);
                    }
                }
                if (!hasMustNot) {
                    q.add(x, MUST);
                }
            }
            result.set(q);
            return true;
        }
        @Override
        public boolean visit(FullTextTerm term) {
            return visitTerm(term.getPropertyName(), term.getText(), term.getBoost(), term.isNot());
        }
        private boolean visitTerm(String propertyName, String text, String boost, boolean not) {
            String p = propertyName;
            if (p != null && p.indexOf('/') >= 0) {
                // for relative properties only the local name is used
                p = getName(p);
            }
            Query q = tokenToQuery(text, p, analyzer, reader);
            if (q == null) {
                return false;
            }
            if (boost != null) {
                q.setBoost(Float.parseFloat(boost));
            }
            if (not) {
                // negated term: wrap in a MUST_NOT boolean query
                BooleanQuery bq = new BooleanQuery();
                bq.add(q, MUST_NOT);
                result.set(bq);
            } else {
                result.set(q);
            }
            return true;
        }
    });
    return result.get();
}
/**
 * Converts the analyzed form of a full-text term into a Lucene query:
 * a single token yields a term query (or a wildcard query if it contains
 * '*' or '?'); multiple tokens yield a conjunction of term/wildcard queries
 * when wildcards are present, otherwise an exact phrase query.
 *
 * @return the query, or null when no analyzer is available
 */
static Query tokenToQuery(String text, String fieldName, Analyzer analyzer, IndexReader reader) {
    if (analyzer == null) {
        return null;
    }
    List tokens = tokenize(text, analyzer);
    if (tokens.isEmpty()) {
        // TODO what should be returned in the case there are no tokens?
        return new BooleanQuery();
    }
    if (tokens.size() == 1) {
        String single = (String) tokens.get(0);
        return hasFulltextToken(single)
            ? new WildcardQuery(newFulltextTerm(single, fieldName))
            : new TermQuery(newFulltextTerm(single, fieldName));
    }
    if (hasFulltextToken(tokens)) {
        // at least one token carries a wildcard: AND the per-token queries
        BooleanQuery conjunction = new BooleanQuery();
        for (Object o : tokens) {
            String token = (String) o;
            Query clause = hasFulltextToken(token)
                ? new WildcardQuery(newFulltextTerm(token, fieldName))
                : new TermQuery(newFulltextTerm(token, fieldName));
            conjunction.add(clause, Occur.MUST);
        }
        return conjunction;
    }
    // several plain tokens: match them as an exact phrase
    PhraseQuery phrase = new PhraseQuery();
    for (Object o : tokens) {
        phrase.add(newFulltextTerm((String) o, fieldName));
    }
    return phrase;
}
/**
 * Checks whether any of the given tokens contains a full-text wildcard
 * character ('*' or '?').
 *
 * @param tokens the tokens to check
 * @return {@code true} if at least one token contains a wildcard
 */
private static boolean hasFulltextToken(List<String> tokens) {
    for (String token : tokens) {
        if (hasFulltextToken(token)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether the given token contains a full-text wildcard
 * character ('*' or '?').
 *
 * @param token the token to inspect
 * @return {@code true} if the token contains a wildcard character
 */
private static boolean hasFulltextToken(String token) {
    for (int i = 0; i < fulltextTokens.length; i++) {
        if (token.indexOf(fulltextTokens[i]) >= 0) {
            return true;
        }
    }
    return false;
}
private static char[] fulltextTokens = new char[] { '*', '?' };
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * Analyzers typically strip wildcard characters. This method walks the
 * token offsets and re-attaches any '*' / '?' characters found in the
 * gaps between (or after) the analyzed tokens, so that wildcard queries
 * survive tokenization.
 *
 * @param text the raw full-text search string
 * @param analyzer the analyzer used to split the text into tokens
 * @return the list of tokens with wildcards re-attached, or
 *         {@code null} if tokenization failed with an I/O error
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                new StringReader(text));
        CharTermAttribute termAtt = stream
                .addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream
                .addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        // poz tracks how far into the original text we have consumed
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                // the analyzer skipped characters: keep any wildcard
                // characters found in the gap and glue them onto the
                // current token
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                // a wildcard joins this term to the pending token
                token.append(term);
                hasFulltextToken = false;
            } else {
                // flush the previous token and start a new one
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string: trailing wildcards belong
        // to the last token
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        // pass the exception itself so the stack trace is logged; the
        // previous code passed e.getMessage() as a parameterized-logging
        // argument, which was silently dropped (no '{}' placeholder)
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore: failure to close after full consumption is harmless
        }
    }
    return tokens;
}
/**
 * Returns the node aggregator configured for this index
 * ({@code aggregator} field), as required by the index interface.
 */
@Override
public NodeAggregator getNodeAggregator() {
    return aggregator;
}
/**
 * One search hit: the repository path of the matching node together with
 * its score and an optional excerpt. A "virtual" row carries
 * suggestion/spellcheck words instead of referring to a real node.
 */
static class LuceneResultRow {
    final String path;
    final double score;
    final Iterable suggestWords;
    final boolean isVirtual;
    final String excerpt;

    /** Creates a regular (non-virtual) row for a matched node. */
    LuceneResultRow(String path, double score, String excerpt) {
        this.path = path;
        this.score = score;
        this.excerpt = excerpt;
        this.suggestWords = Collections.emptySet();
        this.isVirtual = false;
    }

    /** Creates a virtual row that only carries suggestion words. */
    LuceneResultRow(Iterable suggestWords) {
        this.path = "/";
        this.score = 1.0d;
        this.excerpt = null;
        this.suggestWords = suggestWords;
        this.isVirtual = true;
    }

    @Override
    public String toString() {
        return String.format("%s (%1.2f)", path, score);
    }
}
/**
 * A cursor over Lucene results. The result includes the path,
 * and the jcr:score pseudo-property as returned by Lucene.
 */
static class LucenePathCursor implements Cursor {

    // emit a traversal warning every this many rows read from the index
    // (configurable via the "oak.traversing.warning" system property)
    private final int TRAVERSING_WARNING = Integer.getInteger("oak.traversing.warning", 10000);

    private final Cursor pathCursor;
    // the row most recently produced by the iterator; read by the
    // IndexRow wrapper returned from next() to answer score/suggest/
    // excerpt pseudo-columns
    LuceneResultRow currentRow;
    private final SizeEstimator sizeEstimator;
    // cached result of sizeEstimator.getSize(); 0 means "not computed yet"
    private long estimatedSize;

    LucenePathCursor(final LuceneResultRowIterator it, QueryLimits settings, SizeEstimator sizeEstimator, Filter filter) {
        this.sizeEstimator = sizeEstimator;
        // adapt the LuceneResultRow iterator to a plain path iterator,
        // remembering the current row as a side effect of next()
        Iterator pathIterator = new Iterator() {

            private int readCount;
            private int rewoundCount;

            @Override
            public boolean hasNext() {
                return it.hasNext();
            }

            @Override
            public String next() {
                // if the underlying iterator was rewound, restart the
                // read counter so warnings/limits start over
                if (it.rewoundCount() > rewoundCount) {
                    readCount = 0;
                    rewoundCount = it.rewoundCount();
                }
                currentRow = it.next();
                readCount++;
                if (readCount % TRAVERSING_WARNING == 0) {
                    Cursors.checkReadLimit(readCount, settings);
                    if (readCount == 2 * TRAVERSING_WARNING) {
                        // log the call stack once to help identify the query
                        LOG.warn("Index-Traversed {} nodes with filter {}", readCount, filter,
                                new Exception("call stack"));
                    } else {
                        LOG.warn("Index-Traversed {} nodes with filter {}", readCount, filter);
                    }
                }
                return currentRow.path;
            }

            @Override
            public void remove() {
                it.remove();
            }

        };
        pathCursor = new PathCursor(pathIterator, true, settings);
    }

    @Override
    public boolean hasNext() {
        return pathCursor.hasNext();
    }

    @Override
    public void remove() {
        pathCursor.remove();
    }

    @Override
    public IndexRow next() {
        final IndexRow pathRow = pathCursor.next();
        // wrap the plain path row: score, spellcheck/suggest and excerpt
        // pseudo-columns are answered from the current Lucene result row
        return new IndexRow() {

            @Override
            public boolean isVirtualRow() {
                return currentRow.isVirtual;
            }

            @Override
            public String getPath() {
                return pathRow.getPath();
            }

            @Override
            public PropertyValue getValue(String columnName) {
                // overlay the score
                if (QueryConstants.JCR_SCORE.equals(columnName)) {
                    return PropertyValues.newDouble(currentRow.score);
                }
                if (QueryConstants.REP_SPELLCHECK.equals(columnName) || QueryConstants.REP_SUGGEST.equals(columnName)) {
                    return PropertyValues.newString(Iterables.toString(currentRow.suggestWords));
                }
                if (QueryConstants.REP_EXCERPT.equals(columnName)) {
                    return PropertyValues.newString(currentRow.excerpt);
                }
                // everything else is delegated to the underlying row
                return pathRow.getValue(columnName);
            }

        };
    }

    @Override
    public long getSize(SizePrecision precision, long max) {
        // lazily compute and cache the size estimate
        if (estimatedSize != 0) {
            return estimatedSize;
        }
        return estimatedSize = sizeEstimator.getSize();
    }
}
/**
 * An iterator over {@link LuceneResultRow}s that additionally reports
 * how often it has been rewound (restarted from the beginning), so that
 * consumers such as {@link LucenePathCursor} can reset their read
 * counters accordingly.
 */
static abstract class LuceneResultRowIterator extends AbstractIterator<LuceneResultRow> {
    /**
     * @return the number of times this iterator was rewound
     */
    abstract int rewoundCount();
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy