org.apache.solr.handler.tagger.TaggerRequestHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Show all versions of solr-core Show documentation
Apache Solr (module: core)
/*
* This software was produced for the U. S. Government
* under Contract No. W15P7T-11-C-F600, and is
* subject to the Rights in Noncommercial Computer Software
* and Noncommercial Computer Software Documentation
* Clause 252.227-7014 (JUN 1995)
*
* Copyright 2013 The MITRE Corporation. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.tagger;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import com.google.common.io.CharStreams;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.search.SyntaxError;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Scans posted text, looking for matching strings in the Solr index.
* The public static final String members are request parameters.
* This handler is also called the "SolrTextTagger".
*
* @since 7.4.0
*/
public class TaggerRequestHandler extends RequestHandlerBase {
/**
 * Request parameter whose value is handed to {@code chooseTagClusterReducer} to select the
 * {@code TagClusterReducer} the tagger runs with.
 */
public static final String OVERLAPS = "overlaps";
/**
 * Request parameter (integer, read with a default of 1000). Once this many tags have been
 * collected for a request, further matches reported by the tagger are not added.
 */
public static final String TAGS_LIMIT = "tagsLimit";
/**
 * Request parameter (boolean, default {@code false}). When true, each tag also gets a
 * {@code "matchText"} entry holding the substring of the input between the tag's start and
 * end offsets, which requires the posted input to be read fully into a String first.
 */
public static final String MATCH_TEXT = "matchText";
/**
 * Request parameter (boolean, default {@code false}); the value is forwarded to the
 * {@code Tagger} constructor.
 */
public static final String SKIP_ALT_TOKENS = "skipAltTokens";
/**
 * Request parameter (boolean); the value is forwarded to the {@code Tagger} constructor.
 * When the parameter is absent, the default is whatever {@code fieldHasIndexedStopFilter}
 * returns for the requested field.
 */
public static final String IGNORE_STOPWORDS = "ignoreStopwords";
/**
 * Request parameter.
 * NOTE(review): presumably consulted by {@code getOffsetCorrector} when deciding whether tag
 * offsets must be corrected for XML input -- confirm against that method.
 */
public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust";
/** Class logger; the owning class is resolved through the {@code MethodHandles} lookup. */
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
 * Supplies the one-line, human-readable summary of this request handler.
 *
 * @return a fixed description string; never {@code null}
 */
@Override
public String getDescription() {
  final String description = "Processes input text to find matching tokens stored in the index.";
  return description;
}
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
//--Read params
final String indexedField = req.getParams().get("field");
if (indexedField == null)
throw new RuntimeException("required param 'field'");
final TagClusterReducer tagClusterReducer =
chooseTagClusterReducer(req.getParams().get(OVERLAPS));
final int rows = req.getParams().getInt(CommonParams.ROWS, 10000);
final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000);
final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false);
final SchemaField idSchemaField = req.getSchema().getUniqueKeyField();
if (idSchemaField == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a" +
"uniqueKey in the schema.");//TODO this could be relaxed
}
final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false);
final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS,
fieldHasIndexedStopFilter(indexedField, req));
//--Get posted data
Reader inputReader = null;
Iterable streams = req.getContentStreams();
if (streams != null) {
Iterator iter = streams.iterator();
if (iter.hasNext()) {
inputReader = iter.next().getReader();
}
if (iter.hasNext()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass().getSimpleName()+" does not support multiple ContentStreams"); //TODO support bulk tagging?
}
}
if (inputReader == null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass().getSimpleName()+" requires text to be POSTed to it");
}
// We may or may not need to read the input into a string
final InputStringLazy inputStringFuture = new InputStringLazy(inputReader);
final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture);
final String inputString;//only populated if needed
if (addMatchText || inputStringFuture.inputString != null) {
//Read the input fully into a String buffer that we'll need later,
// then replace the input with a reader wrapping the buffer.
inputString = inputStringFuture.call();
inputReader.close();
inputReader = new StringReader(inputString);
} else {
inputString = null;//not used
}
final SolrIndexSearcher searcher = req.getSearcher();
final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc());
final List tags = new ArrayList(2000);
try {
Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer();
try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) {
Terms terms = searcher.getSlowAtomicReader().terms(indexedField);
if (terms == null)
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"field " + indexedField + " has no indexed data");
Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer,
skipAltTokens, ignoreStopWords) {
@SuppressWarnings("unchecked")
@Override
protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
if (tags.size() >= tagsLimit)
return;
if (offsetCorrector != null) {
int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset);
if (offsetPair == null) {
log.debug("Discarded offsets [{}, {}] because couldn't balance XML.",
startOffset, endOffset);
return;
}
startOffset = offsetPair[0];
endOffset = offsetPair[1];
}
NamedList tag = new NamedList();
tag.add("startOffset", startOffset);
tag.add("endOffset", endOffset);
if (addMatchText)
tag.add("matchText", inputString.substring(startOffset, endOffset));
//below caches, and also flags matchDocIdsBS
tag.add("ids", lookupSchemaDocIds(docIdsKey));
tags.add(tag);
}
Map