/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.LongSummaryStatistics;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A component that can be used in isolation, or in conjunction with {@link QueryComponent} to
* identify & score "phrases" found in the input string, based on shingles in indexed fields.
*
* <p>The most common way to use this component is in conjunction with fields that use {@link
* ShingleFilterFactory} on both the index and query analyzers. An example
* field type configuration would be something like this...
*
* <pre>
* <fieldType name="phrases" class="solr.TextField" positionIncrementGap="100">
*   <analyzer type="index">
*     <tokenizer class="solr.StandardTokenizerFactory"/>
*     <filter class="solr.LowerCaseFilterFactory"/>
*     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
*   </analyzer>
*   <analyzer type="query">
*     <tokenizer class="solr.StandardTokenizerFactory"/>
*     <filter class="solr.LowerCaseFilterFactory"/>
*     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
*   </analyzer>
* </fieldType>
* </pre>
*
* <p>...where the query analyzer's maxShingleSize="7" determines the
* maximum possible phrase length that can be heuristically deduced, and the index
* analyzer's maxShingleSize="3" determines the accuracy of phrases identified. The
* larger the indexed maxShingleSize, the higher the accuracy. Both analyzers must
* include minShingleSize="2" outputUnigrams="true".
*
* <p>With a field type like this, one or more fields can be specified (with weights) via a
* phrases.fields param to request that this component identify possible phrases in the input
* q param, or an alternative phrases.q override param. The identified
* phrases will include their scores relative to each field specified, as well as an overall
* weighted score based on the field weights provided by the client. Higher score values
* indicate a greater confidence in the Phrase.
*
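* <p>For example, a (hypothetical) request using the main query string as its input, and
* weighting matches in a field named title twice as heavily as matches in a field named
* body, might look like:
*
* <pre>
* q=why are plants green&phrases=true&phrases.fields=title^2 body
* </pre>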
*
* <p>NOTE: In a distributed request, this component uses a single phase (piggy-backing on
* the {@link ShardRequest#PURPOSE_GET_TOP_IDS} generated by {@link QueryComponent} if it is in use)
* to collect all field & shingle stats. No "refinement" requests are used.
*
* @lucene.experimental
*/
public class PhrasesIdentificationComponent extends SearchComponent {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* The only shard purpose that will cause this component to do work & return data during a shard
* request
*/
public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS;
/**
* Name, also used as a request param to identify whether the user query concerns this component
*/
public static final String COMPONENT_NAME = "phrases";
// TODO: ideally these should live in a commons.params class?
public static final String PHRASE_INPUT = "phrases.q";
public static final String PHRASE_FIELDS = "phrases.fields";
public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field";
public static final String PHRASE_SUMMARY_PRE = "phrases.pre";
public static final String PHRASE_SUMMARY_POST = "phrases.post";
public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index";
public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query";
@Override
public void prepare(ResponseBuilder rb) throws IOException {
final SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
return;
}
if (params.getBool(ShardParams.IS_SHARD, false)) {
// only one stage/purpose where we should do any work on a shard
if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) {
return;
}
}
// if we're still here, then we should parse & validate our input,
// putting it in the request context so our process method knows it should do work
rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req));
}
@Override
public int distributedProcess(ResponseBuilder rb) {
final PhrasesContextData contextData =
(PhrasesContextData) rb.req.getContext().get(this.getClass());
if (null == contextData) {
// if prepare didn't give us anything to work with, then we should do nothing
return ResponseBuilder.STAGE_DONE;
}
if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
return ResponseBuilder.STAGE_EXECUTE_QUERY;
} else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
// if we're being used in conjunction with QueryComponent, it should have already created
// (in this stage) the only ShardRequest we need...
for (ShardRequest sreq : rb.outgoing) {
if (0 != (SHARD_PURPOSE & sreq.purpose)) {
return ResponseBuilder.STAGE_GET_FIELDS;
}
}
// ...if we can't find it, then evidently we're being used in isolation,
// and we need to create our own ShardRequest...
ShardRequest sreq = new ShardRequest();
sreq.purpose = SHARD_PURPOSE;
sreq.params = new ModifiableSolrParams(rb.req.getParams());
sreq.params.remove(ShardParams.SHARDS);
rb.addRequest(this, sreq);
return ResponseBuilder.STAGE_GET_FIELDS;
} else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
// NOTE: we don't do any actual work in this stage, but we need to ensure that even if we are
// being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS.
// so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results
// (w/o needing extra code paths for merging phrase results when QueryComponent is/is not
// used)
return ResponseBuilder.STAGE_DONE;
}
return ResponseBuilder.STAGE_DONE;
}
@Override
public void finishStage(ResponseBuilder rb) {
// NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with
// QueryComponent, we don't want to add our results to the response until *after*
// QueryComponent adds the main DocList
final PhrasesContextData contextData =
(PhrasesContextData) rb.req.getContext().get(this.getClass());
if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
// if prepare didn't give us anything to work with, or this isn't our stage, then do nothing
return;
}
// sanity check: the shard requests we use/piggy-back on should only happen once per shard,
// but let's future proof ourselves against the possibility that some shards might get/respond
// to the same request "purpose" multiple times...
final BitSet shardsHandled = new BitSet(rb.shards.length);
// Collect Shard responses
for (ShardRequest sreq : rb.finished) {
if (0 != (sreq.purpose & SHARD_PURPOSE)) {
for (ShardResponse shardRsp : sreq.responses) {
final int shardNum = rb.getShardNum(shardRsp.getShard());
if (!shardsHandled.get(shardNum)) {
shardsHandled.set(shardNum);
// shards.tolerant=true can cause nulls on exceptions/errors
// if we don't get phrases/stats from a shard, just ignore that shard
final SolrResponse rsp = shardRsp.getSolrResponse();
if (null == rsp) continue;
final NamedList<Object> top = rsp.getResponse();
if (null == top) continue;
@SuppressWarnings({"unchecked"})
final NamedList<Object> phrasesWrapper = (NamedList<Object>) top.get("phrases");
if (null == phrasesWrapper) continue;
@SuppressWarnings({"unchecked"})
final List<NamedList<Object>> shardPhrases =
(List<NamedList<Object>>) phrasesWrapper.get("_all");
if (null == shardPhrases) continue;
Phrase.populateStats(contextData.allPhrases, shardPhrases);
}
}
}
}
scoreAndAddResultsToResponse(rb, contextData);
}
@Override
public void process(ResponseBuilder rb) throws IOException {
final PhrasesContextData contextData =
(PhrasesContextData) rb.req.getContext().get(this.getClass());
if (null == contextData) {
// if prepare didn't give us anything to work with, then we should do nothing
return;
}
// regardless of single node / shard, we need local stats...
Phrase.populateStats(
contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher());
if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false)) {
// shard request, return stats for all phrases (in original order)
SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
output.add("_all", Phrase.formatShardResponse(contextData.allPhrases));
// TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each
// field... so that we can sum/merge them for use in scoring?
rb.rsp.add("phrases", output);
} else {
// full single node request...
scoreAndAddResultsToResponse(rb, contextData);
}
}
/**
* Helper method (suitable for both single node & distributed coordinator node) to score,
* sort, and format the end user response once all phrases have been populated with stats.
*/
private void scoreAndAddResultsToResponse(
final ResponseBuilder rb, final PhrasesContextData contextData) {
assert null != contextData : "Should not be called if no phrase data to use";
if (null == contextData) {
// if prepare didn't give us anything to work with, then we should do nothing
return;
}
SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
rb.rsp.add("phrases", output);
output.add("input", contextData.rawInput);
if (0 == contextData.allPhrases.size()) {
// w/o any phrases, the summary is just the input again...
output.add("summary", contextData.rawInput);
output.add("details", Collections.emptyList());
return;
}
Phrase.populateScores(contextData);
final int maxPosition =
contextData.allPhrases.get(contextData.allPhrases.size() - 1).getPositionEnd();
final List<Phrase> validScoringPhrasesSorted =
contextData.allPhrases.stream()
// TODO: ideally this cut off of "0.0" should be a request option...
// so users can tune how aggressive/conservative they want to be in finding phrases
// but for that to be useful, we need:
// - more hard & fast documentation about the "range" of scores that may be returned
// - "useful" scores for single words
.filter(p -> 0.0D < p.getTotalScore())
.sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder()))
.collect(Collectors.toList());
// we want to return only high scoring phrases that don't overlap w/higher scoring phrase
final BitSet positionsCovered = new BitSet(maxPosition + 1);
final List<Phrase> results = new ArrayList<>(maxPosition);
for (Phrase phrase : validScoringPhrasesSorted) {
final BitSet phrasePositions = phrase.getPositionsBitSet();
if (!phrasePositions.intersects(positionsCovered)) {
// we can use this phrase, record it...
positionsCovered.or(phrasePositions);
results.add(phrase);
} // else: overlaps higher scoring position(s), skip this phrase
if (positionsCovered.cardinality() == maxPosition + 1) {
// all positions are covered, so we can bail out and skip the rest
break;
}
}
// a "quick summary" of the suggested parsing
output.add("summary", contextData.summarize(results));
// useful user level info on every (high scoring) phrase found (in current, descending score,
// order)
output.add("details", results.stream().map(p -> p.getDetails()).collect(Collectors.toList()));
}
@Override
public String getDescription() {
return "Phrases Identification Component";
}
/**
* Simple container for all request options and data this component needs to store in the Request
* Context
*
* @lucene.internal
*/
public static final class PhrasesContextData {
public final String rawInput;
public final int maxIndexedPositionLength;
public final int maxQueryPositionLength;
public final Map<String, Double> fieldWeights;
public final SchemaField analysisField;
public final List<Phrase> allPhrases;
public final String summaryPre;
public final String summaryPost;
// TODO: add an option to bias field weights based on sumTTF of the fields
// (easy enough to "sum the sums" across multiple shards before scoring)
/**
* Parses the params included in this request, throwing appropriate user level Exceptions for
* invalid input, and returning a PhrasesContextData suitable for use in this request.
*/
public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req)
throws SolrException {
return new PhrasesContextData(req);
}
private PhrasesContextData(final SolrQueryRequest req) throws SolrException {
final SolrParams params = req.getParams();
this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q));
if (null == this.rawInput) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"phrase identification requires a query string or " + PHRASE_INPUT + " param override");
}
{ // field weights & analysis field...
SchemaField tmpAnalysisField = null;
Map<String, Double> tmpWeights = new TreeMap<>();
final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD);
if (null != analysisFieldName) {
tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName);
if (null == tmpAnalysisField) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_ANALYSIS_FIELD
+ " param specifies a field name that does not exist: "
+ analysisFieldName);
}
}
final Map<String, Float> rawFields =
SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS));
if (rawFields.isEmpty()) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_FIELDS
+ " param must specify a (weighted) list of fields "
+ "to evaluate for phrase identification");
}
for (Map.Entry<String, Float> entry : rawFields.entrySet()) {
final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey());
if (null == field) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_FIELDS
+ " param contains a field name that does not exist: "
+ entry.getKey());
}
if (null == tmpAnalysisField) {
tmpAnalysisField = field;
}
if (null == analysisFieldName) {
if (!field.getType().equals(tmpAnalysisField.getType())) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"All fields specified in "
+ PHRASE_FIELDS
+ " must have the same fieldType, "
+ "or the advanced "
+ PHRASE_ANALYSIS_FIELD
+ " option must specify an override");
}
}
// if a weight isn't specified, assume "1.0"
final double weight = null == entry.getValue() ? 1.0D : entry.getValue();
if (weight < 0) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_FIELDS
+ " param must use non-negative weight value for field "
+ field.getName());
}
tmpWeights.put(entry.getKey(), weight);
}
assert null != tmpAnalysisField;
this.analysisField = tmpAnalysisField;
this.fieldWeights = Collections.unmodifiableMap(tmpWeights);
}
{ // index/query max phrase sizes...
final FieldType ft = analysisField.getType();
this.maxIndexedPositionLength =
req.getParams().getInt(PHRASE_INDEX_MAXLEN, getMaxShingleSize(ft.getIndexAnalyzer()));
if (this.maxIndexedPositionLength < 0) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Unable to determine max position length of indexed phrases using "
+ "index analyzer for analysis field: "
+ analysisField.getName()
+ " and no override detected using param: "
+ PHRASE_INDEX_MAXLEN);
}
this.maxQueryPositionLength =
req.getParams().getInt(PHRASE_QUERY_MAXLEN, getMaxShingleSize(ft.getQueryAnalyzer()));
if (this.maxQueryPositionLength < 0) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Unable to determine max position length of query phrases using "
+ "query analyzer for analysis field: "
+ analysisField.getName()
+ " and no override detected using param: "
+ PHRASE_QUERY_MAXLEN);
}
if (this.maxQueryPositionLength < this.maxIndexedPositionLength) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Effective value of "
+ PHRASE_INDEX_MAXLEN
+ " (either from index analyzer shingle factory, "
+ " or expert param override) must be less then or equal to the effective value of "
+ PHRASE_QUERY_MAXLEN
+ " (either from query analyzer shingle factory, or expert param override)");
}
}
this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{");
this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}");
this.allPhrases =
Phrase.extractPhrases(
this.rawInput,
this.analysisField,
this.maxIndexedPositionLength,
this.maxQueryPositionLength);
}
/**
* Given a list of phrases to be returned to the user, summarizes those phrases by decorating
* the original input string to indicate where the identified phrases exist, using {@link
* #summaryPre} and {@link #summaryPost}
*
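* <p>For example (a hypothetical result): with the default summaryPre of "{" and summaryPost
* of "}", an input of "why are plants green" where "plants green" was identified as a phrase
* would be summarized as "why are {plants green}"
*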
* @param results a list of (non overlapping) Phrases that have been identified, sorted from
* highest scoring to lowest
* @return the original user input, decorated to indicate the identified phrases
*/
public String summarize(final List<Phrase> results) {
final StringBuilder out = new StringBuilder(rawInput);
// sort by *reverse* position so we can go back to front
final List<Phrase> reversed =
results.stream()
.sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder()))
.collect(Collectors.toList());
for (Phrase p : reversed) {
out.insert(p.getOffsetEnd(), summaryPost);
out.insert(p.getOffsetStart(), summaryPre);
}
return out.toString();
}
}
/**
* Model the data known about a single (candidate) Phrase -- which may or may not be indexed
*
* @lucene.internal
*/
public static final class Phrase {
/**
* Factory method for constructing a list of Phrases given the specified input and using the
* analyzer for the specified field. The maxIndexedPositionLength and
* maxQueryPositionLength provided *must* match the effective values used by the respective
* analyzers.
*/
public static List<Phrase> extractPhrases(
final String input,
final SchemaField analysisField,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) {
// TODO: rather than requiring the query analyzer to produce the Phrases for us (assuming
// Shingles), we could potentially just require that it produces unigrams compatible with the
// unigrams in the indexed fields, and then build our own Phrases at query time -- making the
// maxQueryPositionLength a 100% run time configuration option. But that could be tricky given
// an arbitrary analyzer -- we'd have to pay careful attention to positions, and we'd have to
// guess/assume what placeholders/fillers were used in the indexed Phrases (typically shingles)
assert maxIndexedPositionLength <= maxQueryPositionLength;
final CharsRefBuilder buffer = new CharsRefBuilder();
final FieldType ft = analysisField.getType();
final Analyzer analyzer = ft.getQueryAnalyzer();
final List<Phrase> results = new ArrayList<>(42);
try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) {
final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
final PositionIncrementAttribute posIncAttr =
tokenStream.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLenAttr =
tokenStream.addAttribute(PositionLengthAttribute.class);
final TermToBytesRefAttribute termAttr =
tokenStream.addAttribute(TermToBytesRefAttribute.class);
int position = 0;
int lastPosLen = -1;
tokenStream.reset();
while (tokenStream.incrementToken()) {
final Phrase phrase = new Phrase();
final int posInc = posIncAttr.getPositionIncrement();
final int posLen = posLenAttr.getPositionLength();
if (0 == posInc && posLen <= lastPosLen) {
// This requirement that analyzers return tokens in ascending order of length
// is currently necessary for the "linking" logic below to work
// if people run into real world situations where this is problematic,
// we can relax this check if we also make the linking logic more complex
// (ie: less optimized)
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Phrase identification currently requires that "
+ "the analyzer used must produce tokens that overlap in increasing order of length. ");
}
position += posInc;
lastPosLen = posLen;
phrase.position_start = position;
phrase.position_end = position + posLen;
phrase.is_indexed = (posLen <= maxIndexedPositionLength);
phrase.offset_start = offsetAttr.startOffset();
phrase.offset_end = offsetAttr.endOffset();
// populate the subsequence directly from the raw input using the offsets,
// (instead of using the TermToBytesRefAttribute) so we preserve the original
// casing, whitespace, etc...
phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end);
if (phrase.is_indexed) {
// populate the bytes so we can build term queries
phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef());
}
results.add(phrase);
}
tokenStream.end();
} catch (IOException e) {
throw new SolrException(
ErrorCode.SERVER_ERROR, "Analysis error extracting phrases from: " + input, e);
}
// fill in the relationships of each phrase
//
// NOTE: this logic currently requires that the phrases are sorted by position ascending
// (automatic because of how PositionIncrementAttribute works) then by length ascending
// (when positions are tied).
// We could de-optimize this code if we find that secondary ordering is too restrictive for
// some analyzers
//
// NOTE: changes to the scoring model may allow us to optimize/prune down the relationships
// tracked, ...OR... may require us to add/track more details about sub/parent phrases
//
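// For example (assuming a shingle analyzer w/ unigrams, as in the class javadoc): the input
// "tiny black ants" arrives here as "tiny" (pos 1, len 1), "tiny black" (pos 1, len 2),
// "tiny black ants" (pos 1, len 3), "black" (pos 2, len 1), "black ants" (pos 2, len 2),
// and "ants" (pos 3, len 1) -- position ascending, then length ascending within a position
//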
for (int p = 0; p < results.size(); p++) {
final Phrase current = results.get(p);
if (!current.is_indexed) {
// we're not an interesting sub phrase of anything
continue;
}
// setup links from the phrase to itself if needed
addLinkages(current, current, maxIndexedPositionLength);
// scan backwards looking for phrases that might include us...
BEFORE:
for (int i = p - 1; 0 <= i; i--) {
final Phrase previous = results.get(i);
if (previous.position_start < (current.position_end - maxQueryPositionLength)) {
// we've scanned so far back nothing else is viable
break BEFORE;
}
// any 'previous' phrases must start where current starts or earlier,
// so only need to check the end...
if (current.position_end <= previous.position_end) {
addLinkages(previous, current, maxIndexedPositionLength);
}
}
// scan forwards looking for phrases that might include us...
AFTER:
for (int i = p + 1; i < results.size(); i++) {
final Phrase next = results.get(i);
// the only way a phrase that comes after current can include current is
// if they have the same start position...
if (current.position_start != next.position_start) {
// we've scanned so far forward nothing else is viable
break AFTER;
}
// any 'next' phrases must start where current starts, so only need to check the end...
if (current.position_end <= next.position_end) {
addLinkages(next, current, maxIndexedPositionLength);
}
}
}
return Collections.unmodifiableList(results);
}
/**
* Given two phrases, one of which is a superset of the other, adds the necessary linkages
* needed by the scoring model
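*
* <p>For example (a hypothetical input): with maxIndexedPositionLength=3, the phrase
* "tiny black ants climbed" is too long to be indexed whole, so "tiny black ants" and
* "black ants climbed" become its largest indexed sub-phrases, and each of its unigrams
* ("tiny", "black", ...) become its individual indexed terms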
*/
private static void addLinkages(
final Phrase outer, final Phrase inner, final int maxIndexedPositionLength) {
assert outer.position_start <= inner.position_start;
assert inner.position_end <= outer.position_end;
assert inner.is_indexed;
final int inner_len = inner.getPositionLength();
if (1 == inner_len) {
outer.individualIndexedTerms.add(inner);
}
if (maxIndexedPositionLength == inner_len
|| (inner == outer && inner_len < maxIndexedPositionLength)) {
outer.largestIndexedSubPhrases.add(inner);
}
if (outer.is_indexed && inner != outer) {
inner.indexedSuperPhrases.add(outer);
}
}
/**
* Format the phrases suitable for returning in a shard response
*
* @see #populateStats(List,List)
*/
public static List<NamedList<Object>> formatShardResponse(final List<Phrase> phrases) {
List<NamedList<Object>> results = new ArrayList<>(phrases.size());
for (Phrase p : phrases) {
NamedList<Object> data = new SimpleOrderedMap<>();
// quick and dirty way to validate that our shards aren't using different analyzers
// so the coordinating node can fail fast when merging the results
data.add("checksum", p.getChecksum());
if (p.is_indexed) {
data.add("ttf", new NamedList(p.phrase_ttf));
data.add("df", new NamedList(p.phrase_df));
}
data.add("conj_dc", new NamedList(p.subTerms_conjunctionCounts));
results.add(data);
}
return results;
}
/**
* Populates the phrases with (merged) stats from a remote shard
*
* @see #formatShardResponse
*/
@SuppressWarnings({"unchecked"})
public static void populateStats(
final List<Phrase> phrases, final List<NamedList<Object>> shardData) {
final int numPhrases = phrases.size();
if (shardData.size() != numPhrases) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"num phrases in shard data not consistent: " + numPhrases + " vs " + shardData.size());
}
for (int i = 0; i < phrases.size(); i++) {
// rather than being paranoid about the expected structure, we'll just let the low level
// code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later...
try {
final Phrase p = phrases.get(i);
final NamedList<Object> data = shardData.get(i);
// sanity check the correct phrase
if (!p.getChecksum().equals(data.get("checksum"))) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"phrase #" + i + " in shard data had invalid checksum");
}
if (p.is_indexed) {
for (Map.Entry<String, Long> ttf : (NamedList<Long>) data.get("ttf")) {
p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum);
}
for (Map.Entry<String, Long> df : (NamedList<Long>) data.get("df")) {
p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum);
}
}
for (Map.Entry<String, Long> conj_dc : (NamedList<Long>) data.get("conj_dc")) {
p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum);
}
} catch (RuntimeException e) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"shard data for phrase#" + i + " not consistent",
e);
}
}
}
/** Populates the phrases with stats from the local index for the specified fields */
public static void populateStats(
final List<Phrase> phrases,
final Collection<String> fieldNames,
final SolrIndexSearcher searcher)
throws IOException {
final IndexReader reader = searcher.getIndexReader();
for (String field : fieldNames) {
for (Phrase phrase : phrases) {
if (phrase.is_indexed) {
// add stats based on this entire phrase as an indexed term
final Term t = new Term(field, phrase.bytes);
phrase.phrase_ttf.put(field, reader.totalTermFreq(t));
phrase.phrase_df.put(field, (long) reader.docFreq(t));
}
// even if our phrase is too long to be indexed whole, add stats based on the
// conjunction of all the individual terms in the phrase
List<Query> filters = new ArrayList<>(phrase.individualIndexedTerms.size());
for (Phrase term : phrase.individualIndexedTerms) {
// trust the SolrIndexSearcher to cache & intersect the individual terms so that this
// can be efficient regardless of how often terms are re-used multiple times in the
// input/phrases
filters.add(new TermQuery(new Term(field, term.bytes)));
}
final long count = searcher.getDocSet(filters).size();
phrase.subTerms_conjunctionCounts.put(field, count);
}
}
}
/**
* Uses the previously populated stats to populate each Phrase with its scores for the specified
* fields, and its overall (weighted) total score. This is not needed on shard requests.
*
* @see #populateStats
* @see #getFieldScore(String)
* @see #getTotalScore
*/
public static void populateScores(final PhrasesContextData contextData) {
populateScores(
contextData.allPhrases,
contextData.fieldWeights,
contextData.maxIndexedPositionLength,
contextData.maxQueryPositionLength);
}
/**
* Public for testing purposes
*
* @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData)
* @lucene.internal
*/
public static void populateScores(
final List<Phrase> phrases,
final Map<String, Double> fieldWeights,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) {
final double total_weight =
fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum();
for (Phrase phrase : phrases) {
double phrase_cumulative_score = 0.0D;
for (Map.Entry<String, Double> entry : fieldWeights.entrySet()) {
final String field = entry.getKey();
final double weight = entry.getValue();
double field_score =
computeFieldScore(
phrase, field,
maxIndexedPositionLength, maxQueryPositionLength);
phrase.fieldScores.put(field, field_score);
phrase_cumulative_score += (field_score * weight);
}
phrase.total_score =
(total_weight < 0
? Double.NEGATIVE_INFINITY
: (phrase_cumulative_score / total_weight));
}
}
private Phrase() {
// No-Op
}
private boolean is_indexed;
private double total_score = -1.0D; // until we get a computed score, this is "not a phrase"
private CharSequence subSequence;
private BytesRef bytes;
private int offset_start;
private int offset_end;
private int position_start;
private int position_end;
private Integer checksum = null;
/** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */
private final List<Phrase> individualIndexedTerms = new ArrayList<>(7);
/**
* NOTE: Indexed phrases of length less than the max indexed length are the (sole) largest
* sub-phrases of themselves
*/
private final List<Phrase> largestIndexedSubPhrases = new ArrayList<>(7);
/** Phrases larger than this phrase which are indexed and fully contain it */
private final List<Phrase> indexedSuperPhrases = new ArrayList<>(7);
// NOTE: keys are field names
private final Map<String, Long> subTerms_conjunctionCounts = new TreeMap<>();
private final Map<String, Long> phrase_ttf = new TreeMap<>();
private final Map<String, Long> phrase_df = new TreeMap<>();
private final Map<String, Double> fieldScores = new TreeMap<>();
@Override
public String toString() {
return "'"
+ subSequence
+ "'"
+ "["
+ offset_start
+ ":"
+ offset_end
+ "]"
+ "["
+ position_start
+ ":"
+ position_end
+ "]";
}
public NamedList<Object> getDetails() {
SimpleOrderedMap<Object> out = new SimpleOrderedMap<>();
out.add("text", subSequence);
out.add("offset_start", getOffsetStart());
out.add("offset_end", getOffsetEnd());
out.add("score", getTotalScore());
out.add("field_scores", fieldScores);
return out;
}
/**
* Computes & caches the checksum of this Phrase (if not already cached). Needed only when
* merging shard data to validate no inconsistencies with the remote shards
*/
private Integer getChecksum() {
if (null == checksum) {
checksum =
Arrays.hashCode(new int[] {offset_start, offset_end, position_start, position_end});
}
return checksum;
}
/** The characters from the original input that correspond with this Phrase */
public CharSequence getSubSequence() {
return subSequence;
}
/**
* Returns the list of "individual" (ie: getPositionLength()==1
terms. NOTE:
* Indexed phrases of length 1 are the (sole) individual terms of themselves
*/
public List<Phrase> getIndividualIndexedTerms() {
return individualIndexedTerms;
}
/**
* Returns the list of (overlapping) sub phrases that have the largest possible size based on
* the effective value of {@link PhrasesContextData#maxIndexedPositionLength}. NOTE: Indexed
* phrases of length less than the max indexed length are the (sole) largest sub-phrases of
* themselves.
*/
public List<Phrase> getLargestIndexedSubPhrases() {
return largestIndexedSubPhrases;
}
/**
* Returns all phrases larger than this phrase, which fully include this phrase, and are
* indexed. NOTE: A Phrase is never the super phrase of itself.
*/
public List<Phrase> getIndexedSuperPhrases() {
return indexedSuperPhrases;
}
/** NOTE: positions start at '1' */
public int getPositionStart() {
return position_start;
}
/** NOTE: positions start at '1' */
public int getPositionEnd() {
return position_end;
}
public int getPositionLength() {
return position_end - position_start;
}
/** Each set bit identifies a position filled by this Phrase */
public BitSet getPositionsBitSet() {
final BitSet result = new BitSet();
result.set(position_start, position_end);
return result;
}
public int getOffsetStart() {
return offset_start;
}
public int getOffsetEnd() {
return offset_end;
}
/**
* Returns the overall score for this Phrase. In the current implementation, the only guarantee
* made regarding the range of possible values is that 0 (or less) means it is not a good
* phrase.
*
* @return A numeric value indicating the confidence in this Phrase, higher numbers are higher
* confidence.
*/
public double getTotalScore() {
return total_score;
}
/**
* Returns the score for this Phrase in this given field. In the current implementation, the
* only guarantee made regarding the range of possible values is that 0 (or less) means it is not
* a good phrase.
*
* @return A numeric value indicating the confidence in this Phrase for this field, higher
* numbers are higher confidence.
*/
public double getFieldScore(String field) {
return fieldScores.getOrDefault(field, -1.0D);
}
/**
* Returns the total TTF (total term frequency) of this (indexed) Phrase as a term in the
* specified field. NOTE: behavior of calling this method is undefined unless one of the {@link
* #populateStats} methods has been called with this field.
*/
public long getTTF(String field) {
if (!is_indexed) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "TTF is only available for indexed phrases");
}
return phrase_ttf.getOrDefault(field, 0L);
}
/**
* Returns the number of documents that contain all of the {@link
* #getIndividualIndexedTerms} that make up this Phrase, in the specified field. NOTE: behavior
* of calling this method is undefined unless one of the {@link #populateStats} methods has been
* called with this field.
*/
public long getConjunctionDocCount(String field) {
return subTerms_conjunctionCounts.getOrDefault(field, 0L);
}
/**
* Returns the number of documents that contain this (indexed) Phrase as a term in the
* specified field. NOTE: behavior of calling this method is undefined unless one of the {@link
* #populateStats} methods has been called with this field.
*/
public long getDocFreq(String field) {
if (!is_indexed) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "DF is only available for indexed phrases");
}
return phrase_df.getOrDefault(field, 0L);
}
/**
* Uses the previously populated stats to compute a score for the specified field.
*
* <p>The current implementation returns scores in the range of [0,1], but this may
* change in future implementations. The only current guarantees are:
*
* <ul>
* <li>0 (or less) means this is guaranteed to not be a phrase
* <li>larger numbers are higher confidence
* </ul>
*
* @see #populateStats
* @see #populateScores
* @see #getFieldScore(String)
* @return a score value
*/
private static double computeFieldScore(
final Phrase input,
final String field,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) {
final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size();
assert 0 <= num_indexed_sub_phrases; // should be impossible
if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) {
// there are "gaps" in our input, where individual words have not been indexed (stop words,
// or multivalue position gap) which means we are not a viable candidate for being a valid
// Phrase.
return -1.0D;
}
final long phrase_conj_count = input.getConjunctionDocCount(field);
// if there isn't a single document containing all the terms in our
// phrase, then it is 100% not a phrase
if (phrase_conj_count <= 0) {
return -1.0D;
}
// single words automatically score 0.0 (unless they already scored less for not existing)
if (input.getPositionLength() <= 1) {
return 0.0D;
}
double field_score = 0.0D;
long max_sub_conj_count = phrase_conj_count;
// At the moment, the contribution of each "words" sub-Phrase to the field score to the input
// Phrase is independent of any context of "input". Depending on if/how sub-phrase scoring
// changes, we might consider computing the scores of all the indexed phrases first, and
// caching the portions of their values that are re-used when computing the scores of
// longer phrases?
//
// This would make the overall scoring of all phrases a lot more complicated,
// but could save CPU cycles?
// (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???)
//
// My gut says that knowing the conj_count(input) "context" should help us score the
// sub-phrases better, but i can't yet put my finger on why/how. maybe by comparing
// the conj_count(input) to the max(conj_count(parent of words)) ?
// for each of the longest indexed phrases, aka indexed sub-sequence of "words", we have...
for (Phrase words : input.getLargestIndexedSubPhrases()) {
// we're going to compute scores in range of [-1:1] to indicate the likelihood that our
// "words" should be used as a "phrase", based on a bayesian document categorization model,
// where the "words as a phrase" (aka: phrase) is our candidate category.
//
// P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase)
//
// Where...
// P(words|phrase) = phrase_ttf / min(word_ttf)
// P(phrase) =~ phrase_docFreq / conj_count(words in phrase) *SEE NOTE BELOW*
// P(words|not phrase) = phrase_ttf / max(word_ttf)
// P(not a phrase) = 1 - P(phrase)
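//
// For example (hypothetical stats): if ttf("brown fox") = 10, ttf("brown") = 50,
// ttf("fox") = 20, df("brown fox") = 8, and 16 docs contain both "brown" and "fox", then:
// P(words|phrase) = 10 / min(50, 20) = 0.50
// P(words|not phrase) = 10 / max(50, 20) = 0.20
// P(phrase) = 8 / 16 = 0.50
// ...so (ignoring any wrapper phrases) the contribution of "brown fox" to the field
// score would be 0.5 * 0.5 - 0.2 * (1 - 0.5) = 0.15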
//
// ... BUT! ...
//
// NOTE: we're going to reduce our "P(phrase) by the max "P(phrase)" of all the (indexed)
// candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor
// of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase
// exists
// IDEA: consider replacing this entire bayesian model with LLR (or rootLLR)...
//
// http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html
// ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each
// indexed phrase and take the min|max|avg of the LLR scores.
//
// ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) &
// LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurance
// count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else"
//
// (we could actually compute LLR stats over TTF and DF and combine them)
//
// NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed)
// sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to
// an existing phrase). As well as require us to give up on a predictable "range" of
// legal values for scores (IIUC from the LLR docs)
final long phrase_ttf = words.getTTF(field);
final long phrase_df = words.getDocFreq(field);
final long words_conj_count = words.getConjunctionDocCount(field);
max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count);
final double max_wrapper_phrase_probability =
words.getIndexedSuperPhrases().stream()
.mapToDouble(
p ->
p.getConjunctionDocCount(field) <= 0
?
// special case check -- we already know *our* conj count > 0,
// but we need a similar check for wrapper phrases: if <= 0, their
// probability is 0
0.0D
: ((double) p.getDocFreq(field) / p.getConjunctionDocCount(field)))
.max()
.orElse(0.0D);
final LongSummaryStatistics words_ttfs =
words.getIndividualIndexedTerms().stream()
.collect(Collectors.summarizingLong(t -> t.getTTF(field)));
final double words_phrase_prob = (phrase_ttf / (double) words_ttfs.getMin());
final double words_not_phrase_prob = (phrase_ttf / (double) words_ttfs.getMax());
final double phrase_prob = (phrase_conj_count / (double) words_conj_count);
final double phrase_score =
words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability);
final double not_phrase_score =
words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability));
final double words_score = phrase_score - not_phrase_score;
field_score += words_score;
}
// NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned
// value) when they should ideally be penalizing the scores further, but since we currently
// don't care about any score lower then 0, it's not worth worrying about.
// Average the accumulated score over the number of actual indexed sub-phrases that
// contributed
//
// NOTE: since we subsequently want to multiply the score by a fraction with
// num_indexed_sub_phrases in the numerator, we can skip this... SEE BELOW // field_score /=
// (double) num_indexed_sub_phrases;
// If we leave field_score as is, then a phrase longer than the maxIndexedPositionLength will
// never score higher than the highest scoring sub-phrase it has (because we've averaged them)
// so we scale the scores against the longest possible phrase length we're considering
//
// NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when
// averaging above...
field_score *=
(1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases )
/ (1 + maxQueryPositionLength - maxIndexedPositionLength));
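// (e.g.: with the example analyzers from the class javadoc, maxIndexedPositionLength=3 and
// maxQueryPositionLength=7, so each field_score is scaled down by 1 / (1 + 7 - 3) = 0.2)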
// scale the field_score based on the ratio of the conjunction docCount for the whole phrase
// relative to the largest conjunction docCount of its (largest indexed) sub phrases, to
// penalize the scores of very long phrases that exist very rarely relative to how often
// their sub phrases exist in the index
field_score *= (((double) phrase_conj_count) / max_sub_conj_count);
return field_score;
}
}
/**
* Helper method, public for testing purposes only.
*
* <p>Given an analyzer, inspects it to determine if:
*
* <ul>
* <li>it is a {@link TokenizerChain}
* <li>it contains exactly one instance of {@link ShingleFilterFactory}
* </ul>
*
* <p>If these conditions are met, then this method returns the maxShingleSize
* in effect for this analyzer, otherwise returns -1.
*
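* <p>For example, given the "phrases" field type shown in the class javadoc, this method
* would return 3 for the index analyzer and 7 for the query analyzer.
*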
* @param analyzer An analyzer to inspect
* @return the maxShingleSize if available, otherwise -1
* @lucene.internal
*/
public static int getMaxShingleSize(Analyzer analyzer) {
if (!TokenizerChain.class.isInstance(analyzer)) {
return -1;
}
final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
if (0 == factories.length) {
return -1;
}
int result = -1;
for (TokenFilterFactory tff : factories) {
if (ShingleFilterFactory.class.isInstance(tff)) {
if (0 < result) {
// more than one shingle factory in our analyzer, which is weird, so make no
// assumptions...
return -1;
}
// would be nice if there was an easy way to just ask a factory for the effective value
// of an argument...
final Map<String, String> args = tff.getOriginalArgs();
result =
args.containsKey("maxShingleSize")
? Integer.parseInt(args.get("maxShingleSize"))
: ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
}
}
return result;
}
}