/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.LongSummaryStatistics;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.util.SolrPluginUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * A component that can be used in isolation, or in conjunction with {@link QueryComponent} to identify
 * &amp; score "phrases" found in the input string, based on shingles in indexed fields.
 *
 * <p>
 * The most common way to use this component is in conjunction with fields that use
 * {@link ShingleFilterFactory} on both the <code>index</code> and <code>query</code> analyzers.
 * An example field type configuration would be something like this...
 * </p>
 * <pre class="prettyprint">
 * &lt;fieldType name="phrases" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer type="index"&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/&gt;
 *   &lt;/analyzer&gt;
 *   &lt;analyzer type="query"&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;
 * </pre>
 * <p>
 * ...where the query analyzer's <code>maxShingleSize="7"</code> determines the maximum
 * possible phrase length that can be heuristically deduced, and the index analyzer's
 * <code>maxShingleSize="3"</code> determines the accuracy of the phrases identified.  The larger
 * the indexed <code>maxShingleSize</code>, the higher the accuracy.  Both analyzers must include
 * <code>minShingleSize="2" outputUnigrams="true"</code>.
 * </p>
 * <p>
 * With a field type like this, one or more fields can be specified (with weights) via a
 * <code>phrases.fields</code> param to request that this component identify possible phrases in the
 * input <code>q</code> param, or an alternative <code>phrases.q</code> override param.  The identified
 * phrases will include their scores relative to each field specified, as well as an overall
 * weighted score based on the field weights provided by the client.  Higher score values indicate
 * a greater confidence in the Phrase.
 * </p>
 * <p>
 * NOTE: In a distributed request, this component uses a single phase (piggybacking on the
 * {@link ShardRequest#PURPOSE_GET_TOP_IDS} generated by {@link QueryComponent} if it is in use) to
 * collect all field &amp; shingle stats.  No "refinement" requests are used.
 * </p>
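 * <p>
 * For illustration (a hypothetical request; exact output depends on the index contents), a
 * request such as <code>q=brown+fox&amp;phrases=true&amp;phrases.fields=title^2+body</code>
 * asks this component to identify phrases in the input using the <code>title</code> field
 * (weighted 2.0) and the <code>body</code> field (default weight 1.0).  The response will then
 * contain a <code>phrases</code> section listing the original input, a decorated summary, and
 * a details list of each identified phrase with its per-field and total scores.
 * </p>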
 *
 * @lucene.experimental
 */
public class PhrasesIdentificationComponent extends SearchComponent {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** The only shard purpose that will cause this component to do work &amp; return data during shard req */
  public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS;

  /** Name, also used as a request param to identify whether the user query concerns this component */
  public static final String COMPONENT_NAME = "phrases";

  // TODO: ideally these should live in a commons.params class?
  public static final String PHRASE_INPUT = "phrases.q";
  public static final String PHRASE_FIELDS = "phrases.fields";
  public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field";
  public static final String PHRASE_SUMMARY_PRE = "phrases.pre";
  public static final String PHRASE_SUMMARY_POST = "phrases.post";
  public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index";
  public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query";

  @Override
  public void prepare(ResponseBuilder rb) throws IOException {
    final SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }
    if (params.getBool(ShardParams.IS_SHARD, false)) {
      // only one stage/purpose where we should do any work on a shard
      if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) {
        return;
      }
    }

    // if we're still here, then we should parse & validate our input,
    // putting it in the request context so our process method knows it should do work
    rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req));
  }

  @Override
  public int distributedProcess(ResponseBuilder rb) {
    final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
    if (null == contextData) {
      // if prepare didn't give us anything to work with, then we should do nothing
      return ResponseBuilder.STAGE_DONE;
    }

    if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
      return ResponseBuilder.STAGE_EXECUTE_QUERY;
    } else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
      // if we're being used in conjunction with QueryComponent, it should have already created
      // (in this stage) the only ShardRequest we need...
      for (ShardRequest sreq : rb.outgoing) {
        if (0 != (SHARD_PURPOSE & sreq.purpose)) {
          return ResponseBuilder.STAGE_GET_FIELDS;
        }
      }
      // ...if we can't find it, then evidently we're being used in isolation,
      // and we need to create our own ShardRequest...
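      // (we clone the original request params, but strip the "shards" param so the
      // per-shard sub-requests don't try to re-distribute themselves)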
      ShardRequest sreq = new ShardRequest();
      sreq.purpose = SHARD_PURPOSE;
      sreq.params = new ModifiableSolrParams(rb.req.getParams());
      sreq.params.remove(ShardParams.SHARDS);
      rb.addRequest(this, sreq);
      return ResponseBuilder.STAGE_GET_FIELDS;

    } else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
      // NOTE: we don't do any actual work in this stage, but we need to ensure that even if
      // we are being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS
      // so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results
      // (w/o needing extra code paths for merging phrase results when QueryComponent is/is not used)
      return ResponseBuilder.STAGE_DONE;
    }
    return ResponseBuilder.STAGE_DONE;
  }

  @Override
  public void finishStage(ResponseBuilder rb) {
    // NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with
    // QueryComponent, we don't want to add our results to the response until *after*
    // QueryComponent adds the main DocList

    final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
    if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
      // if prepare didn't give us anything to work with, or this isn't our stage, then do nothing
      return;
    }

    // sanity check: the shard requests we use/piggy-back on should only happen once per shard,
    // but let's future proof ourselves against the possibility that some shards might get/respond
    // to the same request "purpose" multiple times...
    final BitSet shardsHandled = new BitSet(rb.shards.length);

    // Collect Shard responses
    for (ShardRequest sreq : rb.finished) {
      if (0 != (sreq.purpose & SHARD_PURPOSE)) {
        for (ShardResponse shardRsp : sreq.responses) {
          final int shardNum = rb.getShardNum(shardRsp.getShard());
          if (! shardsHandled.get(shardNum)) {
            shardsHandled.set(shardNum);
            // shards.tolerant=true can cause nulls on exceptions/errors
            // if we don't get phrases/stats from a shard, just ignore that shard
            final SolrResponse rsp = shardRsp.getSolrResponse();
            if (null == rsp) continue;
            final NamedList<Object> top = rsp.getResponse();
            if (null == top) continue;
            @SuppressWarnings("unchecked")
            final NamedList<Object> phrasesWrapper = (NamedList<Object>) top.get("phrases");
            if (null == phrasesWrapper) continue;
            @SuppressWarnings("unchecked")
            final List<NamedList<Object>> shardPhrases = (List<NamedList<Object>>) phrasesWrapper.get("_all");
            if (null == shardPhrases) continue;

            Phrase.populateStats(contextData.allPhrases, shardPhrases);
          }
        }
      }
    }
    scoreAndAddResultsToResponse(rb, contextData);
  }

  @Override
  public void process(ResponseBuilder rb) throws IOException {
    final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
    if (null == contextData) {
      // if prepare didn't give us anything to work with, then we should do nothing
      return;
    }

    // regardless of single node / shard, we need local stats...
    Phrase.populateStats(contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher());

    if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false)) {
      // shard request, return stats for all phrases (in original order)
      SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
      output.add("_all", Phrase.formatShardResponse(contextData.allPhrases));
      // TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each field...
      // so that we can sum/merge them for use in scoring?
      rb.rsp.add("phrases", output);
    } else {
      // full single node request...
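      // (no distributed coordination needed: the local stats populated above are the
      // global stats, so we can score and format the final response immediately)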
      scoreAndAddResultsToResponse(rb, contextData);
    }
  }

  /**
   * Helper method (suitable for both single node &amp; distributed coordinator node) to
   * score, sort, and format the end user response once all phrases have been populated with stats.
   */
  private void scoreAndAddResultsToResponse(final ResponseBuilder rb, final PhrasesContextData contextData) {
    assert null != contextData : "Should not be called if no phrase data to use";
    if (null == contextData) {
      // if prepare didn't give us anything to work with, then we should do nothing
      return;
    }

    SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
    rb.rsp.add("phrases", output);
    output.add("input", contextData.rawInput);

    if (0 == contextData.allPhrases.size()) {
      // w/o any phrases, the summary is just the input again...
      output.add("summary", contextData.rawInput);
      output.add("details", Collections.emptyList());
      return;
    }

    Phrase.populateScores(contextData);
    final int maxPosition = contextData.allPhrases.get(contextData.allPhrases.size()-1).getPositionEnd();

    final List<Phrase> validScoringPhrasesSorted = contextData.allPhrases.stream()
      // TODO: ideally this cut off of "0.0" should be a request option...
      // so users can tune how aggressive/conservative they want to be in finding phrases
      // but for that to be useful, we need:
      //  - more hard & fast documentation about the "range" of scores that may be returned
      //  - "useful" scores for single words
      .filter(p -> 0.0D < p.getTotalScore())
      .sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder()))
      .collect(Collectors.toList());

    // we want to return only high scoring phrases that don't overlap w/higher scoring phrase
    final BitSet positionsCovered = new BitSet(maxPosition+1);
    final List<Phrase> results = new ArrayList<>(maxPosition);
    for (Phrase phrase : validScoringPhrasesSorted) {
      final BitSet phrasePositions = phrase.getPositionsBitSet();

      if (! phrasePositions.intersects(positionsCovered)) {
        // we can use this phrase, record it...
        positionsCovered.or(phrasePositions);
        results.add(phrase);
      } // else: overlaps higher scoring position(s), skip this phrase

      if (positionsCovered.cardinality() == maxPosition+1) {
        // all positions are covered, so we can bail out and skip the rest
        break;
      }
    }

    // a "quick summary" of the suggested parsing
    output.add("summary", contextData.summarize(results));
    // useful user level info on every (high scoring) phrase found (in current, descending score, order)
    output.add("details", results.stream()
               .map(p -> p.getDetails()).collect(Collectors.toList()));
  }

  @Override
  public String getDescription() {
    return "Phrases Identification Component";
  }

  /**
   * Simple container for all request options and data this component needs to store in the Request Context
   * @lucene.internal
   */
  public static final class PhrasesContextData {

    public final String rawInput;
    public final int maxIndexedPositionLength;
    public final int maxQueryPositionLength;
    public final Map<String,Double> fieldWeights;
    public final SchemaField analysisField;
    public final List<Phrase> allPhrases;
    public final String summaryPre;
    public final String summaryPost;

    // TODO: add an option to bias field weights based on sumTTF of the fields
    // (easy enough to "sum the sums" across multiple shards before scoring)

    /**
     * Parses the params included in this request, throwing appropriate user level
     * Exceptions for invalid input, and returning a PhrasesContextData
     * suitable for use in this request.
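     * <p>
     * For example (hypothetical param values): a request with
     * <code>phrases.fields=title^3 body</code> would produce a fieldWeights map of
     * <code>title=3.0</code> and <code>body=1.0</code> (the default weight when
     * none is specified).
     * </p>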
     */
    public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req) throws SolrException {
      return new PhrasesContextData(req);
    }
    private PhrasesContextData(final SolrQueryRequest req) throws SolrException {
      final SolrParams params = req.getParams();

      this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q));
      if (null == this.rawInput) {
        throw new SolrException(ErrorCode.BAD_REQUEST, "phrase identification requires a query string or "
                                + PHRASE_INPUT + " param override");
      }

      { // field weights & analysis field...

        SchemaField tmpAnalysisField = null;
        Map<String,Double> tmpWeights = new TreeMap<>();

        final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD);
        if (null != analysisFieldName) {
          tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName);
          if (null == tmpAnalysisField) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    PHRASE_ANALYSIS_FIELD + " param specifies a field name that does not exist: " +
                                    analysisFieldName);
          }
        }

        final Map<String,Float> rawFields = SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS));
        if (rawFields.isEmpty()) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
                                  PHRASE_FIELDS + " param must specify a (weighted) list of fields " +
                                  "to evaluate for phrase identification");
        }

        for (Map.Entry<String,Float> entry : rawFields.entrySet()) {
          final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey());
          if (null == field) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    PHRASE_FIELDS + " param contains a field name that does not exist: " +
                                    entry.getKey());
          }
          if (null == tmpAnalysisField) {
            tmpAnalysisField = field;
          }
          if (null == analysisFieldName) {
            if (! field.getType().equals(tmpAnalysisField.getType())) {
              throw new SolrException
                (ErrorCode.BAD_REQUEST,
                 "All fields specified in " + PHRASE_FIELDS + " must have the same fieldType, " +
                 "or the advanced " + PHRASE_ANALYSIS_FIELD + " option must specify an override");
            }
          }
          // if a weight isn't specified, assume "1.0"
          final double weight = null == entry.getValue() ? 1.0D : entry.getValue();
          if (weight < 0) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    PHRASE_FIELDS + " param must use non-negative weight value for field " +
                                    field.getName());
          }
          tmpWeights.put(entry.getKey(), weight);
        }
        assert null != tmpAnalysisField;

        this.analysisField = tmpAnalysisField;
        this.fieldWeights = Collections.unmodifiableMap(tmpWeights);
      }

      { // index/query max phrase sizes...
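        // NOTE: these effective lengths default to the maxShingleSize found in the respective
        // index/query analyzers (see getMaxShingleSize), but each may be overridden via the
        // expert level phrases.maxlength.index / phrases.maxlength.query params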
        final FieldType ft = analysisField.getType();
        this.maxIndexedPositionLength = req.getParams().getInt(PHRASE_INDEX_MAXLEN,
                                                               getMaxShingleSize(ft.getIndexAnalyzer()));
        if (this.maxIndexedPositionLength < 0) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
                                  "Unable to determine max position length of indexed phrases using " +
                                  "index analyzer for analysis field: " + analysisField.getName() +
                                  " and no override detected using param: " + PHRASE_INDEX_MAXLEN);
        }
        this.maxQueryPositionLength = req.getParams().getInt(PHRASE_QUERY_MAXLEN,
                                                             getMaxShingleSize(ft.getQueryAnalyzer()));
        if (this.maxQueryPositionLength < 0) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
                                  "Unable to determine max position length of query phrases using " +
                                  "query analyzer for analysis field: " + analysisField.getName() +
                                  " and no override detected using param: " + PHRASE_QUERY_MAXLEN);
        }
        if (this.maxQueryPositionLength < this.maxIndexedPositionLength) {
          throw new SolrException
            (ErrorCode.BAD_REQUEST,
             "Effective value of " + PHRASE_INDEX_MAXLEN + " (either from index analyzer shingle factory, " +
             " or expert param override) must be less than or equal to the effective value of " +
             PHRASE_QUERY_MAXLEN + " (either from query analyzer shingle factory, or expert param override)");
        }
      }

      this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{");
      this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}");

      this.allPhrases = Phrase.extractPhrases(this.rawInput, this.analysisField,
                                              this.maxIndexedPositionLength,
                                              this.maxQueryPositionLength);
    }

    /**
     * Given a list of phrases to be returned to the user, summarizes those phrases by decorating the
     * original input string to indicate where the identified phrases exist, using {@link #summaryPre}
     * and {@link #summaryPost}
     *
     * @param results a list of (non overlapping) Phrases that have been identified, sorted from highest scoring to lowest
     * @return the original user input, decorated to indicate the identified phrases
     */
    public String summarize(final List<Phrase> results) {
      final StringBuffer out = new StringBuffer(rawInput);

      // sort by *reverse* position so we can go back to front
      final List<Phrase> reversed = results.stream()
        .sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder()))
        .collect(Collectors.toList());

      for (Phrase p : reversed) {
        out.insert(p.getOffsetEnd(), summaryPost);
        out.insert(p.getOffsetStart(), summaryPre);
      }
      return out.toString();
    }
  }

  /**
   * Model the data known about a single (candidate) Phrase -- which may or may not be indexed
   * @lucene.internal
   */
  public static final class Phrase {

    /**
     * Factory method for constructing a list of Phrases given the specified input and using the analyzer
     * for the specified field.  The maxIndexedPositionLength and
     * maxQueryPositionLength provided *must* match the effective values used by
     * respective analyzers.
     */
    public static List<Phrase> extractPhrases(final String input, final SchemaField analysisField,
                                              final int maxIndexedPositionLength,
                                              final int maxQueryPositionLength) {

      // TODO: rather than requiring the query analyzer to produce the Phrases for us (assuming Shingles)
      // we could potentially just require that it produces unigrams compatible with the unigrams in the
      // indexed fields, and then build our own Phrases at query time -- making the maxQueryPositionLength
      // a 100% run time configuration option.
      // But that could be tricky given an arbitrary analyzer -- we'd have to pay careful attention
      // to positions, and we'd have to guess/assume what placeholders/fillers were used in the
      // indexed Phrases (typically shingles)

      assert maxIndexedPositionLength <= maxQueryPositionLength;

      final CharsRefBuilder buffer = new CharsRefBuilder();
      final FieldType ft = analysisField.getType();
      final Analyzer analyzer = ft.getQueryAnalyzer();
      final List<Phrase> results = new ArrayList<>(42);
      try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) {
        final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = tokenStream.addAttribute(PositionIncrementAttribute.class);
        final PositionLengthAttribute posLenAttr = tokenStream.addAttribute(PositionLengthAttribute.class);
        final TermToBytesRefAttribute termAttr = tokenStream.addAttribute(TermToBytesRefAttribute.class);

        int position = 0;
        int lastPosLen = -1;

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
          final Phrase phrase = new Phrase();

          final int posInc = posIncAttr.getPositionIncrement();
          final int posLen = posLenAttr.getPositionLength();

          if (0 == posInc && posLen <= lastPosLen) {
            // This requirement of analyzers to return tokens in ascending order of length
            // is currently necessary for the "linking" logic below to work
            // if people run into real world situations where this is problematic,
            // we can relax this check if we also make the linking logic more complex
            // (ie: less optimized)
            throw new SolrException
              (ErrorCode.BAD_REQUEST, "Phrase identification currently requires that " +
               "the analyzer used must produce tokens that overlap in increasing order of length.");
          }

          position += posInc;
          lastPosLen = posLen;

          phrase.position_start = position;
          phrase.position_end = position + posLen;

          phrase.is_indexed = (posLen <= maxIndexedPositionLength);

          phrase.offset_start = offsetAttr.startOffset();
          phrase.offset_end = offsetAttr.endOffset();

          // populate the subsequence directly from the raw input using the offsets,
          // (instead of using the TermToBytesRefAttribute) so we preserve the original
          // casing, whitespace, etc...
          phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end);

          if (phrase.is_indexed) {
            // populate the bytes so we can build term queries
            phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef());
          }

          results.add(phrase);
        }
        tokenStream.end();
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR,
                                "Analysis error extracting phrases from: " + input, e);
      }

      // fill in the relationships of each phrase
      //
      // NOTE: this logic currently requires that the phrases are sorted by position ascending
      // (automatic because of how PositionIncrementAttribute works) then by length ascending
      // (when positions are tied).
      // We could de-optimize this code if we find that secondary ordering is too restrictive for
      // some analyzers
      //
      // NOTE: changes to the scoring model may allow us to optimize/prune down the relationships tracked,
      // ...OR.... may require us to add/track more details about sub/parent phrases
      //
      for (int p = 0; p < results.size(); p++) {
        final Phrase current = results.get(p);
        if (! current.is_indexed) {
          // we're not an interesting sub phrase of anything
          continue;
        }

        // setup links from the phrase to itself if needed
        addLinkages(current, current, maxIndexedPositionLength);

        // scan backwards looking for phrases that might include us...
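        // (worked example, hypothetical numbers: if maxQueryPositionLength=7 and the current
        // phrase ends at position 10, then no phrase starting before position 3 can contain it;
        // and since results are ordered by start position, the backwards scan can stop at the
        // first phrase that starts too early)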
        BEFORE: for (int i = p-1; 0 <= i; i--) {
          final Phrase previous = results.get(i);
          if (previous.position_start < (current.position_end - maxQueryPositionLength)) {
            // we've scanned so far back nothing else is viable
            break BEFORE;
          }
          // any 'previous' phrases must start where current starts or earlier,
          // so only need to check the end...
          if (current.position_end <= previous.position_end) {
            addLinkages(previous, current, maxIndexedPositionLength);
          }
        }
        // scan forwards looking for phrases that might include us...
        AFTER: for (int i = p+1; i < results.size(); i++) {
          final Phrase next = results.get(i);
          // the only way a phrase that comes after current can include current is
          // if they have the same start position...
          if (current.position_start != next.position_start) {
            // we've scanned so far forward nothing else is viable
            break AFTER;
          }
          // any 'next' phrases must start where current starts, so only need to check the end...
          if (current.position_end <= next.position_end) {
            addLinkages(next, current, maxIndexedPositionLength);
          }
        }
      }
      return Collections.unmodifiableList(results);
    }

    /**
     * Given two phrases, one of which is a super set of the other, adds the necessary linkages
     * needed by the scoring model
     */
    private static void addLinkages(final Phrase outer, final Phrase inner,
                                    final int maxIndexedPositionLength) {

      assert outer.position_start <= inner.position_start;
      assert inner.position_end <= outer.position_end;
      assert inner.is_indexed;

      final int inner_len = inner.getPositionLength();
      if (1 == inner_len) {
        outer.individualIndexedTerms.add(inner);
      }
      if (maxIndexedPositionLength == inner_len
          || (inner == outer && inner_len < maxIndexedPositionLength)) {
        outer.largestIndexedSubPhrases.add(inner);
      }
      if (outer.is_indexed && inner != outer) {
        inner.indexedSuperPhrases.add(outer);
      }
    }

    /**
     * Format the phrases suitable for returning in a shard response
     * @see #populateStats(List,List)
     */
    public static List<NamedList<Object>> formatShardResponse(final List<Phrase> phrases) {
      List<NamedList<Object>> results = new ArrayList<>(phrases.size());
      for (Phrase p : phrases) {
        NamedList<Object> data = new SimpleOrderedMap<>();
        // quick and dirty way to validate that our shards aren't using different analyzers
        // so the coordinating node can fail fast when merging the results
        data.add("checksum", p.getChecksum());
        if (p.is_indexed) {
          data.add("ttf", new NamedList<>(p.phrase_ttf));
          data.add("df", new NamedList<>(p.phrase_df));
        }
        data.add("conj_dc", new NamedList<>(p.subTerms_conjunctionCounts));

        results.add(data);
      }
      return results;
    }

    /**
     * Populates the phrases with (merged) stats from a remote shard
     * @see #formatShardResponse
     */
    @SuppressWarnings("unchecked")
    public static void populateStats(final List<Phrase> phrases, final List<NamedList<Object>> shardData) {
      final int numPhrases = phrases.size();
      if (shardData.size() != numPhrases) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                "num phrases in shard data not consistent: " +
                                numPhrases + " vs " + shardData.size());
      }
      for (int i = 0; i < phrases.size(); i++) {
        // rather than being paranoid about the expected structure, we'll just let the low level
        // code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later...
        try {
          final Phrase p = phrases.get(i);
          final NamedList<Object> data = shardData.get(i);
          // sanity check the correct phrase
          if (! p.getChecksum().equals(data.get("checksum"))) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                    "phrase #" + i + " in shard data had invalid checksum");
          }
          if (p.is_indexed) {
            for (Map.Entry<String,Long> ttf : (NamedList<Long>) data.get("ttf")) {
              p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum);
            }
            for (Map.Entry<String,Long> df : (NamedList<Long>) data.get("df")) {
              p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum);
            }
          }
          for (Map.Entry<String,Long> conj_dc : (NamedList<Long>) data.get("conj_dc")) {
            p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum);
          }
        } catch (RuntimeException e) {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                  "shard data for phrase#" + i + " not consistent", e);
        }
      }
    }

    /**
     * Populates the phrases with stats from the local index for the specified fields
     */
    public static void populateStats(final List<Phrase> phrases, final Collection<String> fieldNames,
                                     final SolrIndexSearcher searcher) throws IOException {
      final IndexReader reader = searcher.getIndexReader();
      for (String field : fieldNames) {
        for (Phrase phrase : phrases) {
          if (phrase.is_indexed) {
            // add stats based on this entire phrase as an indexed term
            final Term t = new Term(field, phrase.bytes);
            phrase.phrase_ttf.put(field, reader.totalTermFreq(t));
            phrase.phrase_df.put(field, (long)reader.docFreq(t));
          }

          // even if our phrase is too long to be indexed whole, add stats based on the
          // conjunction of all the individual terms in the phrase
          List<Query> filters = new ArrayList<>(phrase.individualIndexedTerms.size());
          for (Phrase term : phrase.individualIndexedTerms) {
            // trust the SolrIndexSearcher to cache & intersect the individual terms so that this
            // can be efficient regardless of how often terms are re-used in the input/phrases
            filters.add(new TermQuery(new Term(field, term.bytes)));
          }
          final long count = searcher.getDocSet(filters).size();
          phrase.subTerms_conjunctionCounts.put(field, count);
        }
      }
    }

    /**
     * Uses the previously populated stats to populate each Phrase with its scores for the specified
     * fields, and its overall (weighted) total score.  This is not needed on shard requests.
     *
     * @see #populateStats
     * @see #getFieldScore(String)
     * @see #getTotalScore
     */
    public static void populateScores(final PhrasesContextData contextData) {
      populateScores(contextData.allPhrases, contextData.fieldWeights,
                     contextData.maxIndexedPositionLength,
                     contextData.maxQueryPositionLength);
    }

    /**
     * Public for testing purposes
     * @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData)
     * @lucene.internal
     */
    public static void populateScores(final List<Phrase> phrases, final Map<String,Double> fieldWeights,
                                      final int maxIndexedPositionLength,
                                      final int maxQueryPositionLength) {
      final double total_weight = fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum();
      for (Phrase phrase : phrases) {
        double phrase_cumulative_score = 0.0D;
        for (Map.Entry<String,Double> entry : fieldWeights.entrySet()) {
          final String field = entry.getKey();
          final double weight = entry.getValue();
          double field_score = computeFieldScore(phrase, field,
                                                 maxIndexedPositionLength, maxQueryPositionLength);
          phrase.fieldScores.put(field, field_score);
          phrase_cumulative_score += (field_score * weight);
        }
        phrase.total_score = (total_weight < 0 ? Double.NEGATIVE_INFINITY
                              : (phrase_cumulative_score / total_weight));
      }
    }

    private Phrase() {
      // No-Op
    }

    private boolean is_indexed;
    private double total_score = -1.0D; // until we get a computed score, this is "not a phrase"

    private CharSequence subSequence;
    private BytesRef bytes;
    private int offset_start;
    private int offset_end;
    private int position_start;
    private int position_end;
    private Integer checksum = null;

    /** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */
    private final List<Phrase> individualIndexedTerms = new ArrayList<>(7);
    /**
     * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
     * largest sub-phrases of themselves
     */
    private final List<Phrase> largestIndexedSubPhrases = new ArrayList<>(7);
    /** Phrases larger than this phrase which are indexed and fully contain it */
    private final List<Phrase> indexedSuperPhrases = new ArrayList<>(7);

    // NOTE: keys are field names
    private final Map<String,Long> subTerms_conjunctionCounts = new TreeMap<>();
    private final Map<String,Long> phrase_ttf = new TreeMap<>();
    private final Map<String,Long> phrase_df = new TreeMap<>();
    private final Map<String,Double> fieldScores = new TreeMap<>();

    public String toString() {
      return "'" + subSequence + "'"
        + "[" + offset_start + ":" + offset_end + "]"
        + "[" + position_start + ":" + position_end + "]";
    }

    public NamedList<Object> getDetails() {
      SimpleOrderedMap<Object> out = new SimpleOrderedMap<>();
      out.add("text", subSequence);
      out.add("offset_start", getOffsetStart());
      out.add("offset_end", getOffsetEnd());
      out.add("score", getTotalScore());
      out.add("field_scores", fieldScores);
      return out;
    }

    /**
     * Computes &amp; caches the checksum of this Phrase (if not already cached).
     * Needed only when merging shard data to validate no inconsistencies with the remote shards.
     */
    private Integer getChecksum() {
      if (null == checksum) {
        checksum = Arrays.hashCode(new int[] { offset_start, offset_end, position_start, position_end });
      }
      return checksum;
    }
    /** The characters from the original input that correspond with this Phrase */
    public CharSequence getSubSequence() {
      return subSequence;
    }

    /**
     * Returns the list of "individual" (ie: getPositionLength()==1) terms.
     * NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves
     */
    public List<Phrase> getIndividualIndexedTerms() {
      return individualIndexedTerms;
    }
    /**
     * Returns the list of (overlapping) sub phrases that have the largest possible size based on
     * the effective value of {@link PhrasesContextData#maxIndexedPositionLength}.
     * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
     * largest sub-phrases of themselves.
     */
    public List<Phrase> getLargestIndexedSubPhrases() {
      return largestIndexedSubPhrases;
    }
    /**
     * Returns all phrases larger than this phrase, which fully include this phrase, and are indexed.
     * NOTE: A Phrase is never the super phrase of itself.
     */
    public List<Phrase> getIndexedSuperPhrases() {
      return indexedSuperPhrases;
    }

    /** NOTE: positions start at '1' */
    public int getPositionStart() {
      return position_start;
    }
    /** NOTE: positions start at '1' */
    public int getPositionEnd() {
      return position_end;
    }
    public int getPositionLength() {
      return position_end - position_start;
    }
    /** Each set bit identifies a position filled by this Phrase */
    public BitSet getPositionsBitSet() {
      final BitSet result = new BitSet();
      result.set(position_start, position_end);
      return result;
    }
    public int getOffsetStart() {
      return offset_start;
    }
    public int getOffsetEnd() {
      return offset_end;
    }

    /**
     * Returns the overall score for this Phrase.  In the current implementation,
     * the only guarantee made regarding the range of possible values is that 0 (or less) means
     * it is not a good phrase.
     *
     * @return A numeric value indicating the confidence in this Phrase, higher numbers are higher confidence.
     */
    public double getTotalScore() {
      return total_score;
    }
    /**
     * Returns the score for this Phrase in the given field.  In the current implementation,
     * the only guarantee made regarding the range of possible values is that 0 (or less) means
     * it is not a good phrase.
     *
     * @return A numeric value indicating the confidence in this Phrase for this field, higher numbers are higher confidence.
     */
    public double getFieldScore(String field) {
      return fieldScores.getOrDefault(field, -1.0D);
    }

    /**
     * Returns the total TTF of this (indexed) Phrase as a term in the specified field.
     * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
     * methods has been called with this field.
     */
    public long getTTF(String field) {
      if (!is_indexed) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                "TTF is only available for indexed phrases");
      }
      return phrase_ttf.getOrDefault(field, 0L);
    }
    /**
     * Returns the number of documents that contain all of the {@link #getIndividualIndexedTerms}
     * that make up this Phrase, in the specified field.
     * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
     * methods has been called with this field.
     */
    public long getConjunctionDocCount(String field) {
      return subTerms_conjunctionCounts.getOrDefault(field, 0L);
    }
    /**
     * Returns the number of documents that contain this (indexed) Phrase as a term
     * in the specified field.
     * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
     * methods has been called with this field.
     */
    public long getDocFreq(String field) {
      if (!is_indexed) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                "DF is only available for indexed phrases");
      }
      return phrase_df.getOrDefault(field, 0L);
    }

    /**
     * Uses the previously populated stats to compute a score for the specified field.
     *
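     * <p>
     * (Informally: the model compares how often the individual words co-occur as an indexed
     * shingle against how often they occur independently, discounted by the probability of any
     * larger indexed phrase that wraps them; see the inline comments in this method for the
     * full details of the model.)
     * </p>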
     * <p>
     * The current implementation returns scores in the range of [0,1], but this
     * may change in future implementations.  The only current guarantees are:
     * </p>
     * <ul>
     * <li>0 (or less) means this is guaranteed to not be a phrase</li>
     * <li>larger numbers are higher confidence</li>
     * </ul>
     *
     * @see #populateStats
     * @see #populateScores
     * @see #getFieldScore(String)
     * @return a score value
     */
    private static double computeFieldScore(final Phrase input, final String field,
                                            final int maxIndexedPositionLength,
                                            final int maxQueryPositionLength) {
      final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size();
      assert 0 <= num_indexed_sub_phrases; // should be impossible

      if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) {
        // there are "gaps" in our input, where individual words have not been indexed (stop words,
        // or multivalue position gap) which means we are not a viable candidate for being a valid Phrase.
        return -1.0D;
      }

      final long phrase_conj_count = input.getConjunctionDocCount(field);
      // if there isn't a single document containing all the terms in our
      // phrase, then it is 100% not a phrase
      if (phrase_conj_count <= 0) {
        return -1.0D;
      }

      // single words automatically score 0.0 (unless they already scored less for not existing)
      if (input.getPositionLength() <= 1) {
        return 0.0D;
      }

      double field_score = 0.0D;
      long max_sub_conj_count = phrase_conj_count;

      // At the moment, the contribution of each "words" sub-Phrase to the field score of the input
      // Phrase is independent of any context of "input".  Depending on if/how sub-phrase scoring
      // changes, we might consider computing the scores of all the indexed phrases first, and
      // caching the portions of their values that are re-used when computing the scores of
      // longer phrases?
      //
      // This would make the overall scoring of all phrases a lot more complicated,
      // but could save CPU cycles?
      // (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???)
      //
      // My gut says that knowing the conj_count(input) "context" should help us score the
      // sub-phrases better, but i can't yet put my finger on why/how.  maybe by comparing
      // the conj_count(input) to the max(conj_count(parent of words)) ?

      // for each of the longest indexed phrases, aka indexed sub-sequences of "words", we have...
      for (Phrase words : input.getLargestIndexedSubPhrases()) {
        // we're going to compute scores in range of [-1:1] to indicate the likelihood that our
        // "words" should be used as a "phrase", based on a bayesian document categorization model,
        // where the "words as a phrase" (aka: phrase) is our candidate category.
        //
        // P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase)
        //
        // Where...
        //  P(words|phrase)     =  phrase_ttf / min(word_ttf)
        //  P(phrase)           =~ phrase_docFreq / conj_count(words in phrase)  *SEE NOTE BELOW*
        //  P(words|not phrase) =  phrase_ttf / max(word_ttf)
        //  P(not a phrase)     =  1 - P(phrase)
        //
        // ... BUT! ...
        //
        // NOTE: we're going to reduce our "P(phrase)" by the max "P(phrase)" of all the (indexed)
        // candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor
        // of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase exists

        // IDEA: consider replacing this entire bayesian model with LLR (or rootLLR)...
        //  http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html
        // ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each
        // indexed phrase and take the min|max|avg of the LLR scores.
        //
        // ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) &
        // LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurrence
        // count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else"
        //
        // (we could actually compute LLR stats over TTF and DF and combine them)
        //
        // NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed)
        // sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to
        // an existing phrase).  As well as require us to give up on a predictable "range" of
        // legal values for scores (IIUC from the LLR docs)

        final long phrase_ttf = words.getTTF(field);
        final long phrase_df = words.getDocFreq(field);
        final long words_conj_count = words.getConjunctionDocCount(field);
        max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count);

        final double max_wrapper_phrase_probability =
          words.getIndexedSuperPhrases().stream()
          .mapToDouble(p -> p.getConjunctionDocCount(field) <= 0 ?
                       // special case check -- we already know *our* conj count > 0,
                       // but we need a similar check for wrapper phrases: if <= 0, their probability is 0
                       0.0D : ((double)p.getDocFreq(field) / p.getConjunctionDocCount(field)))
          .max().orElse(0.0D);

        final LongSummaryStatistics words_ttfs =
          words.getIndividualIndexedTerms().stream()
          .collect(Collectors.summarizingLong(t -> t.getTTF(field)));

        final double words_phrase_prob = (phrase_ttf / (double)words_ttfs.getMin());
        final double words_not_phrase_prob = (phrase_ttf / (double)words_ttfs.getMax());

        final double phrase_prob = (phrase_conj_count / (double)words_conj_count);

        final double phrase_score = words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability);
        final double not_phrase_score = words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability));
        final double words_score = phrase_score - not_phrase_score;

        field_score += words_score;
      }

      // NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned value)
      // when they should ideally be penalizing the scores further, but since we currently don't care
      // about any score lower than 0, it's not worth worrying about.

      // Average the accumulated score over the number of actual indexed sub-phrases that contributed
      //
      // NOTE: since we subsequently want to multiply the score by a fraction with num_indexed_sub_phrases
      // in the numerator, we can skip this...
      // SEE BELOW
      // field_score /= (double) num_indexed_sub_phrases;

      // If we leave field_score as is, then a phrase longer than the maxIndexedPositionLength
      // will never score higher than the highest scoring sub-phrase it has (because we've averaged them)
      // so we scale the scores against the longest possible phrase length we're considering
      //
      // NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when
      // averaging above...
      field_score *= ( 1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases )
                       / (1 + maxQueryPositionLength - maxIndexedPositionLength) );

      // scale the field_score based on the ratio of the conjunction docCount for the whole phrase
      // relative to the largest conjunction docCount of its (largest indexed) sub phrases, to penalize
      // the scores of very long phrases that exist very rarely relative to how often their
      // sub phrases exist in the index
      field_score *= ( ((double) phrase_conj_count) / max_sub_conj_count );

      return field_score;
    }
  }

  /**
   * Helper method, public for testing purposes only.
   * <p>
   * Given an analyzer, inspects it to determine if:
   * </p>
   * <ul>
   * <li>it is a {@link TokenizerChain}</li>
   * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
   * </ul>
   * <p>
   * If these conditions are met, then this method returns the maxShingleSize
   * in effect for this analyzer, otherwise returns -1.
   * </p>
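   * <p>
   * (For example: given the sample "phrases" field type shown in the class level javadoc,
   * this method would return 3 for the index time analyzer and 7 for the query time analyzer.)
   * </p>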
   *
   * @param analyzer An analyzer to inspect
   * @return maxShingleSize if available
   * @lucene.internal
   */
  public static int getMaxShingleSize(Analyzer analyzer) {
    if (!TokenizerChain.class.isInstance(analyzer)) {
      return -1;
    }
    final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
    if (0 == factories.length) {
      return -1;
    }
    int result = -1;
    for (TokenFilterFactory tff : factories) {
      if (ShingleFilterFactory.class.isInstance(tff)) {
        if (0 < result) {
          // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
          return -1;
        }
        // would be nice if there was an easy way to just ask a factory for the effective value
        // of an argument...
        final Map<String,String> args = tff.getOriginalArgs();
        result = args.containsKey("maxShingleSize")
          ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
      }
    }
    return result;
  }
}