All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.sandbox.search.TermAutomatonQuery Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.sandbox.search;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

// TODO
//    - compare perf to PhraseQuery exact and sloppy
//    - optimize: find terms that are in fact MUST (because all paths
//      through the A include that term)
//    - if we ever store posLength in the index, it would be easy[ish]
//      to take it into account here

/**
 * A proximity query that lets you express an automaton, whose transitions are terms, to match
 * documents. This is a generalization of other proximity queries like {@link PhraseQuery}, {@link
 * MultiPhraseQuery} and {@link SpanNearQuery}. It is likely slow, since it visits any document
 * having any of the terms (i.e. it acts like a disjunction, not a conjunction like {@link
 * PhraseQuery}), and then it must merge-sort all positions within each document to test whether/how
 * many times the automaton matches.
 *
 * 

After creating the query, use {@link #createState}, {@link #setAccept}, {@link #addTransition} * and {@link #addAnyTransition} to build up the automaton. Once you are done, call {@link #finish} * and then execute the query. * *

This code is very new and likely has exciting bugs! * * @lucene.experimental */ public class TermAutomatonQuery extends Query implements Accountable { private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(TermAutomatonQuery.class); private final String field; private final Automaton.Builder builder; Automaton det; private final Map termToID = new HashMap<>(); private final Map idToTerm = new HashMap<>(); private int anyTermID = -1; public TermAutomatonQuery(String field) { this.field = field; this.builder = new Automaton.Builder(); } /** Returns a new state; state 0 is always the initial state. */ public int createState() { return builder.createState(); } /** Marks the specified state as accept or not. */ public void setAccept(int state, boolean accept) { builder.setAccept(state, accept); } /** Adds a transition to the automaton. */ public void addTransition(int source, int dest, String term) { addTransition(source, dest, new BytesRef(term)); } /** Adds a transition to the automaton. */ public void addTransition(int source, int dest, BytesRef term) { if (term == null) { throw new NullPointerException("term should not be null"); } builder.addTransition(source, dest, getTermID(term)); } /** Adds a transition matching any term. */ public void addAnyTransition(int source, int dest) { builder.addTransition(source, dest, getTermID(null)); } /** Call this once you are done adding states/transitions. */ public void finish() { finish(DEFAULT_DETERMINIZE_WORK_LIMIT); } /** * Call this once you are done adding states/transitions. * * @param determinizeWorkLimit Maximum effort to spend determinizing the automaton. Higher numbers * allow this operation to consume more memory but allow more complex automatons. Use {@link * Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't otherwise know * what to specify. */ public void finish(int determinizeWorkLimit) { Automaton automaton = builder.finish(); // System.out.println("before det:\n" + automaton.toDot()); Transition t = new Transition(); // TODO: should we add "eps back to initial node" for all states, // and det that? then we don't need to revisit initial node at // every position? but automaton could blow up? And, this makes it // harder to skip useless positions at search time? if (anyTermID != -1) { // Make sure there are no leading or trailing ANY: int count = automaton.initTransition(0, t); for (int i = 0; i < count; i++) { automaton.getNextTransition(t); if (anyTermID >= t.min && anyTermID <= t.max) { throw new IllegalStateException("automaton cannot lead with an ANY transition"); } } int numStates = automaton.getNumStates(); for (int i = 0; i < numStates; i++) { count = automaton.initTransition(i, t); for (int j = 0; j < count; j++) { automaton.getNextTransition(t); if (automaton.isAccept(t.dest) && anyTermID >= t.min && anyTermID <= t.max) { throw new IllegalStateException("automaton cannot end with an ANY transition"); } } } int termCount = termToID.size(); // We have to carefully translate these transitions so automaton // realizes they also match all other terms: Automaton newAutomaton = new Automaton(); for (int i = 0; i < numStates; i++) { newAutomaton.createState(); newAutomaton.setAccept(i, automaton.isAccept(i)); } for (int i = 0; i < numStates; i++) { count = automaton.initTransition(i, t); for (int j = 0; j < count; j++) { automaton.getNextTransition(t); int min, max; if (t.min <= anyTermID && anyTermID <= t.max) { // Match any term min = 0; max = termCount - 1; } else { min = t.min; max = t.max; } newAutomaton.addTransition(t.source, t.dest, min, max); } } newAutomaton.finishState(); automaton = newAutomaton; } det = Operations.removeDeadStates(Operations.determinize(automaton, determinizeWorkLimit)); if (det.isAccept(0)) { throw new IllegalStateException("cannot accept the empty string"); } } @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { IndexReaderContext context = searcher.getTopReaderContext(); Map termStates = new HashMap<>(); for (Map.Entry ent : termToID.entrySet()) { if (ent.getKey() != null) { termStates.put( ent.getValue(), TermStates.build(context, new Term(field, ent.getKey()), scoreMode.needsScores())); } } return new TermAutomatonWeight(det, searcher, termStates, boost); } @Override public String toString(String field) { // TODO: what really am I supposed to do with the incoming field... StringBuilder sb = new StringBuilder(); sb.append("TermAutomatonQuery(field="); sb.append(this.field); if (det != null) { sb.append(" numStates="); sb.append(det.getNumStates()); } sb.append(')'); return sb.toString(); } private int getTermID(BytesRef term) { Integer id = termToID.get(term); if (id == null) { id = termToID.size(); if (term != null) { term = BytesRef.deepCopyOf(term); } termToID.put(term, id); idToTerm.put(id, term); if (term == null) { anyTermID = id; } } return id; } /** Returns true iff o is equal to this. */ @Override public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } private static boolean checkFinished(TermAutomatonQuery q) { if (q.det == null) { throw new IllegalStateException("Call finish first on: " + q); } return true; } private boolean equalsTo(TermAutomatonQuery other) { return checkFinished(this) && checkFinished(other) && other == this; } @Override public int hashCode() { checkFinished(this); // LUCENE-7295: this used to be very awkward toDot() call; it is safer to assume // that no two instances are equivalent instead (until somebody finds a better way to check // on automaton equivalence quickly). return System.identityHashCode(this); } @Override public long ramBytesUsed() { return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(builder) + RamUsageEstimator.sizeOfObject(det) + RamUsageEstimator.sizeOfObject(field) + RamUsageEstimator.sizeOfObject(idToTerm) + RamUsageEstimator.sizeOfObject(termToID); } /** * Returns the dot (graphviz) representation of this automaton. This is extremely useful for * visualizing the automaton. */ public String toDot() { // TODO: refactor & share with Automaton.toDot! StringBuilder b = new StringBuilder(); b.append("digraph Automaton {\n"); b.append(" rankdir = LR\n"); final int numStates = det.getNumStates(); if (numStates > 0) { b.append(" initial [shape=plaintext,label=\"0\"]\n"); b.append(" initial -> 0\n"); } Transition t = new Transition(); for (int state = 0; state < numStates; state++) { b.append(" "); b.append(state); if (det.isAccept(state)) { b.append(" [shape=doublecircle,label=\"").append(state).append("\"]\n"); } else { b.append(" [shape=circle,label=\"").append(state).append("\"]\n"); } int numTransitions = det.initTransition(state, t); for (int i = 0; i < numTransitions; i++) { det.getNextTransition(t); assert t.max >= t.min; for (int j = t.min; j <= t.max; j++) { b.append(" "); b.append(state); b.append(" -> "); b.append(t.dest); b.append(" [label=\""); if (j == anyTermID) { b.append('*'); } else { b.append(idToTerm.get(j).utf8ToString()); } b.append("\"]\n"); } } } b.append('}'); return b.toString(); } // TODO: should we impl rewrite to return BooleanQuery of PhraseQuery, // when 1) automaton is finite, 2) doesn't use ANY transition, 3) is // "small enough"? static class EnumAndScorer { public final int termID; public final PostingsEnum posEnum; // How many positions left in the current document: public int posLeft; // Current position public int pos; public EnumAndScorer(int termID, PostingsEnum posEnum) { this.termID = termID; this.posEnum = posEnum; } } final class TermAutomatonWeight extends Weight { final Automaton automaton; private final Map termStates; private final Similarity.SimScorer stats; private final Similarity similarity; public TermAutomatonWeight( Automaton automaton, IndexSearcher searcher, Map termStates, float boost) throws IOException { super(TermAutomatonQuery.this); this.automaton = automaton; this.termStates = termStates; this.similarity = searcher.getSimilarity(); List allTermStats = new ArrayList<>(); for (Map.Entry ent : idToTerm.entrySet()) { Integer termID = ent.getKey(); if (ent.getValue() != null) { TermStates ts = termStates.get(termID); if (ts.docFreq() > 0) { allTermStats.add( searcher.termStatistics( new Term(field, ent.getValue()), ts.docFreq(), ts.totalTermFreq())); } } } if (allTermStats.isEmpty()) { stats = null; // no terms matched at all, will not use sim } else { stats = similarity.scorer( boost, searcher.collectionStatistics(field), allTermStats.toArray(new TermStatistics[allTermStats.size()])); } } @Override public String toString() { return "weight(" + TermAutomatonQuery.this + ")"; } @Override public Scorer scorer(LeafReaderContext context) throws IOException { // Initialize the enums; null for a given slot means that term didn't appear in this reader EnumAndScorer[] enums = new EnumAndScorer[idToTerm.size()]; boolean any = false; for (Map.Entry ent : termStates.entrySet()) { TermStates termStates = ent.getValue(); assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); BytesRef term = idToTerm.get(ent.getKey()); TermState state = termStates.get(context); if (state != null) { TermsEnum termsEnum = context.reader().terms(field).iterator(); termsEnum.seekExact(term, state); enums[ent.getKey()] = new EnumAndScorer(ent.getKey(), termsEnum.postings(null, PostingsEnum.POSITIONS)); any = true; } } if (any) { return new TermAutomatonScorer( this, enums, anyTermID, new LeafSimScorer(stats, context.reader(), field, true)); } else { return null; } } @Override public boolean isCacheable(LeafReaderContext ctx) { return true; } @Override public Explanation explain(LeafReaderContext context, int doc) throws IOException { // TODO return null; } } @Override public Query rewrite(IndexReader reader) throws IOException { if (Operations.isEmpty(det)) { return new MatchNoDocsQuery(); } IntsRef single = Operations.getSingleton(det); if (single != null && single.length == 1) { return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset]))); } // TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we // even have MultiPhraseQuery? // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a // sausage: MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder(); PhraseQuery.Builder pq = new PhraseQuery.Builder(); Transition t = new Transition(); int state = 0; int pos = 0; query: while (true) { int count = det.initTransition(state, t); if (count == 0) { if (det.isAccept(state) == false) { mpq = null; pq = null; } break; } else if (det.isAccept(state)) { mpq = null; pq = null; break; } int dest = -1; List terms = new ArrayList<>(); boolean matchesAny = false; for (int i = 0; i < count; i++) { det.getNextTransition(t); if (i == 0) { dest = t.dest; } else if (dest != t.dest) { mpq = null; pq = null; break query; } matchesAny |= anyTermID >= t.min && anyTermID <= t.max; if (matchesAny == false) { for (int termID = t.min; termID <= t.max; termID++) { terms.add(new Term(field, idToTerm.get(termID))); } } } if (matchesAny == false) { mpq.add(terms.toArray(new Term[terms.size()]), pos); if (pq != null) { if (terms.size() == 1) { pq.add(terms.get(0), pos); } else { pq = null; } } } state = dest; pos++; } if (pq != null) { return pq.build(); } else if (mpq != null) { return mpq.build(); } // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's // "worth it"? return this; } @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field) == false) { return; } QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this); for (BytesRef term : termToID.keySet()) { v.consumeTerms(this, new Term(field, term)); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy