com.o19s.es.explore.ExplorerQuery Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch-learning-to-rank Show documentation
Learning to Rank Query w/ RankLib Models
/*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.o19s.es.explore;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import java.io.IOException;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
/**
 * Query wrapper that exposes term statistics of the wrapped query as the document score.
 *
 * <p>Depending on {@code type}, the score is either a collection-scoped constant
 * (doc frequency / idf / total term frequency / unique term count aggregates, identical
 * for every matching document) or a per-document raw term-frequency statistic
 * (types ending in {@code "_raw_tf"}, delegated to {@link PostingsExplorerQuery}).
 */
public class ExplorerQuery extends Query {
    private final Query query;
    private final String type;

    /**
     * @param query the query whose extracted terms are used to gather statistics
     * @param type  the statistic to expose as the score, e.g. {@code "sum_raw_df"},
     *              {@code "max_classic_idf"}, {@code "mean_raw_ttf"}, {@code "min_raw_tf"},
     *              {@code "unique_terms_count"}
     */
    public ExplorerQuery(Query query, String type) {
        this.query = query;
        this.type = type;
    }

    /**
     * True when the requested statistic is computed once over the whole index
     * (every matching document receives the same constant score).
     */
    private boolean isCollectionScoped() {
        return type.endsWith("_count")
                || type.endsWith("_df")
                || type.endsWith("_idf")
                || type.endsWith("_ttf");
    }

    public Query getQuery() { return this.query; }

    public String getType() { return this.type; }

    @Override
    public boolean equals(Object other) {
        return sameClassAs(other) &&
                equalsTo(getClass().cast(other));
    }

    private boolean equalsTo(ExplorerQuery other) {
        return Objects.equals(query, other.query)
                && Objects.equals(type, other.type);
    }

    @Override
    public Query rewrite(IndexReader reader) throws IOException {
        Query rewritten = query.rewrite(reader);
        if (rewritten != query) {
            return new ExplorerQuery(rewritten, type);
        }
        return this;
    }

    @Override
    public int hashCode() {
        return Objects.hash(query, type);
    }

    @Override
    public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
        if (!needsScores) {
            // Statistics only matter when scoring; otherwise defer entirely to the wrapped query.
            return searcher.createWeight(query, false, boost);
        }
        final Weight subWeight = searcher.createWeight(query, true, boost);
        // FIX: was a raw Set — Weight.extractTerms takes Set<Term>; typing it properly
        // removes unchecked warnings and the implicit cast in the loop below.
        Set<Term> terms = new HashSet<>();
        subWeight.extractTerms(terms);
        if (isCollectionScoped()) {
            ClassicSimilarity sim = new ClassicSimilarity();
            StatisticsHelper df_stats = new StatisticsHelper();
            StatisticsHelper idf_stats = new StatisticsHelper();
            StatisticsHelper ttf_stats = new StatisticsHelper();
            for (Term term : terms) {
                TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term);
                TermStatistics tStats = searcher.termStatistics(term, ctx);
                df_stats.add(tStats.docFreq());
                idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
                ttf_stats.add(tStats.totalTermFreq());
            }
            /*
                If no terms are parsed in the query we opt for returning 0
                instead of throwing an exception that could break various
                pipelines.
            */
            float constantScore;
            if (terms.size() > 0) {
                switch (type) {
                    case ("sum_classic_idf"):
                        constantScore = idf_stats.getSum();
                        break;
                    case ("mean_classic_idf"):
                        constantScore = idf_stats.getMean();
                        break;
                    case ("max_classic_idf"):
                        constantScore = idf_stats.getMax();
                        break;
                    case ("min_classic_idf"):
                        constantScore = idf_stats.getMin();
                        break;
                    case ("stddev_classic_idf"):
                        constantScore = idf_stats.getStdDev();
                        break;
                    case "sum_raw_df":
                        constantScore = df_stats.getSum();
                        break;
                    case "min_raw_df":
                        constantScore = df_stats.getMin();
                        break;
                    case "max_raw_df":
                        constantScore = df_stats.getMax();
                        break;
                    case "mean_raw_df":
                        constantScore = df_stats.getMean();
                        break;
                    case "stddev_raw_df":
                        constantScore = df_stats.getStdDev();
                        break;
                    case "sum_raw_ttf":
                        constantScore = ttf_stats.getSum();
                        break;
                    case "min_raw_ttf":
                        constantScore = ttf_stats.getMin();
                        break;
                    case "max_raw_ttf":
                        constantScore = ttf_stats.getMax();
                        break;
                    case "mean_raw_ttf":
                        constantScore = ttf_stats.getMean();
                        break;
                    case "stddev_raw_ttf":
                        constantScore = ttf_stats.getStdDev();
                        break;
                    case "unique_terms_count":
                        constantScore = terms.size();
                        break;
                    default:
                        // FIX: was a bare RuntimeException — use IllegalArgumentException
                        // (a RuntimeException subtype, so backward-compatible for callers)
                        // for consistency with the unknown-type error thrown below.
                        throw new IllegalArgumentException("Invalid stat type specified.");
                }
            } else {
                constantScore = 0.0f;
            }
            return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {
                @Override
                public Explanation explain(LeafReaderContext context, int doc) throws IOException {
                    Scorer scorer = scorer(context);
                    int newDoc = scorer.iterator().advance(doc);
                    assert newDoc == doc; // this is a DocIdSetIterator.all
                    return Explanation.match(
                            scorer.score(),
                            "Stat Score: " + type);
                }

                @Override
                public Scorer scorer(LeafReaderContext context) throws IOException {
                    // Every doc in the segment gets the same precomputed constant score.
                    return new ConstantScoreScorer(this, constantScore, DocIdSetIterator.all(context.reader().maxDoc()));
                }

                @Override
                public boolean isCacheable(LeafReaderContext ctx) {
                    return true;
                }
            };
        } else if (type.endsWith("_raw_tf")) {
            // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery
            BooleanQuery.Builder qb = new BooleanQuery.Builder();
            for (Term t : terms) {
                qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF), BooleanClause.Occur.SHOULD));
            }
            // FIXME: completely refactor this class and stop accepting a random query but a list of terms directly
            // rewriting at this point is wrong, additionally we certainly build the TermContext twice for every terms
            // problem is that we rely on extractTerms which happen too late in the process
            Query q = qb.build().rewrite(searcher.getIndexReader());
            return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type);
        }
        throw new IllegalArgumentException( "Unknown ExplorerQuery type [" + type + "]" );
    }

    /** Weight for per-document statistics; delegates matching to the rewritten sub-weight. */
    static class ExplorerWeight extends Weight {
        protected final Weight weight;
        private final String type;

        ExplorerWeight(Query q, Weight subWeight, String type) throws IOException {
            super(q);
            weight = subWeight;
            this.type = type;
        }

        // FIX: was a raw Set — the Weight API declares extractTerms(Set<Term>).
        @Override
        public void extractTerms(Set<Term> terms) {
            weight.extractTerms(terms);
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            Scorer scorer = scorer(context);
            if (scorer != null) {
                int newDoc = scorer.iterator().advance(doc);
                if (newDoc == doc) {
                    return Explanation.match(
                            scorer.score(),
                            "Stat Score: " + type);
                }
            }
            return Explanation.noMatch("no matching term");
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
            return this.weight.isCacheable(ctx);
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            Scorer subscorer = weight.scorer(context);
            // FIX: honor the Weight.scorer contract — return null (no matches in this
            // segment) instead of wrapping a null sub-scorer, which would NPE later.
            // explain() above already handles a null scorer.
            if (subscorer == null) {
                return null;
            }
            return new ExplorerScorer(weight, type, subscorer);
        }
    }

    // FIX: added missing @Override and removed the stray trailing semicolon after the method.
    @Override
    public String toString(String field) {
        return query.toString();
    }
}