org.elasticsearch.action.termvectors.TermVectorsFilter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.action.termvectors;
import com.google.common.util.concurrent.AtomicLongMap;
import org.apache.lucene.index.*;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
public class TermVectorsFilter {
public static final int DEFAULT_MAX_QUERY_TERMS = 25;
public static final int DEFAULT_MIN_TERM_FREQ = 0;
public static final int DEFAULT_MAX_TERM_FREQ = Integer.MAX_VALUE;
public static final int DEFAULT_MIN_DOC_FREQ = 0;
public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
public static final int DEFAULT_MIN_WORD_LENGTH = 0;
public static final int DEFAULT_MAX_WORD_LENGTH = 0;
private int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
private int maxTermFreq = DEFAULT_MAX_TERM_FREQ;
private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;
private Fields fields;
private Fields topLevelFields;
private final Set selectedFields;
private AggregatedDfs dfs;
private Map scoreTerms;
private AtomicLongMap sizes;
private TFIDFSimilarity similarity;
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set selectedFields, @Nullable AggregatedDfs dfs) {
this.fields = termVectorsByField;
this.topLevelFields = topLevelFields;
this.selectedFields = selectedFields;
this.dfs = dfs;
this.scoreTerms = new HashMap<>();
this.sizes = AtomicLongMap.create();
this.similarity = new DefaultSimilarity();
}
public void setSettings(TermVectorsRequest.FilterSettings settings) {
if (settings.maxNumTerms != null) {
setMaxNumTerms(settings.maxNumTerms);
}
if (settings.minTermFreq != null) {
setMinTermFreq(settings.minTermFreq);
}
if (settings.maxTermFreq != null) {
setMaxTermFreq(settings.maxTermFreq);
}
if (settings.minDocFreq != null) {
setMinDocFreq(settings.minDocFreq);
}
if (settings.maxDocFreq != null) {
setMaxDocFreq(settings.maxDocFreq);
}
if (settings.minWordLength != null) {
setMinWordLength(settings.minWordLength);
}
if (settings.maxWordLength != null) {
setMaxWordLength(settings.maxWordLength);
}
}
public ScoreTerm getScoreTerm(Term term) {
return scoreTerms.get(term);
}
public boolean hasScoreTerm(Term term) {
return getScoreTerm(term) != null;
}
public long size(String fieldName) {
return sizes.get(fieldName);
}
public int getMaxNumTerms() {
return maxNumTerms;
}
public int getMinTermFreq() {
return minTermFreq;
}
public int getMaxTermFreq() {
return maxTermFreq;
}
public int getMinDocFreq() {
return minDocFreq;
}
public int getMaxDocFreq() {
return maxDocFreq;
}
public int getMinWordLength() {
return minWordLength;
}
public int getMaxWordLength() {
return maxWordLength;
}
public void setMaxNumTerms(int maxNumTerms) {
this.maxNumTerms = maxNumTerms;
}
public void setMinTermFreq(int minTermFreq) {
this.minTermFreq = minTermFreq;
}
public void setMaxTermFreq(int maxTermFreq) {
this.maxTermFreq = maxTermFreq;
}
public void setMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
public void setMaxDocFreq(int maxDocFreq) {
this.maxDocFreq = maxDocFreq;
}
public void setMinWordLength(int minWordLength) {
this.minWordLength = minWordLength;
}
public void setMaxWordLength(int maxWordLength) {
this.maxWordLength = maxWordLength;
}
public static final class ScoreTerm {
public String field;
public String word;
public float score;
ScoreTerm(String field, String word, float score) {
this.field = field;
this.word = word;
this.score = score;
}
void update(String field, String word, float score) {
this.field = field;
this.word = word;
this.score = score;
}
}
public void selectBestTerms() throws IOException {
PostingsEnum docsEnum = null;
for (String fieldName : fields) {
if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
continue;
}
Terms terms = fields.terms(fieldName);
Terms topLevelTerms = topLevelFields.terms(fieldName);
// if no terms found, take the retrieved term vector fields for stats
if (topLevelTerms == null) {
topLevelTerms = terms;
}
long numDocs = getDocCount(fieldName, topLevelTerms);
// one queue per field name
ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
// select terms with highest tf-idf
TermsEnum termsEnum = terms.iterator();
TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
while (termsEnum.next() != null) {
BytesRef termBytesRef = termsEnum.term();
boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
Term term = new Term(fieldName, termBytesRef);
// remove noise words
int freq = getTermFreq(termsEnum, docsEnum);
if (isNoise(term.bytes().utf8ToString(), freq)) {
continue;
}
// now call on docFreq
long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
if (!isAccepted(docFreq)) {
continue;
}
// filter based on score
float score = computeScore(docFreq, freq, numDocs);
queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
}
// retain the best terms for quick lookups
ScoreTerm scoreTerm;
while ((scoreTerm = queue.pop()) != null) {
scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
sizes.incrementAndGet(scoreTerm.field);
}
}
}
private boolean isNoise(String word, int freq) {
// filter out words based on length
int len = word.length();
if (minWordLength > 0 && len < minWordLength) {
return true;
}
if (maxWordLength > 0 && len > maxWordLength) {
return true;
}
// filter out words that don't occur enough times in the source
if (minTermFreq > 0 && freq < minTermFreq) {
return true;
}
// filter out words that occur too many times in the source
if (freq > maxTermFreq) {
return true;
}
return false;
}
private boolean isAccepted(long docFreq) {
// filter out words that don't occur in enough docs
if (minDocFreq > 0 && docFreq < minDocFreq) {
return false;
}
// filter out words that occur in too many docs
if (docFreq > maxDocFreq) {
return false;
}
// index update problem?
if (docFreq == 0) {
return false;
}
return true;
}
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
if (dfs != null) {
return dfs.fieldStatistics().get(fieldName).docCount();
}
return topLevelTerms.getDocCount();
}
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
if (dfs != null) {
return dfs.termStatistics().get(term);
}
return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
}
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
docsEnum = termsEnum.postings(docsEnum);
docsEnum.nextDoc();
return docsEnum.freq();
}
private float computeScore(long docFreq, int freq, long numDocs) {
return freq * similarity.idf(docFreq, numDocs);
}
private static class ScoreTermsQueue extends org.apache.lucene.util.PriorityQueue {
private final int limit;
ScoreTermsQueue(int maxSize) {
super(maxSize);
this.limit = maxSize;
}
@Override
protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
return a.score < b.score;
}
public void addOrUpdate(ScoreTerm scoreTerm) {
if (this.size() < limit) {
// there is still space in the queue
this.add(scoreTerm);
} else {
// otherwise update the smallest in the queue in place and update the queue
ScoreTerm scoreTermTop = this.top();
if (scoreTermTop.score < scoreTerm.score) {
scoreTermTop.update(scoreTerm.field, scoreTerm.word, scoreTerm.score);
this.updateTop();
}
}
}
}
}