org.lumongo.server.index.analysis.TermFreq Maven / Gradle / Ivy
The newest version!
package org.lumongo.server.index.analysis;
import com.google.common.collect.Ordering;
import org.lumongo.cluster.message.Lumongo;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
/**
* Created by mdavis on 6/28/16.
*/
public class TermFreq {
private final DocFreq docFreq;
private final HashMap tokenCount;
private List terms;
public TermFreq(DocFreq docFreq) {
this.docFreq = docFreq;
this.tokenCount = new HashMap<>();
}
public void addTerm(String term) throws IOException {
Lumongo.Term.Builder lmTerm = tokenCount.get(term);
if (lmTerm == null) {
lmTerm = Lumongo.Term.newBuilder().setTermFreq(0).setValue(term);
tokenCount.put(term, lmTerm);
if (docFreq != null) {
int docFreq = this.docFreq.getDocFreq(term);
lmTerm.setDocFreq(docFreq);
}
}
lmTerm.setTermFreq(lmTerm.getTermFreq() + 1);
}
public void addTerm(Lumongo.Term.Builder term) {
Lumongo.Term.Builder lmTerm = tokenCount.get(term.getValue());
if (lmTerm == null) {
lmTerm = Lumongo.Term.newBuilder(term.buildPartial());
tokenCount.put(term.getValue(), lmTerm);
}
else {
lmTerm.setTermFreq(lmTerm.getTermFreq() + term.getTermFreq());
}
}
public List getTopTerms(int topN, Lumongo.AnalysisRequest.TermSort termSort) {
if (terms == null) {
terms = new ArrayList<>(tokenCount.values());
}
if (Lumongo.AnalysisRequest.TermSort.TFIDF.equals(termSort)) {
if (docFreq != null) {
for (Lumongo.Term.Builder term : terms) {
double score = docFreq.getScoreForTerm(term.getTermFreq(), term.getDocFreq());
term.setScore(score);
}
}
}
return getTopTerms(terms, topN, termSort);
}
public static List getTopTerms(List terms, int n, Lumongo.AnalysisRequest.TermSort termSort) {
Comparator ordering = (Lumongo.Term.Builder o1, Lumongo.Term.Builder o2) -> {
if (Lumongo.AnalysisRequest.TermSort.TF.equals(termSort)) {
return Long.compare(o1.getTermFreq(), o2.getTermFreq());
}
else if (Lumongo.AnalysisRequest.TermSort.TFIDF.equals(termSort)) {
return Double.compare(o1.getScore(), o2.getScore());
}
else if (Lumongo.AnalysisRequest.TermSort.ABC.equals(termSort)) {
return o2.getValue().compareTo(o1.getValue());
}
else {
return 0;
}
};
if (n != 0) {
//seems to be the most efficient according to
//http://www.michaelpollmeier.com/selecting-top-k-items-from-a-list-efficiently-in-java-groovy/
return Ordering.from(ordering).greatestOf(terms, n);
}
else {
Collections.sort(terms, ordering.reversed());
return terms;
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy