All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.lumongo.server.index.analysis.DocFreq Maven / Gradle / Ivy

The newest version!
package org.lumongo.server.index.analysis;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;

import java.io.IOException;
import java.util.HashMap;

/**
 * Caches per-term document frequencies for a single field of an index and
 * exposes TF-IDF style scoring helpers built on {@link ClassicSimilarity}.
 *
 * <p>The document count is read once from the {@link IndexReader} in the
 * constructor and reused for every later calculation. Looked-up frequencies
 * are kept in a plain {@link HashMap}; this class adds no synchronization
 * around it.
 *
 * Created by Matt Davis on 6/28/16.
 * @author mdavis
 */
public class DocFreq {

	// Typed cache: term text -> document frequency. The type arguments are
	// required for the Integer assignment in getDocFreq to compile.
	private final HashMap<String, Integer> docFreqMap;
	private final IndexReader indexReader;
	private final String field;
	private final TFIDFSimilarity similarity;
	private final int numDocs;

	/**
	 * Creates a frequency cache bound to one field of the given reader.
	 *
	 * @param indexReader reader used for document-frequency lookups and for the
	 *                    document count captured at construction time
	 * @param field       name of the field whose terms will be looked up
	 */
	public DocFreq(IndexReader indexReader, String field) {
		this.indexReader = indexReader;
		this.field = field;
		this.docFreqMap = new HashMap<String, Integer>();
		this.similarity = new ClassicSimilarity();
		this.numDocs = indexReader.numDocs();
	}

	/**
	 * Returns the document frequency of {@code term} in the configured field.
	 * The first request for a term asks the reader; later requests for the
	 * same term are answered from the in-memory cache.
	 *
	 * @param term term text to look up
	 * @return value reported by {@link IndexReader#docFreq(Term)} for the term
	 * @throws IOException if the reader fails while looking up the term
	 */
	public int getDocFreq(String term) throws IOException {
		Integer termDocFreq = docFreqMap.get(term);
		if (termDocFreq == null) {
			// Cache miss: consult the index once and remember the answer.
			// A lambda-based computeIfAbsent is not used because docFreq
			// throws the checked IOException.
			termDocFreq = indexReader.docFreq(new Term(field, term));
			docFreqMap.put(term, termDocFreq);
		}

		return termDocFreq;
	}

	/**
	 * Computes {@code tf(termFreq) * idf(docFreq, numDocs)} using the
	 * similarity created in the constructor.
	 *
	 * @param termFreq frequency of the term being scored
	 * @param docFreq  document frequency of the term
	 * @return product of the similarity's tf and idf factors
	 */
	public double getScoreForTerm(long termFreq, long docFreq) {
		return similarity.tf(termFreq) * similarity.idf(docFreq, numDocs);
	}

	/**
	 * Scales the captured document count by {@code percent} and rounds the
	 * result to the nearest int.
	 *
	 * @param percent multiplier applied to the document count; presumably a
	 *                fraction such as {@code 0.25f} rather than {@code 25}
	 *                (confirm against callers)
	 * @return {@code Math.round(numDocs * percent)}
	 */
	public int getNumDocsForPercent(float percent) {
		return Math.round(numDocs * percent);
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy