
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.anserini.ltr;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
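
/**
 * Per-field statistics context used during learning-to-rank feature extraction.
 * It caches collection-level statistics for a single index field (document and
 * collection frequencies, term postings) and, after {@link #updateDoc(int)},
 * exposes term frequencies and positions from one document's term vector.
 */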
public class DocumentFieldContext {
  private IndexReader reader;
  private IndexSearcher searcher;
  private String fieldName;

  // Collection-level statistics for this field.
  public long totalTermFreq;
  public long numDocs;

  // Statistics for the most recently loaded document (see updateDoc).
  public long docSize;
  public long termCount;
  public Map<String, Long> termFreqs;
  public Map<String, List<Integer>> termPositions;
  public List<Pair<Integer, String>> positionTerm;

  // Caches keyed by query token (or token pair) to avoid repeated index lookups.
  private Map<String, Integer> docFreqs;
  private Map<String, Long> collectionFreqs;
  private Map<String, Map<Integer, List<Integer>>> postings;
  private Map<Pair<String, String>, Integer> bigramCollectionFreqs;
  // Per-field score statistics, populated by callers of this context.
  public List<Float> mean_score;
  public List<Float> min_score;
  public List<Float> max_score;
  public List<Float> hmean_score;
  public List<Float> var_score;
  public List<Float> quartile_score;
  public DocumentFieldContext(IndexReader reader, IndexSearcher searcher, String fieldName) {
    this.reader = reader;
    this.searcher = searcher;
    this.fieldName = fieldName;
    try {
      numDocs = reader.getDocCount(fieldName);
      totalTermFreq = reader.getSumTotalTermFreq(fieldName);
    } catch (IOException e) {
      // Field statistics are unavailable; fall back to zero counts.
      numDocs = 0;
      totalTermFreq = 0;
    }
    docFreqs = new HashMap<>();
    collectionFreqs = new HashMap<>();
    postings = new HashMap<>();
    bigramCollectionFreqs = new HashMap<>();
    mean_score = new ArrayList<>();
    min_score = new ArrayList<>();
    max_score = new ArrayList<>();
    hmean_score = new ArrayList<>();
    var_score = new ArrayList<>();
    quartile_score = new ArrayList<>();
  }
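
  // A minimal usage sketch for this class (illustrative only: the index path,
  // field name, query terms, and doc id below are assumptions, not values fixed by Anserini):
  //
  //   IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("path/to/index")));
  //   IndexSearcher searcher = new IndexSearcher(reader);
  //   DocumentFieldContext context = new DocumentFieldContext(reader, searcher, "contents");
  //   context.updateDoc(internalDocId);                       // load one document's term vector
  //   long tf = context.getTermFreq("neural");                // term frequency in that document
  //   int df = context.getDocFreq("neural");                  // document frequency in the collection
  //   int bi = context.countBigram("neural", "network", 1);   // adjacent ordered pair count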
  public Integer getDocFreq(String queryToken) {
    try {
      if (!docFreqs.containsKey(queryToken))
        docFreqs.put(queryToken, reader.docFreq(new Term(this.fieldName, queryToken)));
      return docFreqs.get(queryToken);
    } catch (IOException e) {
      // Treat lookup failures as a document frequency of zero.
      return 0;
    }
  }
  public Long getCollectionFreq(String queryToken) {
    try {
      if (!collectionFreqs.containsKey(queryToken))
        collectionFreqs.put(queryToken, reader.totalTermFreq(new Term(this.fieldName, queryToken)));
      return collectionFreqs.get(queryToken);
    } catch (IOException e) {
      // Treat lookup failures as a collection frequency of zero.
      return 0L;
    }
  }
  public void updateDoc(int internalId) {
    try {
      Terms termVector = reader.getTermVector(internalId, fieldName);
      if (termVector == null) throw new IOException("empty field");
      docSize = termVector.getSumTotalTermFreq();
      termCount = termVector.size();
      termFreqs = new HashMap<>();
      termPositions = new HashMap<>();
      positionTerm = new ArrayList<>();
      TermsEnum termIter = termVector.iterator();
      PostingsEnum positionIter = null;
      while (termIter.next() != null) {
        String termString = termIter.term().utf8ToString();
        long termFreq = termIter.totalTermFreq();
        List<Integer> positions = new ArrayList<>();
        positionIter = termIter.postings(positionIter, PostingsEnum.POSITIONS);
        positionIter.nextDoc();
        for (int i = 0; i < termFreq; i++) {
          int position = positionIter.nextPosition();
          positions.add(position);
          positionTerm.add(Pair.of(position, termString));
        }
        Collections.sort(positions);
        termPositions.put(termString, positions);
        termFreqs.put(termString, termFreq);
      }
      // Order the (position, term) pairs by position within the document.
      positionTerm.sort(new Comparator<Pair<Integer, String>>() {
        @Override
        public int compare(Pair<Integer, String> p1, Pair<Integer, String> p2) {
          return Integer.compare(p1.getLeft(), p2.getLeft());
        }
      });
    } catch (IOException e) {
      // No term vector is stored for this document and field; reset to an empty state.
      docSize = 0;
      termCount = 0;
      termFreqs = new HashMap<>();
      termPositions = new HashMap<>();
      positionTerm = new ArrayList<>();
    }
  }
  public Long getTermFreq(String queryToken) {
    return termFreqs.getOrDefault(queryToken, 0L);
  }
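
  // Worked example for countBigram below: with first at positions [3, 10],
  // second at positions [4, 12], and gap = 2, the pairs (3, 4) and (10, 12)
  // both satisfy i < j <= i + gap, so the method returns 2.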
  public int countBigram(String first, String second, int gap) {
    List<Integer> firstPositions = termPositions.get(first);
    List<Integer> secondPositions = termPositions.get(second);
    int count = 0;
    if (firstPositions != null && secondPositions != null) {
      for (int i : firstPositions) {
        for (int j : secondPositions) {
          // Count ordered co-occurrences where second follows first within the gap.
          if (i < j && j <= i + gap) {
            count++;
          }
        }
      }
    }
    return count;
  }
  public int getBigramCollectionFreqs(String first, String second, int gap) {
    Pair<String, String> key = Pair.of(first, second);
    if (bigramCollectionFreqs.containsKey(key)) {
      return bigramCollectionFreqs.get(key);
    } else {
      int cf = 0;
      Map<Integer, List<Integer>> firstPostings, secondPostings;
      firstPostings = getPostings(first);
      secondPostings = getPostings(second);
      // Copy the key set so that retainAll does not mutate the cached postings map.
      Set<Integer> needCheck = new HashSet<>(firstPostings.keySet());
      needCheck.retainAll(secondPostings.keySet());
      for (int docId : needCheck) {
        List<Integer> firstPositions = firstPostings.get(docId);
        List<Integer> secondPositions = secondPostings.get(docId);
        for (int i : firstPositions) {
          for (int j : secondPositions) {
            if (i < j && j <= i + gap) {
              cf++;
            }
          }
        }
      }
      bigramCollectionFreqs.put(key, cf);
      return cf;
    }
  }
  public Map<Integer, List<Integer>> getPostings(String term) {
    if (postings.containsKey(term)) {
      return postings.get(term);
    } else {
      // Build and cache a docId -> positions map for this term across the whole index.
      Map<Integer, List<Integer>> posting = new HashMap<>();
      try {
        Term t = new Term(fieldName, term);
        PostingsEnum postingsEnum = MultiTerms.getTermPostingsEnum(reader, fieldName, t.bytes(), PostingsEnum.POSITIONS);
        if (postingsEnum != null) {
          int docId;
          while ((docId = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            List<Integer> positions = new ArrayList<>();
            int freq = postingsEnum.freq();
            for (int i = 0; i < freq; i++) {
              positions.add(postingsEnum.nextPosition());
            }
            posting.put(docId, positions);
          }
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
      postings.put(term, posting);
      return posting;
    }
  }
  public List<Integer> getAllDocID() {
    // Collect the internal ids of all documents that have doc values for this field.
    Query q = new DocValuesFieldExistsQuery(fieldName);
    List<Integer> docIds = new ArrayList<>();
    try {
      ScoreDoc[] scoreDocs = searcher.search(q, reader.maxDoc()).scoreDocs;
      for (ScoreDoc scoreDoc : scoreDocs) {
        docIds.add(scoreDoc.doc);
      }
    } catch (IOException e) {
      // Search failed; return whatever ids were collected (possibly none).
    }
    return docIds;
  }
  private void buildFieldStat(List<Integer> docids) {
    List<Long> fieldDocLength = new ArrayList<>();
    List<Long> fieldTermCount = new ArrayList<>();
    for (int i : docids) {
      try {
        Terms terms = reader.getTermVector(i, fieldName);
        fieldDocLength.add(terms.getSumTotalTermFreq());
        fieldTermCount.add(terms.size());
      } catch (IOException e) {
        // Skip documents whose term vectors cannot be read.
      }
    }
    long sum = 0;
    long squareSum = 0;
    long min = Long.MAX_VALUE;
    long max = 0;
    for (long v : fieldDocLength) {
      sum += v;
      squareSum += v * v;
      if (v > max) max = v;
      if (v < min) min = v;
    }
    // Summary statistics over document lengths for this field (computed locally, not yet stored).
    double avg = (double) sum / fieldDocLength.size();
    double var = (double) squareSum / fieldDocLength.size() - avg * avg;
  }
}