
/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.ltr;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Holds per-field index statistics and term-vector information for a single document, for use by
 * learning-to-rank feature extractors. Collection-level statistics (document frequencies,
 * collection frequencies, postings) are cached across documents, while {@link #updateDoc(int)}
 * rebuilds the per-document term frequency and position structures.
 */
public class DocumentFieldContext {
    private IndexReader reader;
    private IndexSearcher searcher;
    private String fieldName;
    public long totalTermFreq;
    public long numDocs;

    public long docSize;
    public long termCount;
    public Map<String, Long> termFreqs;
    public Map<String, List<Integer>> termPositions;
    public List<Pair<Integer, String>> positionTerm;
    private Map<String, Integer> docFreqs;
    private Map<String, Long> collectionFreqs;
    private Map<String, Map<Integer, List<Integer>>> postings;
    private Map<Pair<String, String>, Integer> bigramCollectionFreqs;

    public List<Float> mean_score;
    public List<Float> min_score;
    public List<Float> max_score;
    public List<Float> hmean_score;
    public List<Float> var_score;
    public List<Float> quartile_score;

    public DocumentFieldContext(IndexReader reader, IndexSearcher searcher, String fieldName){
        this.reader = reader;
        this.searcher = searcher;
        this.fieldName = fieldName;
        try {
            numDocs = reader.getDocCount(fieldName);
            totalTermFreq = reader.getSumTotalTermFreq(fieldName);
        } catch (IOException e) {
            // Field statistics could not be read from the index; fall back to zeros.
            numDocs = 0;
            totalTermFreq = 0;
        }
        docFreqs = new HashMap<>();
        collectionFreqs = new HashMap<>();
        postings = new HashMap<>();
        bigramCollectionFreqs = new HashMap<>();

        mean_score = new ArrayList<>();
        min_score = new ArrayList<>();
        max_score = new ArrayList<>();
        hmean_score = new ArrayList<>();
        var_score = new ArrayList<>();
        quartile_score = new ArrayList<>();

    }

    public Integer getDocFreq(String queryToken) {
        try {
            if (!docFreqs.containsKey(queryToken))
                docFreqs.put(queryToken, reader.docFreq(new Term(this.fieldName, queryToken)));
            return docFreqs.get(queryToken);
        } catch (IOException e) {
            // Document frequency could not be read; treat the term as unseen.
            return 0;
        }
    }
    
    public Long getCollectionFreq(String queryToken) {
        try {
            if (!collectionFreqs.containsKey(queryToken))
                collectionFreqs.put(queryToken, reader.totalTermFreq(new Term(this.fieldName, queryToken)));
            return collectionFreqs.get(queryToken);
        } catch (IOException e) {
            // Collection frequency could not be read; treat the term as unseen.
            return 0L;
        }
    }
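
    /*
     * Illustrative example (hypothetical counts, not part of the original class): if "deep"
     * appears twice in one document and once in another, getDocFreq("deep") returns 2 (number of
     * documents containing the term) while getCollectionFreq("deep") returns 3 (total occurrences
     * across the collection).
     */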

    /**
     * Loads the term vector of the given document (addressed by its internal Lucene id) and
     * rebuilds the per-document term frequency and position structures.
     */
    public void updateDoc(int internalId){
        try {
            Terms termVector = reader.getTermVector(internalId, fieldName);
            if (termVector == null) throw new IOException("empty field");
            docSize = termVector.getSumTotalTermFreq();
            termCount = termVector.size();

            termFreqs = new HashMap<>();
            termPositions = new HashMap<>();
            positionTerm = new ArrayList<>();

            TermsEnum termIter = termVector.iterator();
            PostingsEnum positionIter = null;
            while (termIter.next() != null) {
                String termString = termIter.term().utf8ToString();
                long termFreq = termIter.totalTermFreq();
                List<Integer> positions = new ArrayList<>();

                positionIter = termIter.postings(positionIter, PostingsEnum.POSITIONS);
                positionIter.nextDoc();
                for (int i = 0; i < termFreq; i++) {
                    int position = positionIter.nextPosition();
                    positions.add(position);
                    positionTerm.add(Pair.of(position, termString));
                }
                Collections.sort(positions);
                termPositions.put(termString, positions);
                termFreqs.put(termString, termFreq);
            }
            // Keep the flattened (position, term) list ordered by position.
            positionTerm.sort(new Comparator<Pair<Integer, String>>() {
                @Override
                public int compare(Pair<Integer, String> p1, Pair<Integer, String> p2) {
                    return Integer.compare(p1.getLeft(), p2.getLeft());
                }
            });
        } catch (IOException e) {
            // No term vector is available for this document; fall back to empty structures.
            docSize = 0;
            termCount = 0;

            termFreqs = new HashMap<>();
            termPositions = new HashMap<>();
            positionTerm = new ArrayList<>();
        }
    }
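
    /*
     * Illustrative example (hypothetical document, not part of the original class): after calling
     * updateDoc on a document whose field contains the tokens "deep learning for deep retrieval",
     * the structures would look roughly like:
     *   docSize       = 5
     *   termCount     = 4
     *   termFreqs     = {deep=2, learning=1, for=1, retrieval=1}
     *   termPositions = {deep=[0, 3], learning=[1], for=[2], retrieval=[4]}
     *   positionTerm  = [(0,deep), (1,learning), (2,for), (3,deep), (4,retrieval)]
     * Exact tokens and positions depend on the analyzer used at indexing time.
     */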

    public Long getTermFreq(String queryToken) {
        return termFreqs.getOrDefault(queryToken, 0L);
    }

    /**
     * Counts ordered co-occurrences of {@code first} followed by {@code second} in the current
     * document, where the second term appears at most {@code gap} positions after the first.
     */
    public int countBigram(String first, String second, int gap) {
        List<Integer> firstPositions = termPositions.get(first);
        List<Integer> secondPositions = termPositions.get(second);
        int count = 0;
        if (firstPositions != null && secondPositions != null) {
            for (int i : firstPositions) {
                for (int j : secondPositions) {
                    if (i < j && j <= i + gap) {
                        count++;
                    }
                }
            }
        }
        return count;
    }
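
    /*
     * Illustrative example (hypothetical positions): with termPositions = {deep=[0, 3], learning=[1]},
     * countBigram("deep", "learning", 1) matches the pair (0, 1) since 0 < 1 <= 0 + 1, finds no
     * match for position 3, and returns 1. getBigramCollectionFreqs below applies the same window
     * test, but over every document in the index that contains both terms, and caches the result.
     */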

    public int getBigramCollectionFreqs(String first, String second, int gap){
        Pair<String, String> key = Pair.of(first, second);
        if (bigramCollectionFreqs.containsKey(key)) {
            return bigramCollectionFreqs.get(key);
        } else {
            int cf = 0;
            Map<Integer, List<Integer>> firstPostings, secondPostings;
            firstPostings = getPostings(first);
            secondPostings = getPostings(second);

            // Copy the key set: calling retainAll on the keySet view would otherwise remove
            // entries from the cached postings map itself.
            Set<Integer> needCheck = new HashSet<>(firstPostings.keySet());
            needCheck.retainAll(secondPostings.keySet());

            for (int docId : needCheck) {
                List<Integer> firstPositions = firstPostings.get(docId);
                List<Integer> secondPositions = secondPostings.get(docId);
                for (int i : firstPositions) {
                    for (int j : secondPositions) {
                        if (i < j && j <= i + gap) {
                            cf++;
                        }
                    }
                }
            }
            bigramCollectionFreqs.put(key, cf);
            return cf;
        }
    }

    /**
     * Returns, for every document containing {@code term} in this field, the list of positions at
     * which the term occurs. Results are cached per term.
     */
    public Map<Integer, List<Integer>> getPostings(String term) {
        if (postings.containsKey(term)) {
            return postings.get(term);
        } else {
            Map<Integer, List<Integer>> posting = new HashMap<>();
            try {
                Term t = new Term(fieldName, term);
                PostingsEnum postingsEnum = MultiTerms.getTermPostingsEnum(reader, fieldName, t.bytes(), PostingsEnum.POSITIONS);
                if (postingsEnum != null) {
                    int docId;
                    while ((docId = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                        List<Integer> positions = new ArrayList<>();
                        int freq = postingsEnum.freq();
                        for (int i = 0; i < freq; i++) {
                            positions.add(postingsEnum.nextPosition());
                        }
                        posting.put(docId, positions);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            postings.put(term, posting);
            return posting;
        }
    }
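
    /*
     * Illustrative example (hypothetical postings): if "deep" occurs at positions 0 and 3 in
     * document 7 and at position 2 in document 12, getPostings("deep") returns {7=[0, 3], 12=[2]}.
     */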

    /**
     * Returns the internal Lucene ids of all documents that have doc values for this field.
     */
    public List<Integer> getAllDocID() {
        Query q = new DocValuesFieldExistsQuery(fieldName);
        List<Integer> docIds = new ArrayList<>();
        try {
            ScoreDoc[] scoreDocs = searcher.search(q, reader.maxDoc()).scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                docIds.add(scoreDoc.doc);
            }
        } catch (IOException e) {
            // Search failed; return whatever has been collected (possibly nothing).
        }
        return docIds;
    }

    /**
     * Computes summary statistics (min, max, mean, variance) of field lengths over the given documents.
     */
    private void buildFieldStat(List<Integer> docids){
        List<Long> fieldDocLength = new ArrayList<>();
        List<Long> fieldTermCount = new ArrayList<>();
        for (int i : docids) {
            try {
                Terms terms = reader.getTermVector(i, fieldName);
                if (terms == null) continue;  // document has no term vector for this field
                fieldDocLength.add(terms.getSumTotalTermFreq());
                fieldTermCount.add(terms.size());
            } catch (IOException e) {
                // Skip documents whose term vectors cannot be read.
            }
        }
        if (fieldDocLength.isEmpty()) return;
        long sum = 0;
        long squareSum = 0;
        long min = Long.MAX_VALUE;
        long max = 0;
        for (long v : fieldDocLength) {
            sum += v;
            squareSum += v * v;
            if (v > max) max = v;
            if (v < min) min = v;
        }
        double avg = (double) sum / fieldDocLength.size();
        double var = (double) squareSum / fieldDocLength.size() - avg * avg;
    }



}
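
/*
 * Usage sketch (hypothetical, for illustration only): the field name "contents", the index path,
 * and the way the reader/searcher are opened below are assumptions, not part of this class.
 *
 *   IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));
 *   IndexSearcher searcher = new IndexSearcher(reader);
 *   DocumentFieldContext context = new DocumentFieldContext(reader, searcher, "contents");
 *
 *   // Collection-level statistics for a query token.
 *   int df = context.getDocFreq("neural");
 *   long cf = context.getCollectionFreq("neural");
 *
 *   // Per-document statistics: load the term vector of one retrieved document first.
 *   context.updateDoc(internalLuceneDocId);
 *   long tf = context.getTermFreq("neural");
 *   int pairs = context.countBigram("neural", "network", 1);
 *
 * The field must be indexed with term vectors and positions; otherwise updateDoc falls back to
 * empty per-document statistics.
 */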



