All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.github.repir.Repository.TermInverted Maven / Gradle / Ivy

The newest version!
package io.github.repir.Repository;

import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.struct.StructuredFileSequential;
import io.github.repir.tools.extract.Content;
import io.github.repir.EntityReader.MapReduce.TermEntityKey;
import io.github.repir.EntityReader.MapReduce.TermEntityValue;
import io.github.repir.Repository.TermInverted.File;
import io.github.repir.Retriever.Document;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.lib.Log;
import java.util.ArrayList;

/**
 * A stored feature that uses a term-document structure similar to a textbook inverted index. This 
 * data structure is best used for sparse data that is to be accessed by term, which gives an ordered
 * list of the documents in which the term appears. The base class can be extended to define the
 * exact data that needs to be stored, such as the term frequency or the list of positions of the
 * term in the document. 
 * @author jeroen
 * @param 
 * @param  
 */
public class TermInverted extends AutoTermDocumentFeature {

   public static Log log = new Log(TermInverted.class);
   static final int ZEROPOS[] = new int[0];
   DocLiteral collectionid;

   private TermInverted(Repository repository, String field) {
      super(repository, field);
      collectionid = repository.getCollectionIDFeature();
   }

   private TermInverted(Repository repository, String field, Term term) {
      super(repository, (String)field);
      collectionid = repository.getCollectionIDFeature();
      this.term = term;
   }

   public static TermInverted get(Repository repository, String field) {
       String label = canonicalName(TermInverted.class, field);
       TermInverted termid = (TermInverted)repository.getStoredFeature(label);
       if (termid == null) {
          termid = new TermInverted(repository, field);
          repository.storeFeature(label, termid);
       }
       return termid;
   }
   
   public static TermInverted get(Repository repository, String field, Term term) {
       String label = canonicalName(TermInverted.class, field, term.getProcessedTerm());
       TermInverted termid = (TermInverted)repository.getStoredFeature(label);
       if (termid == null) {
          termid = new TermInverted(repository, field, term);
          repository.storeFeature(label, termid);
       }
       return termid;
   }
   
   @Override
   public void setMapOutputValue(TermEntityValue value, String docname, ArrayList pos) {
      value.writer.write0(docname);
      value.writer.writeIncr(pos);
   }

   @Override
   public void reduceInput(TermEntityKey key, Iterable values) {
      for (; reducetermid < key.termid; reducetermid++) {
         file.setOffsetTupleStart(file.getOffetTupleEnd());
         file.recordEnd();
      }
      try {
         long offset = file.getOffset();
         for (TermEntityValue v : values) {
            String doc = v.reader.readString0();
            int docid = docs.get(doc);
            file.docid.write(docid);
            int pos[] = v.reader.readCIntIncr();
            file.data.write(pos);
         }
         file.setOffsetTupleStart(offset);
         file.recordEnd();
         reducetermid++;
      } catch (EOCException ex) {
         log.exception(ex, "ReduceInput");
      }
   }

   @Override
   public int[] getValue(Document doc) {
      if (doc.docid == docid) {
         return file.data.value;
      } else {
         return ZEROPOS;
      }
   }

   @Override
   protected int readNextID() {
      //log.info("readNextID() reader %s offset %d ceiling %d", file.reader, file.getOffset(), file.getCeiling());
      if (file.nextRecord()) {
         return file.docid.value;
      } else {
         file.data.value = ZEROPOS;
      }
      return -1;
   }

   @Override
   public File createFile(Datafile datafile) {
      return new File(datafile);
   }

   @Override
   public void decode(Document d, int reportid) {
      reader.setBuffer((byte[]) d.getReportedFeature(reportid));
      try {
         d.setReportedFeature( reportid, reader.readCIntArray() );
      } catch (EOCException ex) {
         log.fatalexception(ex, "decode %d %d %d", d.docid, d.partition, d.getReportedFeature(reportid));
      }
   }

   @Override
   public void encode(Document d, int reportid) {
      bdw.writeC( (int[]) d.getReportedFeature( reportid ) );
      d.setReportedFeature( reportid, bdw.getBytes() );
   }

   @Override
   public void report(Document doc, int reportid) {
      doc.setReportedFeature(reportid, getValue(doc) );
   }

   @Override
   public int[] valueReported(Document doc, int reportid) {
      return (int[]) doc.getReportedFeature(reportid);
   }

   public static class File extends StructuredFileSequential {

      public IntField docid = this.addInt("docid");
      public CIntIncrField data = this.addCIntIncr("pos");

      public File(Datafile df) {
         super(df);
      }

      @Override
      public void hookRecordWritten() {
         // record doesn't end until we say so
      }

      public void recordEnd() {
         super.hookRecordWritten();
      }
   }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy