io.github.repir.Repository.VocMem4 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of RepIR Show documentation
The newest version!
package io.github.repir.Repository;

import io.github.repir.tools.io.buffer.BufferReaderWriter;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.struct.StructuredFileSort;
import io.github.repir.tools.io.struct.StructuredFileCollision;
import io.github.repir.tools.lib.Log;
import io.github.repir.Repository.VocMem4.File;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.lib.ByteTools;
import io.github.repir.tools.io.struct.StructuredFileCollisionRecord;
import io.github.repir.tools.io.struct.StructuredFileSortRecord;

/**
 * Fetches the internal term id for a term string. To improve lookup speed, the
 * most common terms are kept in memory, while less common terms remain on disk.
 * 
 * Before requesting the internal term id, the text should be processed by the
 * same {@link Extractor} process as used for indexing.
 * {@link #get(java.lang.String)} is used to obtain the term id of a single
 * term, while {@link #getContent(Extractor.EntityAttribute)} is used to obtain
 * an array of term id's to represent a multi term text.
 * 

 * This implementation is limited to vocabularies of max 2^32.
 * @author jeroen
 */
public class VocMem4 extends VocabularyToIDRAM {

   public static Log log = new Log(VocMem4.class);

   public VocMem4(Repository repository) {
      super(repository);
   }

   public static VocMem4 get(Repository repository) {
       String label = canonicalName(VocMem4.class);
       VocMem4 termid = (VocMem4)repository.getStoredFeature(label);
       if (termid == null) {
          termid = new VocMem4(repository);
          repository.storeFeature(label, termid);
       }
       return termid;
   }
   
   public void openRead() {
      super.openRead();
   }
   
   @Override
   public void openWrite() {
      getFile().setTableSize(repository.getVocabularySize());
      file.setBufferSize(1000000);
      file.openWrite();
   }

   public int get(String term) {
      Record record = createRecord();
      record.term = term;
      Integer termid = io.github.repir.tools.lib.Const.NULLINT;
      Record found = (Record) file.find(record);
      if (found != null) {
         termid = found.id;
      }
      // dont need to lookup from disk if it fits into memory
      //      if (termid < 0) {
      //         termid = termfile.value(term);
      //      }
      return termid;
   }

   @Override
   public File createFile(Datafile datafile) {
      return new File(datafile);
   }

   public Record createRecord() {
      return new Record(file);
   }
   
   @Override
   public void reduceInput(int id, String term, long cf, long df) {
      Record record = createRecord();
      record.id = id;
      record.term = term;
      record.cf = cf;
      record.write();
   }

   @Override
   public void startReduce(long corpustermfreq, int corpusdocumentfrequency) {
      openWrite();
   }

   @Override
   public void finishReduce() {
      closeWrite();
   }
   
   public static void build(Repository repository) throws EOCException {
      VocMem4 vocmem4 = VocMem4.get(repository);
      vocmem4.startReduce(0, 0);
      TermCF termtf = TermCF.get(repository);
      termtf.openRead();
      termtf.getFile().setBufferSize(10000000);
      TermString termstring = TermString.get(repository);
      termstring.getFile().openRead();
      termstring.getFile().setBufferSize(10000000);
      for (int id = 0; id < repository.getVocabularySize(); id++) {
         long cf = termtf.file.cf.read();
         String term = termstring.file.term.read();
         //log.info("%d %s %d", id, term, cf);
         vocmem4.reduceInput(id, term, cf, 0);
      }
      vocmem4.finishReduce();
   }

   public static void main(String[] args) throws EOCException {
      Repository repository = new Repository( args );
      build( repository );
   }

   /**
    * The terms are sorted in a collision table, based on hashcode of the term,
    * and secondary on cf desc. The memory table can contain the first 2^24
    * terms from the file (sorted on cf desc), if a term is missing it should be
    * looked up in the disk-stored term file.
    * 

    * This file used during indexing, to improve the speed of converting
    * tokenized content into termID's.
    * 

    * @author jeroen
    */
   public class File extends StructuredFileCollision {

      public String0Field term = this.addString0("term");
      public LongField cf = this.addLong("cf");
      public IntField id = this.addInt("id");

      public File(Datafile df) {
         super(df);
      }

    @Override
    protected int spillThreshold() {
        return 1000000;
    }

      public File clone() {
         File f = new File(new Datafile(getDatafile()));
         f.setTableSize(this.getTableSize());
         return f;
      }
      
      @Override
      public void openRead() {
         this.remove(cf);
         super.openRead();
      }

      @Override
      public void openWriteFinal() {
         this.remove(cf);
         super.openWriteFinal();
      }

      @Override
      public StructuredFileSortRecord createRecord() {
         Record record = new Record( this );
         record.id = id.value;
         record.term = term.value;
         record.cf = cf.value;
         return record;
      }

      /**
       * used to reversely sort colliding entries based on cf, to slightly
       * improve performance.
       */
      @Override
      public int secondaryCompare(StructuredFileSort o1, StructuredFileSort o2) {
         return ((File) o1).cf.value > ((File) o2).cf.value ? -1 : 1;
      }

      @Override
      public int secondaryCompare(StructuredFileSortRecord o1, StructuredFileSortRecord o2) {
         return ((Record) o1).cf > ((Record) o2).cf ? -1 : 1;
      }

      /**
       * this function is called by the internal {@link #find(Content.StructuredFileCollision.SortableCollisionRecord)
       * }
       * method, which seeks the offset at which the bucketIndex is position.
       * Because this is a collision table, if a matching entry exists, it is
       * always placed after this offset, as close to the offset as allowed, but
       * there can be other entries in between. An implementing function should
       * therefore readValue until the entry is found or a hashcode is
       * encountered that is greater than the search key's hashcode. In the
       * latter case, the entry does not exist and null should be returned.
       * 
       * @param table the resident table to search, with the offset pointing to
       * the
       * @param r the entry containing the term string to search for
       * @return a matching entry, with its id, or null if it doesn't exist.
       */
      @Override
      public StructuredFileCollisionRecord find(BufferReaderWriter table, StructuredFileCollisionRecord r) {
         Record rr = (Record) r;
         byte needle[] = rr.term.getBytes();
         int match;
         int offset = table.bufferpos;
         while (table.bufferpos < table.end) {
            try {
               for (match = 0; match < needle.length && table.buffer[table.bufferpos + match] == needle[match]; match++);
               if (match == needle.length && table.buffer[table.bufferpos + match] == 0) {
                  table.skipString0();
                  rr.id = table.readInt();
                  return rr;
               }
               int bucketindex = ByteTools.string0HashCode(table.buffer, table.bufferpos, table.end) & (this.getBucketCapacity() - 1);
               if (bucketindex > rr.getBucketIndex()) {
                  break;
               }
               table.skipString0();
               table.skip(4);
            } catch (EOCException ex) {
               log.exception(ex, "find( %s, %s )", table, r);
            }
         }
         return null;
      }

   }
   
   public class Record extends StructuredFileCollisionRecord {

         public String term;
         public long cf;
         public int id;

         public Record( File file ) {
            super( file );
         }

         public int hashCode() {
            return term.hashCode();
         }
         
         @Override
         protected void writeRecordData() {
            ((File) file).term.write(term);
            ((File) file).id.write(id);
         }

         @Override
         protected void writeTempRecordData() {
            ((File) file).term.write(term);
            ((File) file).cf.write(cf);
            ((File) file).id.write(id);
         }

         @Override
         public boolean equals(StructuredFileCollisionRecord r) {
            return term.equals(((Record) r).term);
         }
      }

}