All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.github.repir.Repository.VocMem4 Maven / Gradle / Ivy

The newest version!
package io.github.repir.Repository;

import io.github.repir.tools.io.buffer.BufferReaderWriter;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.struct.StructuredFileSort;
import io.github.repir.tools.io.struct.StructuredFileCollision;
import io.github.repir.tools.lib.Log;
import io.github.repir.Repository.VocMem4.File;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.lib.ByteTools;
import io.github.repir.tools.io.struct.StructuredFileCollisionRecord;
import io.github.repir.tools.io.struct.StructuredFileSortRecord;

/**
 * Fetches the internal term id for a term string. To improve lookup speed, the
 * most common terms are kept in memory, while less common terms remain on disk.
 * 

* Before requesting the internal term id, the text should be processed by the * same {@link Extractor} process as used for indexing. * {@link #get(java.lang.String)} is used to obtain the term id of a single * term, while {@link #getContent(Extractor.EntityAttribute)} is used to obtain * an array of term id's to represent a multi term text. *

* This implementation is limited to vocabularies of max 2^32. * @author jeroen */ public class VocMem4 extends VocabularyToIDRAM { public static Log log = new Log(VocMem4.class); public VocMem4(Repository repository) { super(repository); } public static VocMem4 get(Repository repository) { String label = canonicalName(VocMem4.class); VocMem4 termid = (VocMem4)repository.getStoredFeature(label); if (termid == null) { termid = new VocMem4(repository); repository.storeFeature(label, termid); } return termid; } public void openRead() { super.openRead(); } @Override public void openWrite() { getFile().setTableSize(repository.getVocabularySize()); file.setBufferSize(1000000); file.openWrite(); } public int get(String term) { Record record = createRecord(); record.term = term; Integer termid = io.github.repir.tools.lib.Const.NULLINT; Record found = (Record) file.find(record); if (found != null) { termid = found.id; } // dont need to lookup from disk if it fits into memory // if (termid < 0) { // termid = termfile.value(term); // } return termid; } @Override public File createFile(Datafile datafile) { return new File(datafile); } public Record createRecord() { return new Record(file); } @Override public void reduceInput(int id, String term, long cf, long df) { Record record = createRecord(); record.id = id; record.term = term; record.cf = cf; record.write(); } @Override public void startReduce(long corpustermfreq, int corpusdocumentfrequency) { openWrite(); } @Override public void finishReduce() { closeWrite(); } public static void build(Repository repository) throws EOCException { VocMem4 vocmem4 = VocMem4.get(repository); vocmem4.startReduce(0, 0); TermCF termtf = TermCF.get(repository); termtf.openRead(); termtf.getFile().setBufferSize(10000000); TermString termstring = TermString.get(repository); termstring.getFile().openRead(); termstring.getFile().setBufferSize(10000000); for (int id = 0; id < repository.getVocabularySize(); id++) { long cf = termtf.file.cf.read(); String term = termstring.file.term.read(); //log.info("%d %s %d", id, term, cf); vocmem4.reduceInput(id, term, cf, 0); } vocmem4.finishReduce(); } public static void main(String[] args) throws EOCException { Repository repository = new Repository( args ); build( repository ); } /** * The terms are sorted in a collision table, based on hashcode of the term, * and secondary on cf desc. The memory table can contain the first 2^24 * terms from the file (sorted on cf desc), if a term is missing it should be * looked up in the disk-stored term file. *

* This file used during indexing, to improve the speed of converting * tokenized content into termID's. *

* @author jeroen */ public class File extends StructuredFileCollision { public String0Field term = this.addString0("term"); public LongField cf = this.addLong("cf"); public IntField id = this.addInt("id"); public File(Datafile df) { super(df); } @Override protected int spillThreshold() { return 1000000; } public File clone() { File f = new File(new Datafile(getDatafile())); f.setTableSize(this.getTableSize()); return f; } @Override public void openRead() { this.remove(cf); super.openRead(); } @Override public void openWriteFinal() { this.remove(cf); super.openWriteFinal(); } @Override public StructuredFileSortRecord createRecord() { Record record = new Record( this ); record.id = id.value; record.term = term.value; record.cf = cf.value; return record; } /** * used to reversely sort colliding entries based on cf, to slightly * improve performance. */ @Override public int secondaryCompare(StructuredFileSort o1, StructuredFileSort o2) { return ((File) o1).cf.value > ((File) o2).cf.value ? -1 : 1; } @Override public int secondaryCompare(StructuredFileSortRecord o1, StructuredFileSortRecord o2) { return ((Record) o1).cf > ((Record) o2).cf ? -1 : 1; } /** * this function is called by the internal {@link #find(Content.StructuredFileCollision.SortableCollisionRecord) * } * method, which seeks the offset at which the bucketIndex is position. * Because this is a collision table, if a matching entry exists, it is * always placed after this offset, as close to the offset as allowed, but * there can be other entries in between. An implementing function should * therefore readValue until the entry is found or a hashcode is * encountered that is greater than the search key's hashcode. In the * latter case, the entry does not exist and null should be returned. *

* @param table the resident table to search, with the offset pointing to * the * @param r the entry containing the term string to search for * @return a matching entry, with its id, or null if it doesn't exist. */ @Override public StructuredFileCollisionRecord find(BufferReaderWriter table, StructuredFileCollisionRecord r) { Record rr = (Record) r; byte needle[] = rr.term.getBytes(); int match; int offset = table.bufferpos; while (table.bufferpos < table.end) { try { for (match = 0; match < needle.length && table.buffer[table.bufferpos + match] == needle[match]; match++); if (match == needle.length && table.buffer[table.bufferpos + match] == 0) { table.skipString0(); rr.id = table.readInt(); return rr; } int bucketindex = ByteTools.string0HashCode(table.buffer, table.bufferpos, table.end) & (this.getBucketCapacity() - 1); if (bucketindex > rr.getBucketIndex()) { break; } table.skipString0(); table.skip(4); } catch (EOCException ex) { log.exception(ex, "find( %s, %s )", table, r); } } return null; } } public class Record extends StructuredFileCollisionRecord { public String term; public long cf; public int id; public Record( File file ) { super( file ); } public int hashCode() { return term.hashCode(); } @Override protected void writeRecordData() { ((File) file).term.write(term); ((File) file).id.write(id); } @Override protected void writeTempRecordData() { ((File) file).term.write(term); ((File) file).cf.write(cf); ((File) file).id.write(id); } @Override public boolean equals(StructuredFileCollisionRecord r) { return term.equals(((Record) r).term); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy