// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// io.github.repir.Repository.TermID Maven / Gradle / Ivy
//
// The newest version!
package io.github.repir.Repository;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import io.github.repir.Repository.TermID.File;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.Datafile.STATUS;
import io.github.repir.tools.io.struct.StructuredFileSortHash;
import io.github.repir.tools.io.struct.StructuredFileSortHashRecord;
import io.github.repir.tools.io.struct.StructuredFileSortRecord;
import io.github.repir.tools.lib.ArrayTools;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.PrintTools;

/**
 * Fetches the internal term id for a term string. To improve lookup speed, the
 * most common terms are kept in memory, while less common terms remain on disk.
 * 

* Before requesting the internal term id, the text should be processed by the * same {@link Extractor} process as used for indexing. * {@link #get(java.lang.String)} is used to obtain the term id of a single * term, while {@link #getContent(Extractor.EntityAttribute)} is used to obtain * an array of term id's to represent a multi term text. *

* @author jeroen */ public class TermID extends VocabularyToID { public static Log log = new Log(TermID.class); public HashMap cache = new HashMap(); private TermID(Repository repository) { super(repository); readCache(); } public static TermID get(Repository repository) { String label = canonicalName(TermID.class); TermID termid = (TermID)repository.getStoredFeature(label); if (termid == null) { termid = new TermID(repository); repository.storeFeature(label, termid); } return termid; } public void readCache() { ArrayList termids = repository.configuredIntList("repository.cachedtermids"); if (termids.size() > 0) { String termstrings[] = repository.configuredStrings("repository.cachedtermstring"); for (int i = 0; i < termids.size() && i < termstrings.length; i++) { if (termstrings[i].length() > 0) { int termid = termids.get(i); cache.put(termstrings[i], termid); } } } } @Override public void writeCache() { ArrayList termids = repository.configuredIntList("repository.cachedtermids"); for (Integer s : cache.values()) if (!termids.contains(s)) termids.add(s); if (termids.size() > 0) { ArrayList termstrings = new ArrayList(); NEXT: for (Integer i : termids) { for (Map.Entry entry : cache.entrySet()) { if (entry.getValue().equals(i)) { termstrings.add(entry.getKey()); continue NEXT; } } termstrings.add(""); } repository.getConf().setIntList("repository.cachedtermids", termids); repository.getConf().setStringList("repository.cachedtermstring", termstrings); } } @Override public int get(String term) { Integer tid = cache.get(term); if (tid != null) return tid; if (getFile().getDatafile().status != STATUS.READ) { openRead(); } int termid = io.github.repir.tools.lib.Const.NULLINT; Record termrecord = new Record(file); termrecord.term = term; Record termfound = (Record) termrecord.find(); if (termfound != null) { termid = termfound.id; } else { log.info("Term not found %s repo %s", term, repository.getTestsetName()); log.info("TermID file %s", 
file.getDatafile().getCanonicalPath()); //log.crash(); } cache.put(term, termid); return termid; } @Override public void openWrite() { getFile().setBufferSize(100000); file.setTableSize(repository.getVocabularySize()); file.openWrite(); } public void write(int id, String term) { Record termrecord = new Record(file); termrecord.id = id; termrecord.term = term; termrecord.write(); } @Override public boolean exists(String term) { return get(term) >= 0; } @Override public File createFile(Datafile datafile) { return new File(datafile, repository.getVocabularySize()); } @Override public void reduceInput(int id, String term, long cf, long df) { write(id, term); } @Override public void startReduce(long corpustermfreq, int corpusdocumentfrequency) { this.openWrite(); } @Override public void finishReduce() { closeWrite(); } public class File extends StructuredFileSortHash { public String0Field term = this.addString0("term"); public IntField id = this.addInt("id"); public File(Datafile df, int tablesize) { super(df, tablesize); } @Override protected int spillThreshold() { return 1000000; } public File clone() { return new File( new Datafile(getDatafile()), getTableSize() ); } @Override public StructuredFileSortRecord createRecord() { Record r = new Record(this); r.offsetread = this.recordoffset; r.id = id.value; r.term = term.value; return r; } } public class Record extends StructuredFileSortHashRecord { public String term; public int id; public long offsetread; public Record(File file) { super(file); } public int hashCode() { return term.hashCode(); } public String toString() { return PrintTools.sprintf("hash %d bucket %d id %d term %s offsetread %d", hashCode(), getBucketIndex(), id, term, offsetread); } @Override protected void writeRecordData() { //log.printf("writeRecordData() cap %d bucket %d id %d term %s offr %d offw %d", file.getBucketCapacity(), this.getBucketIndex(), id, term, offsetread, file.getOffset()); ((File) file).term.write(term); ((File) 
file).id.write(id); } @Override public boolean equals(Object r) { if (r instanceof Record) return term.equals(((Record) r).term); return false; } } }





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy