
package io.github.repir.Repository;
import io.github.repir.tools.extract.Content;
import io.github.repir.EntityReader.MapReduce.TermEntityKey;
import io.github.repir.EntityReader.MapReduce.TermEntityValue;
import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.Repository.DocContents.File;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.Datafile.STATUS;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.io.struct.StructuredFileSortHash;
import io.github.repir.tools.io.struct.StructuredFileSortHashRecord;
import io.github.repir.tools.io.struct.StructuredFileSortRecord;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.PrintTools;
import io.github.repir.tools.lib.StrTools;
/**
 * Stores the extracted contents of documents, keyed by their collection ID.
 * During indexing, a MapReduce pass writes the token array that the
 * {@link Extractor} produced for each document to a sorted hash file on
 * disk; {@link #get(java.lang.String)} then retrieves the stored token
 * array for a single collection ID.
 *
 * @author jeroen
 */
public class DocContents extends StringLookupFeature {
public static Log log = new Log(DocContents.class);
private DocContents(Repository repository, String field, String key) {
super(repository, field, key);
}
public static DocContents get(Repository repository, String field, String key) {
String label = canonicalName(DocContents.class, field);
DocContents doccontents = (DocContents)repository.getStoredFeature(label);
if (doccontents == null) {
doccontents = new DocContents(repository, field, key);
repository.storeFeature(label, doccontents);
}
return doccontents;
}
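// Usage sketch (illustrative, not part of the original source): obtain the
// feature through the factory above, then look up a document's tokens by
// its collection ID. The field "all", the null key, and the collectionid
// variable are hypothetical placeholders.
//
//   DocContents contents = DocContents.get(repository, "all", null);
//   String[] tokens = contents.get(collectionid);
//   if (tokens != null)
//       log.info("document %s has %d tokens", collectionid, tokens.length);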
@Override
public void setMapOutputValue(TermEntityValue value, Content doc) {
ExtractChannel attr = doc.get(entityAttribute());
//log.info("mapOutput %s %s", entityAttribute(), attr);
value.writer.writeStr(attr);
}
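/**
 * Reduce side of the indexing job: each key carries a document's
 * collection ID, and the first value holds that document's extracted
 * token array, which is written to the feature file.
 */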
@Override
public void writeReduce(TermEntityKey key, Iterable<TermEntityValue> values) {
try {
TermEntityValue value = values.iterator().next();
String t[] = value.reader.readStringArray();
//log.info("reduceInput %s %s", key.collectionid, StrTools.concat(t));
write(key.collectionid, t);
} catch (EOCException ex) {
log.fatal(ex);
}
}
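/**
 * Retrieves the stored token array for the document with the given
 * collection ID, or null (after logging) when no record is found.
 */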
@Override
public String[] get(String entityname) {
if (getFile().getDatafile().status != STATUS.READ) {
openRead();
}
String contents[] = null;
Record termrecord = new Record(file);
termrecord.entityname = entityname;
Record termfound = (Record) termrecord.find();
if (termfound != null) {
contents = termfound.contents;
} else {
log.info("DocContents not found %s", entityname);
log.info("file %s", file.getDatafile().getCanonicalPath());
//log.crash();
}
return contents;
}
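/**
 * Opens the underlying file for writing with a 1MB buffer, used when the
 * feature is built during indexing.
 */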
@Override
public void openWrite() {
getFile().setBufferSize(1000000);
//file.setTableSize((int)repository.getDocumentCount());
file.openWrite();
}
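/**
 * Writes a single record containing the document's collection ID and its
 * token contents.
 */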
public void write(String entityname, String contents[]) {
Record termrecord = new Record(file);
termrecord.entityname = entityname;
termrecord.contents = contents;
termrecord.write();
}
@Override
public File createFile(Datafile datafile) {
log.info("creatFile %s %d", datafile.getCanonicalPath(), repository.getDocumentCount());
return new File(datafile, (int)repository.getDocumentCount());
}
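/**
 * On-disk structure: a sorted hash table sized to the document count,
 * with one record per document holding the collection ID (a String0Field,
 * presumably a zero-terminated string) and the contents as a string array.
 */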
public class File extends StructuredFileSortHash {
public String0Field entityname = this.addString0("entityname");
public StringArrayField contents = this.addStringArray("contents");
public File(Datafile df, int tablesize) {
super(df, tablesize);
}
@Override
protected int spillThreshold() {
return 10000;
}
@Override
public File clone() {
return new File( new Datafile(getDatafile()), getTableSize() );
}
@Override
public StructuredFileSortRecord createRecord() {
Record r = new Record(this);
r.offsetread = this.recordoffset;
r.entityname = entityname.value;
r.contents = contents.value;
return r;
}
}
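/**
 * A single record in {@link File}; records hash and compare on the entity
 * name, ignoring case.
 */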
public class Record extends StructuredFileSortHashRecord {
public String entityname;
public String contents[];
public long offsetread;
public Record(File file) {
super(file);
}
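// Lower-case the name before hashing, keeping the hash consistent with
// the case-insensitive equals() below.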
@Override
public int hashCode() {
return entityname.toLowerCase().hashCode();
}
@Override
public String toString() {
return PrintTools.sprintf("hash %d bucket %d entity %s offsetread %d", hashCode(), getBucketIndex(), entityname, offsetread);
}
@Override
protected void writeRecordData() {
//log.printf("writeRecordData() cap %d bucket %d id %d term %s offr %d offw %d", file.getBucketCapacity(), this.getBucketIndex(), id, term, offsetread, file.getOffset());
((File) file).entityname.write(entityname);
((File) file).contents.write(contents);
}
@Override
public boolean equals(Object r) {
if (r instanceof Record)
return entityname.equalsIgnoreCase(((Record) r).entityname);
return false;
}
}
}