
package io.github.repir.Repository;

import io.github.repir.Retriever.Document;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.struct.StructuredFileSequential;
import io.github.repir.EntityReader.MapReduce.TermEntityKey;
import io.github.repir.EntityReader.MapReduce.TermEntityValue;
import io.github.repir.tools.extract.Content;
import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.tools.lib.Log;
import io.github.repir.Repository.DocForward.File;
import io.github.repir.tools.io.EOCException;
import java.io.IOException;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * A forward index that stores all tokens contained in each document in the
 * collection. Per document, the tokens are stored as a simple array of ints,
 * each int being a TermID, in the order in which the tokens appear in the
 * original document.
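 * <p>
 * A minimal usage sketch, inferred from the methods below (the field name
 * "all" and the configured {@code repository} are hypothetical; {@code next()}
 * is the inherited read cursor used in {@link #readResident}):
 * <pre>{@code
 * DocForward forward = DocForward.get(repository, "all");
 * forward.setOffset(docid);               // seek to this document's record and open for reading
 * if (forward.next()) {                   // advance the cursor to that record
 *     int[] termIds = forward.getValue(); // the document's TermIDs in original order
 * }
 * }</pre>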
 * @see EntityStoredFeature
 * @author jer
 */
public class DocForward extends EntityStoredFeature<File, int[]> implements ReduciblePartitionedFeature {

    public static Log log = new Log(DocForward.class);

    private DocForward(Repository repository, String field) {
        super(repository, field);
    }
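
    /**
     * Factory method: returns the DocForward feature for the given field,
     * reusing the instance already registered with the repository, or creating
     * and registering a new one.
     */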
    public static DocForward get(Repository repository, String field) {
        String label = canonicalName(DocForward.class, field);
        DocForward docforward = (DocForward) repository.getStoredFeature(label);
        if (docforward == null) {
            docforward = new DocForward(repository, field);
            repository.storeFeature(label, docforward);
        }
        return docforward;
    }
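
    /**
     * Map side of the MapReduce feature build: fetches the extracted channel
     * for this feature's field, tokenizes it into TermIDs if that has not
     * happened yet, and writes the token array to the map output value.
     */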
    @Override
    public void setMapOutputValue(TermEntityValue value, Content doc) {
        ExtractChannel attr = doc.get(entityAttribute());
        if (attr.tokenized == null) {
            attr.tokenized = repository.tokenize(attr);
        }
        value.writer.writeC(attr.tokenized);
    }
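
    /**
     * Reduce side: reads the TermID array back from the first (and only
     * expected) value for this key and appends it to the feature file.
     */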
    @Override
    public void writeReduce(TermEntityKey key, Iterable<TermEntityValue> values) {
        try {
            TermEntityValue value = values.iterator().next();
            int t[] = value.reader.readCIntArray();
            write(t);
        } catch (EOCException ex) {
            log.fatal(ex);
        }
    }
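
    /**
     * Serializes the document's reported token array into a compact byte
     * array, so the reported feature can be transferred; {@link #decode}
     * reverses this.
     */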
    @Override
    public void encode(Document d, int reportid) {
        int forward[] = (int[]) d.getReportedFeature(reportid);
        bdw.writeC(forward);
        d.setReportedFeature(reportid, bdw.getBytes());
    }
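
    /**
     * Deserializes a previously encoded byte array back into the int[] of
     * TermIDs and stores it as the document's reported feature.
     */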
    @Override
    public void decode(Document d, int reportid) {
        reader.setBuffer((byte[]) d.getReportedFeature(reportid));
        try {
            d.setReportedFeature(reportid, reader.readCIntArray());
        } catch (EOCException ex) {
            log.fatalexception(ex, "decode( %s ) reader %s reportid %d", d, reader, reportid);
        }
    }
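
    /**
     * Reads the entire feature file into an in-memory cache, storing each
     * document's token array under its sequential record position in the file.
     */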
    public void readResident() {
        cacheResults();
        getFile().setBufferSize((int) this.getLength());
        openRead();
        while (this.next()) {
            cache.put(cache.size(), getValue());
        }
        closeRead();
    }
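
    /**
     * Attaches the current token array to the document as a reported feature
     * under the given report id.
     */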
    @Override
    public void report(Document doc, int reportid) {
        //log.info("report %s doc %d reportid %d value %s", this.getCanonicalName(), doc.docid, reportid, getValue());
        doc.setReportedFeature(reportid, getValue());
    }
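
    /**
     * Returns the token array previously reported for this document.
     */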
    @Override
    public int[] valueReported(Document doc, int reportid) {
        return (int[]) doc.getReportedFeature(reportid);
    }
    @Override
    public void write(int[] value) {
        file.tokens.write(value);
    }

    @Override
    public int[] getValue() {
        return file.tokens.value;
    }

    @Override
    public File createFile(Datafile datafile) {
        return new File(datafile);
    }

    @Override
    public void setValue(int[] value) {
        getFile().tokens.value = value;
    }
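
    /**
     * Looks up the offset in the feature file of the record for the given
     * document id.
     */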
    public long findOffset(int docid) {
        return getFile().findOffset(docid);
    }
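
    /**
     * Positions the file at the record for the given document id and opens it
     * for reading, so that a subsequent next() reads that document's tokens
     * (the pattern shown in the class Javadoc sketch).
     */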
    public void setOffset(int docid) {
        getFile().setOffset(getFile().findOffset(docid));
        getFile().openRead();
    }
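
    /**
     * The on-disk record structure: each record holds a single compressed int
     * array field ("tokens") containing one document's TermIDs.
     */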
    public static class File extends StructuredFileSequential {

        public CIntArrayField tokens = this.addCIntArray("tokens");

        public File(Datafile df) {
            super(df);
        }
    }
}