package io.github.repir.Repository;
import io.github.repir.tools.extract.Content;
import io.github.repir.EntityReader.MapReduce.TermEntityKey;
import io.github.repir.EntityReader.MapReduce.TermEntityValue;
import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.tools.io.struct.StructuredFileIntID;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import org.apache.hadoop.mapreduce.Mapper.Context;
/**
* An AutoTermDocumentFeature that is configured for a Repository is automatically
* generated by the standard Repository builder in apps.Repository.Build. Other
* TermDocumentFeatures have to be built manually.
* @author jeroen
* @param <F> the StructuredFileIntID file type that stores this feature
* @param <C> the value type of this feature
*/
public abstract class AutoTermDocumentFeature<F extends StructuredFileIntID, C> extends TermDocumentFeature<F, C> {
HashMap docs;
int reducetermid;
public AutoTermDocumentFeature(Repository repository, String field) {
super(repository, field);
}
public void setDocs(HashMap docs) {
this.docs = docs;
reducetermid = 0;
}
TermEntityKey outkey;
TermEntityValue outvalue = new TermEntityValue();
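/**
* Writes one map output record per unique term in the entity: the content is
* tokenized, a TermEntityKey is created for every term id with
* createTermDocKey, and the value is filled by the subclass through
* setMapOutputValue.
*/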
public void writeMap(Context context, int partition, int feature, String docname, Content entity) throws IOException, InterruptedException {
HashMap<Integer, ArrayList<Integer>> tokens = getTokens(entity);
for (Entry<Integer, ArrayList<Integer>> entry : tokens.entrySet()) {
outkey = TermEntityKey.createTermDocKey(partition, feature, entry.getKey(), docname);
setMapOutputValue(outvalue, docname, entry.getValue());
context.write(outkey, outvalue);
}
}
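/**
* Processes the grouped map output for a single key during the reduce phase;
* subclasses decide how the received values are stored.
*/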
public abstract void reduceInput(TermEntityKey key, Iterable<TermEntityValue> values);
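/**
* Tokenizes the requested entity attribute (if it was not tokenized yet) and
* returns a map from term id to the positions at which that term occurs in
* the document.
*/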
public HashMap<Integer, ArrayList<Integer>> getTokens(Content doc) {
HashMap<Integer, ArrayList<Integer>> list = new HashMap<Integer, ArrayList<Integer>>();
ArrayList<Integer> l;
int pos = 0;
ExtractChannel attr = doc.get(entityAttribute());
if (attr.tokenized == null) {
attr.tokenized = repository.tokenize(attr);
}
for (int token : attr.tokenized) {
l = list.get(token);
if (l == null) {
l = new ArrayList<Integer>();
list.put(token, l);
}
l.add(pos++);
}
return list;
}
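/**
* Opens the feature for reading; if the configured term exists, the file is
* positioned at that term's entry and the current document id is reset.
*/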
@Override
public void openRead() {
super.openRead();
//log.info("openRead termid %d", term.getID());
if (term.exists()) {
find(term.getID());
docid = -1;
}
}
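/**
* Prepares the reduce phase: selects the partition and opens the backing file
* for writing with the given buffer size.
*/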
public void startReduce(int partition, int buffersize) {
setPartition(partition);
getFile().setBufferSize(buffersize);
getFile().openWrite();
}
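/**
* Closes the backing file after the reduce phase has finished writing.
*/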
public void finishReduce() {
getFile().closeWrite();
}
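/**
* Fills the map output value for one document, typically by serializing the
* document name and the term positions into the writer.
*/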
public abstract void setMapOutputValue(TermEntityValue writer, String docname, ArrayList<Integer> pos);
}
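/*
* Illustrative sketch (hypothetical, not part of the published source): a minimal
* outline of a manually built feature, showing the two hooks a subclass of
* AutoTermDocumentFeature must provide. The class name is made up, it is kept
* abstract because TermDocumentFeature may declare further abstract members
* not visible in this listing, and the method bodies only indicate intent.
*/
abstract class ExampleTermPositionsFeature<F extends StructuredFileIntID, C> extends AutoTermDocumentFeature<F, C> {
public ExampleTermPositionsFeature(Repository repository, String field) {
super(repository, field);
}
@Override
public void setMapOutputValue(TermEntityValue writer, String docname, ArrayList<Integer> pos) {
// serialize the document name and the term positions into the value, in
// whatever wire format the concrete feature uses
}
@Override
public void reduceInput(TermEntityKey key, Iterable<TermEntityValue> values) {
// deserialize each grouped value and append a posting for this term and
// document to the backing file opened by startReduce()
}
}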