package io.github.repir.Repository;
import io.github.repir.tools.extract.Content;
import io.github.repir.EntityReader.MapReduce.TermEntityKey;
import io.github.repir.EntityReader.MapReduce.TermEntityValue;
import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.tools.io.struct.StructuredFileIntID;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import org.apache.hadoop.mapreduce.Mapper.Context;
/**
* An AutoTermDocumentFeature that is configured for a Repository is automatically
* generated by the standard Repository builder in apps.Repository.Build. Other
* TermDocumentFeatures have to be built manually.
* @author jeroen
* @param <F> the StructuredFileIntID file type that stores this feature
* @param <C> the value type of this feature
*/
public abstract class AutoTermDocumentFeature<F extends StructuredFileIntID, C> extends TermDocumentFeature<F, C> {
HashMap docs;
int reducetermid;
public AutoTermDocumentFeature(Repository repository, String field) {
super(repository, field);
}
public void setDocs(HashMap docs) {
this.docs = docs;
reducetermid = 0;
}
TermEntityKey outkey;
TermEntityValue outvalue = new TermEntityValue();
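/**
* Writes one map output record per unique term in the entity: the content is
* tokenized, a TermEntityKey is created for every term id with
* createTermDocKey, and the value is filled by the subclass through
* setMapOutputValue.
*/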
public void writeMap(Context context, int partition, int feature, String docname, Content entity) throws IOException, InterruptedException {
HashMap<Integer, ArrayList<Integer>> tokens = getTokens(entity);
for (Entry<Integer, ArrayList<Integer>> entry : tokens.entrySet()) {
outkey = TermEntityKey.createTermDocKey(partition, feature, entry.getKey(), docname);
setMapOutputValue(outvalue, docname, entry.getValue());
context.write(outkey, outvalue);
}
}
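/**
* Processes the grouped map output for a single key during the reduce phase;
* subclasses decide how the received values are stored.
*/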
public abstract void reduceInput(TermEntityKey key, Iterable<TermEntityValue> values);
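/**
* Tokenizes the requested entity attribute (if it was not tokenized yet) and
* returns a map from term id to the positions at which that term occurs in
* the document.
*/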
public HashMap<Integer, ArrayList<Integer>> getTokens(Content doc) {
HashMap<Integer, ArrayList<Integer>> list = new HashMap<Integer, ArrayList<Integer>>();
ArrayList<Integer> l;
int pos = 0;
ExtractChannel attr = doc.get(entityAttribute());
if (attr.tokenized == null) {
attr.tokenized = repository.tokenize(attr);
}
for (int token : attr.tokenized) {
l = list.get(token);
if (l == null) {
l = new ArrayList<Integer>();
list.put(token, l);
}
l.add(pos++);
}
return list;
}
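/**
* Opens the feature for reading; if the configured term exists, the file is
* positioned at that term's entry and the current document id is reset.
*/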
@Override
public void openRead() {
super.openRead();
//log.info("openRead termid %d", term.getID());
if (term.exists()) {
find(term.getID());
docid = -1;
}
}
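/**
* Prepares the reduce phase: selects the partition and opens the backing file
* for writing with the given buffer size.
*/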
public void startReduce(int partition, int buffersize) {
setPartition(partition);
getFile().setBufferSize(buffersize);
getFile().openWrite();
}
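/**
* Closes the backing file after the reduce phase has finished writing.
*/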
public void finishReduce() {
getFile().closeWrite();
}
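/**
* Fills the map output value for one document, typically by serializing the
* document name and the term positions into the writer.
*/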
public abstract void setMapOutputValue(TermEntityValue writer, String docname, ArrayList<Integer> pos);
}
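/*
* Illustrative sketch (hypothetical, not part of the published source): a minimal
* outline of a manually built feature, showing the two hooks a subclass of
* AutoTermDocumentFeature must provide. The class name is made up, it is kept
* abstract because TermDocumentFeature may declare further abstract members
* not visible in this listing, and the method bodies only indicate intent.
*/
abstract class ExampleTermPositionsFeature<F extends StructuredFileIntID, C> extends AutoTermDocumentFeature<F, C> {
public ExampleTermPositionsFeature(Repository repository, String field) {
super(repository, field);
}
@Override
public void setMapOutputValue(TermEntityValue writer, String docname, ArrayList<Integer> pos) {
// serialize the document name and the term positions into the value, in
// whatever wire format the concrete feature uses
}
@Override
public void reduceInput(TermEntityKey key, Iterable<TermEntityValue> values) {
// deserialize each grouped value and append a posting for this term and
// document to the backing file opened by startReduce()
}
}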