package io.github.repir.EntityReader;
import io.github.repir.tools.extract.Content;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.lib.ByteTools;
import io.github.repir.tools.lib.Log;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import io.github.repir.tools.search.ByteSearch;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.search.ByteSection;
import java.io.ByteArrayOutputStream;
/**
 * An implementation of EntityReader that reads the ClueWeb12 collection. It
 * works like {@link EntityReaderCW}, differing only in the WARC record
 * structure it expects.
 *
 * @author jeroen
 */
public class EntityReaderCW122 extends EntityReader {

    public static Log log = new Log(EntityReaderCW122.class);
    // marker that starts every WARC record header
    private byte[] warcTag = "WARC/1.0".getBytes();
    // the original search pattern was lost when this source was HTML-escaped;
    // "<!DOCTYPE" is assumed here, since the code below looks for the closing
    // '>' of a doctype declaration
    private ByteSearch doctype = ByteSearch.create("<!DOCTYPE");
    private ByteSearch contentlength = ByteSearch.create("\nContent-Length: ");
    private ByteSearch warcIDTag = ByteSearch.create("WARC-TREC-ID: ");
    private ByteSearch eol = ByteSearch.create("\n");
    private ByteSection warcID = new ByteSection(warcIDTag, eol);
    // optional whitelist of record ids to keep; null means keep everything
    private idlist ids;
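    /**
     * Loads the id whitelist for this file's directory, when one is
     * configured, and skips past the leading WARC tag of the split.
     */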
    @Override
    public void initialize(FileSplit fileSplit) {
        Path file = fileSplit.getPath();
        String directory = getDir(file);
        String idlist = conf.get("repository.idlist", null);
        if (idlist != null) {
            ids = SubSetFile.getIdList(new Datafile(filesystem, idlist + "/" + directory + ".idlist"));
        }
        readEntity(); // skip everything up to the first WARC tag, which is not a document
    }
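    /**
     * Reads WARC records until one passes the id filter. For an accepted
     * record, the WARC header and the document body (everything after the
     * doctype declaration) are stored as sections of the entity.
     *
     * @return true if a valid record was read, false at the end of the split
     */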
    @Override
    public boolean nextKeyValue() {
        while (fsin.hasMore()) {
            readEntity();
            Position pos = new Position();
            String id = warcID.getFirstString(entitywritable.content, 0, entitywritable.content.length);
            if (id.length() == 25 && (ids == null || ids.get(id))) {
                //log.info("id %s", id);
                entitywritable.get("collectionid").add(id);
                int recordlength = getLength(pos);
                if (recordlength > 0) {
                    int warcheaderend = pos.endpos;
                    int startdoctype = doctype.find(entitywritable.content, pos.startpos, entitywritable.content.length);
                    if (startdoctype > 0) {
                        int enddoctype = 1 + ByteTools.find(entitywritable.content, (byte) '>', startdoctype, entitywritable.content.length);
                        entitywritable.addSectionPos("warcheader",
                                entitywritable.content, 0, 0, warcheaderend, warcheaderend);
                        entitywritable.addSectionPos("all",
                                entitywritable.content, enddoctype, enddoctype, entitywritable.content.length, entitywritable.content.length);
                    }
                }
                key.set(fsin.getOffset());
                return true;
            }
        }
        return false;
    }
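    /**
     * Parses the Content-Length header of the current record. On success,
     * pos.startpos points just past the header name and pos.endpos at the
     * terminating newline.
     *
     * @return the declared content length, or -1 if it cannot be parsed
     */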
    private int getLength(Position pos) {
        int lengthend = contentlength.findEnd(entitywritable.content, pos.startpos, entitywritable.content.length - pos.startpos);
        if (lengthend >= 0) {
            pos.startpos = lengthend;
            pos.endpos = ByteTools.find(entitywritable.content, (byte) '\n', pos.startpos, entitywritable.content.length);
            if (pos.endpos > pos.startpos) {
                String length = new String(entitywritable.content, pos.startpos, pos.endpos - pos.startpos).trim();
                // guard against an empty value before inspecting the first character
                if (length.length() > 0 && Character.isDigit(length.charAt(0))) {
                    return Integer.parseInt(length);
                }
            }
        }
        return -1;
    }
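    /**
     * Reads bytes from the input stream into a fresh Content entity until the
     * next WARC tag or the end of the split, buffering partial tag matches so
     * they only reach the entity when the match fails.
     */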
    private void readEntity() {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        entitywritable = new Content();
        key.set(fsin.getOffset());
        int match = 0;
        while (true) {
            try {
                int b = fsin.readByte();
                if (match > 0 && b != warcTag[match]) { // output falsely cached chars
                    buffer.write(warcTag, 0, match);
                    match = 0;
                }
                if (b == warcTag[match]) { // check if we're matching needle
                    match++;
                    if (match >= warcTag.length) {
                        break;
                    }
                } else {
                    buffer.write(b);
                }
            } catch (EOCException ex) {
                buffer.write(warcTag, 0, match);
                break;
            }
        }
        entitywritable.content = buffer.toByteArray();
    }
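    /**
     * @return the name of the directory containing the given path, used to
     * locate the idlist that corresponds to this file
     */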
    public String getDir(Path p) {
        String file = p.toString();
        int pos = file.lastIndexOf('/');
        int pos2 = file.lastIndexOf('/', pos - 1);
        if (pos < 0 || pos2 < 0) {
            log.fatal("illegal path %s", file);
        }
        return file.substring(pos2 + 1, pos);
    }
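    /**
     * Mutable cursor passed to {@link #getLength(Position)} to report where
     * the Content-Length value starts and ends within the record.
     */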
    class Position {

        int startpos;
        int endpos;
    }
}
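/*
 * Minimal usage sketch. In practice an EntityReader is instantiated and wired
 * up by the surrounding RepIR/Hadoop input format, so the setup below (the
 * path and the split length) is hypothetical, for illustration only:
 *
 *   EntityReaderCW122 reader = new EntityReaderCW122();
 *   reader.initialize(new FileSplit(
 *           new Path("hdfs:///clueweb12/segment/file.warc.gz"), 0, length, null));
 *   while (reader.nextKeyValue()) {
 *       // key holds the byte offset of the record; the entity holds the
 *       // "collectionid", "warcheader" and "all" sections set above
 *   }
 */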