All downloads are free. The search and download functionality uses the official Maven repository.

io.github.repir.EntityReader.EntityReaderCW122 Maven / Gradle / Ivy

The newest version!
package io.github.repir.EntityReader;

import io.github.repir.tools.extract.Content;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.lib.ByteTools;
import io.github.repir.tools.lib.Log;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import io.github.repir.tools.search.ByteSearch;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.search.ByteSection;
import java.io.ByteArrayOutputStream;

/**
 * An implementation of EntityReader that reads the ClueWeb12 collection, 
 * similar to {@link EntityReaderCW}, just some differences in Record structure.
 * 

* @author jeroen */ public class EntityReaderCW122 extends EntityReader { public static Log log = new Log(EntityReaderCW122.class); private byte[] warcTag = "WARC/1.0".getBytes(); private ByteSearch doctype = ByteSearch.create(""); private ByteSearch contentlength = ByteSearch.create("\nContent-Length: "); private ByteSearch warcIDTag = ByteSearch.create("WARC-TREC-ID: "); private ByteSearch eol = ByteSearch.create("\n"); private ByteSection warcID = new ByteSection(warcIDTag, eol); private idlist ids; @Override public void initialize(FileSplit fileSplit) { Path file = fileSplit.getPath(); String directory = getDir(file); String idlist = conf.get("repository.idlist", null); if (idlist != null) { ids = SubSetFile.getIdList(new Datafile(filesystem, idlist + "/" + directory + ".idlist")); } readEntity(); // skip the first warc tag, isn't a document } @Override public boolean nextKeyValue() { while (fsin.hasMore()) { readEntity(); Position pos = new Position(); String id = warcID.getFirstString(entitywritable.content, 0, entitywritable.content.length); if (id.length() == 25 && (ids == null || ids.get(id))) { //log.info("id %s", id); entitywritable.get("collectionid").add(id); int recordlength = getLength(pos); if (recordlength > 0) { int warcheaderend = pos.endpos; int startdoctype = doctype.find(entitywritable.content, pos.startpos, entitywritable.content.length); if (startdoctype > 0) { int enddoctype = 1 + ByteTools.find(entitywritable.content, (byte) '>', startdoctype, entitywritable.content.length); entitywritable.addSectionPos("warcheader", entitywritable.content, 0, 0, warcheaderend, warcheaderend); entitywritable.addSectionPos("all", entitywritable.content, enddoctype, enddoctype, entitywritable.content.length, entitywritable.content.length); } } key.set(fsin.getOffset()); return true; } } return false; } private int getLength(Position pos) { int lengthend = contentlength.findEnd(entitywritable.content, pos.startpos, entitywritable.content.length - 
pos.startpos); if (lengthend >= 0) { pos.startpos = lengthend; pos.endpos = ByteTools.find(entitywritable.content, (byte) '\n', pos.startpos, entitywritable.content.length); if (pos.endpos > pos.startpos) { String length = new String(entitywritable.content, pos.startpos, pos.endpos - pos.startpos).trim(); if (Character.isDigit(length.charAt(0))) { return Integer.parseInt(length); } } } return -1; } private void readEntity() { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); entitywritable = new Content(); key.set(fsin.getOffset()); int match = 0; while (true) { try { int b = fsin.readByte(); if (match > 0 && b != warcTag[match]) { // output falsely cached chars buffer.write(warcTag, 0, match); match = 0; } if (b == warcTag[match]) { // check if we're matching needle match++; if (match >= warcTag.length) { break; } } else { buffer.write(b); } } catch (EOCException ex) { buffer.write(warcTag, 0, match); break; } } entitywritable.content = buffer.toByteArray(); } public String getDir(Path p) { String file = p.toString(); int pos = file.lastIndexOf('/'); int pos2 = file.lastIndexOf('/', pos - 1); if (pos < 0 || pos2 < 0) { log.fatal("illegal path %s", file); } return file.substring(pos2 + 1, pos); } class Position { int startpos; int endpos; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy