All downloads are free. The search and download functionality uses the official Maven repository.

io.github.repir.EntityReader.EntityReaderCW12 Maven / Gradle / Ivy

The newest version!
package io.github.repir.EntityReader;

import io.github.repir.tools.extract.Content;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.lib.ByteTools;
import io.github.repir.tools.lib.Log;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import io.github.repir.tools.search.ByteRegex;
import io.github.repir.tools.search.ByteSearchPosition;
import io.github.repir.tools.io.EOCException;
import java.io.ByteArrayOutputStream;

/**
 * An implementation of EntityReader that reads the ClueWeb12 collection,
 * similar to {@link EntityReaderCW}, with some differences in the record
 * structure.
 * <p>
 * NOTE(review): this copy of the source is damaged. The text between the
 * opening quote of the {@code doctype} literal and {@code 0) {} is missing,
 * so the remaining field declarations and the beginning of the
 * record-parsing method are not visible here. The whole class body is also
 * collapsed onto two physical lines, which places code after the two inline
 * line comments of {@code readEntity()}. Restore the file from the original
 * artifact before compiling or changing it.
 * <p>
 * Behaviour of the parts that are still visible:
 * <ul>
 * <li>Tail of the record-parsing method: looks up the {@code doctype} marker
 * in {@code entitywritable.content}; when the result is greater than zero it
 * registers a {@code warcheader} section with bounds 0 and
 * {@code warcheaderend}, and an {@code all} section that runs from
 * {@code enddoctype} (one past the result of searching for the {@code >}
 * byte from the marker position) to the end of the content. The exact
 * meaning of the four integer arguments of {@code addSectionPos} is not
 * visible here; presumably they are start and end offsets, to be confirmed
 * against {@link Content}.</li>
 * <li>{@code getLength(Position)}: looks up the {@code Content-Length: }
 * header tag (preceded by a newline) in the content, passing
 * {@code pos.startpos} as the start argument. When found, it moves
 * {@code pos.startpos} to just past the tag and sets {@code pos.endpos} to
 * the result of searching for the next newline byte. It returns the trimmed
 * text in between parsed as an int when {@code pos.endpos > pos.startpos}
 * and the first character is a digit, and -1 in every other case.
 * NOTE(review): a value that is empty after trimming (for example only a
 * carriage return) makes {@code charAt(0)} throw, and a value with
 * non-digit trailing characters makes {@code Integer.parseInt} throw.</li>
 * <li>{@code readEntity()}: sets {@code key} to the current {@code fsin}
 * offset and replaces {@code entitywritable} with a fresh {@link Content}.
 * It then copies bytes read from {@code fsin} into a buffer until the
 * complete {@code WARC/1.0} tag has been matched or an
 * {@link EOCException} is thrown. The fully matched tag is not stored; a
 * partially matched tag is written to the buffer when the match fails or
 * when the input ends. The buffered bytes become
 * {@code entitywritable.content}.</li>
 * <li>{@code getDir(Path)}: returns the part of the path string between its
 * second-to-last and last slash, i.e. the name of the directory that
 * directly contains the file. It calls {@code log.fatal} with
 * {@code illegal path %s} when either slash is missing. NOTE(review): unless
 * {@code log.fatal} terminates the program, a path without any slash then
 * fails in {@code substring} with an index exception; confirm the behaviour
 * of {@link Log}.</li>
 * <li>{@code Position}: a mutable pair of offsets, {@code startpos} and
 * {@code endpos}, that {@code getLength} updates in place.</li>
 * </ul>
 * 
* @author jeroen */ public class EntityReaderCW12 extends EntityReader { public static Log log = new Log(EntityReaderCW12.class); private byte[] warcTag = "WARC/1.0".getBytes(); private byte[] contentlengthtag = "\nContent-Length: ".getBytes(); private byte[] doctype = " 0) { int warcheaderend = pos.endpos; int startdoctype = io.github.repir.tools.lib.ByteTools.find(entitywritable.content, doctype, pos.startpos, entitywritable.content.length - pos.startpos, false, false); if (startdoctype > 0) { int enddoctype = 1 + ByteTools.find(entitywritable.content, (byte) '>', startdoctype, entitywritable.content.length); entitywritable.addSectionPos("warcheader", entitywritable.content, 0, 0, warcheaderend, warcheaderend); entitywritable.addSectionPos("all", entitywritable.content, enddoctype, enddoctype, entitywritable.content.length, entitywritable.content.length); } } return true; } } } } return false; } private int getLength(Position pos) { int lengthstart = io.github.repir.tools.lib.ByteTools.find(entitywritable.content, contentlengthtag, pos.startpos, entitywritable.content.length - pos.startpos, false, false); if (lengthstart >= 0) { pos.startpos = lengthstart + contentlengthtag.length; pos.endpos = ByteTools.find(entitywritable.content, (byte) '\n', pos.startpos, entitywritable.content.length); if (pos.endpos > pos.startpos) { String length = new String(entitywritable.content, pos.startpos, pos.endpos - pos.startpos).trim(); if (Character.isDigit(length.charAt(0))) { return Integer.parseInt(length); } } } return -1; } private void readEntity() { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); entitywritable = new Content(); key.set(fsin.getOffset()); int match = 0; while (true) { try { int b = fsin.readByte(); if (match > 0 && b != warcTag[match]) { // output falsely cached chars buffer.write(warcTag, 0, match); match = 0; } if (b == warcTag[match]) { // check if we're matching needle match++; if (match >= warcTag.length) { break; } } else { 
buffer.write(b); } } catch (EOCException ex) { buffer.write(warcTag, 0, match); break; } } entitywritable.content = buffer.toByteArray(); } public String getDir(Path p) { String file = p.toString(); int pos = file.lastIndexOf('/'); int pos2 = file.lastIndexOf('/', pos - 1); if (pos < 0 || pos2 < 0) { log.fatal("illegal path %s", file); } return file.substring(pos2 + 1, pos); } class Position { int startpos; int endpos; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy