All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.github.repir.EntityReader.EntityReaderWPEdits Maven / Gradle / Ivy

The newest version!
//package io.github.repir.EntityReader;
//
//import io.github.repir.Extractor.Entity;
//import io.github.repir.EntityReader.MapReduce.EntityWritable;
//import io.github.repir.tools.search.ByteSearch;
//import io.github.repir.tools.search.ByteSearchSection;
//import io.github.repir.tools.search.ByteSection;
//import io.github.repir.tools.Content.EOCException;
//import io.github.repir.tools.Content.HDFSIn;
//import io.github.repir.tools.Lib.Log;
//import org.apache.hadoop.fs.Path;
//import org.apache.hadoop.mapreduce.lib.input.FileSplit;
//import io.github.repir.tools.Lib.ByteTools;
//
///**
// * An implementation of EntityReader that scans the input for Wikipedia XML
// * dumps, that are enclosed in  tags.
// * 

// * @author jeroen // */ //public class EntityReaderWPEdits extends EntityReader { // // public static Log log = new Log(EntityReaderWPEdits.class); // private byte[] startTag; // private byte[] endTag; // private ByteSection idTag = new ByteSection("",""); // private ByteSection revisionTag = new ByteSection("",""); // private ByteSection timestampTag = new ByteSection("",""); // private ByteSection contributorTag = new ByteSection("",""); // private ByteSection usernameTag = new ByteSection("",""); // private ByteSection namespaceTag = new ByteSection("",""); // private ByteSection textTag = new ByteSection("]*>",""); // private ByteSearch redirect = ByteSearch.create("").getBytes(); // endTag = conf.get("entityreader.entityend", "").getBytes(); // Path file = fileSplit.getPath(); // } // // @Override // public boolean nextKeyValue() { // while (foundValidStartBeforeEndOfSplit()) { // try { // // mark the start of the entity (exclusive of the startTag and endTag) // key.set(fsin.getOffset()); // // readEntity(); // // int starttext = posPastBodyStart(); // checkRedirectPage(starttext); // int endtext = posPastBodyEnd(starttext); // checkValidNamespace(starttext); // // String id = getId(starttext); // String title = ByteTools.extract(entitywritable.entity.content, revisionStart, revisionEnd, 0, starttext, false, false); // //log.info("id %s title %s ns %s", id, title, ns); // entitywritable.entity.addSectionPos("all", starttext + 1, starttext + 1, endtext, endtext); // entitywritable.entity.get("literaltitle").add(title); // entitywritable.entity.get("collectionid").add(id); // return true; // } catch (InvalidEntityException ex) { // // some reason was found to skip the page, like invalid namespace, redirect page // } // } // return false; // } // // /** // * @return true if a startTag starts before end of split. Sets the offset // * past the record start label. // */ // private boolean foundValidStartBeforeEndOfSplit() { // if (fsin.hasMore() && readUntilStart()) { // startTag is found before end of split // // redundant double check if offset is really before end of split // return fsin.getOffset() - startTag.length < fsin.getCeiling(); // } // return false; // } // // /** // * @return position past bodyStart and BodyStartEnd tags. Throws an // * EOCException when not found. // */ // private int posPastBodyStart() throws InvalidEntityException { // int pos = ByteTools.find(entitywritable.entity.content, bodyStart, 0, entitywritable.entity.content.length, false, false); // if (pos >= 0) { // pos = ByteTools.find(entitywritable.entity.content, bodyStartEnd, pos + bodyStart.length, entitywritable.entity.content.length, false, false); // if (pos > 0) { // return pos; // } // } // throw invalidEntity; // } // // private int posPastBodyEnd(int posAfterBodyStart) throws InvalidEntityException { // int endtext = ByteTools.find(entitywritable.entity.content, bodyEnd, posAfterBodyStart + bodyStartEnd.length, entitywritable.entity.content.length, false, false); // if (endtext >= posAfterBodyStart) { // return endtext; // } // throw invalidEntity; // } // // private void checkRedirectPage(int posAfterBodyStart) throws InvalidEntityException { // int redirectpos = ByteTools.find(entitywritable.entity.content, redirect, 0, posAfterBodyStart, false, false); // if (redirectpos >= 0) { // throw invalidEntity; // } // } // // // throws an EOCException // private void checkValidNamespace(int posAfterBodyStart) throws InvalidEntityException { // String ns = ByteTools.extract(entitywritable.entity.content, nsStart, nsEnd, 0, posAfterBodyStart, false, false); // if (!ns.trim().equals("0")) { // throw invalidEntity; // } // } // // private String getId(int posAfterBodyStart) { // return ByteTools.extract(entitywritable.entity.content, idStart, idEnd, 0, posAfterBodyStart, false, false); // } // // private String getTitle(int posAfterBodyStart) { // return ByteTools.extract(entitywritable.entity.content, idStart, idEnd, 0, posAfterBodyStart, false, false); // } // // /** // * Read an entity from the input, ignoring the end of split that may have // * been set to read the last record that crosses the boundary between // * splits. // * // * @return true if successful // */ // private boolean readEntity() throws InvalidEntityException { // entitywritable = new EntityWritable(); // entitywritable.entity = new Entity(); // int needleposition = 0; // try { // while (true) { // int b = fsin.readByte(); // if (b != endTag[needleposition]) { // check if we match needle // if (needleposition > 0) { // entitywritable.writeBytes(endTag, 0, needleposition); // needleposition = 0; // } // } // if (b == endTag[needleposition]) { // needleposition++; // if (needleposition >= endTag.length) { // entitywritable.storeContent(); // //new ByteSearchSection(entitywritable.entity.content, 0, 0, entitywritable.entity.); // return true; // } // } else { // entitywritable.writeByte(b); // } // // } // } catch (EOCException ex) { // throw invalidEntity; // } // } // // /** // * @return true if a startTag starts before end of split. Sets the offset // * past the record start label. // */ // private boolean readUntilStart() { // int needleposition = 0; // while (true) { // try { // int b = fsin.readByte(); // if (b != startTag[needleposition]) { // check if we match needle // needleposition = 0; // } // if (b == startTag[needleposition]) { // needleposition++; // if (needleposition >= startTag.length) { // return true; // } // } else { // if (needleposition == 0 && !fsin.hasMore()) { // see if we've passed the stop point: // return false; // } // } // } catch (EOCException ex) { // return false; // } // } // } // // static InvalidEntityException invalidEntity = new InvalidEntityException(); // // private static class InvalidEntityException extends Exception { // } //}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy