All downloads are free. Search and download functionality uses the official Maven repository.

edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester Maven / Gradle / Ivy

Go to download

Library providing ingesters and utilities for converting BOLT forum posts to the Concrete NLP data schema.

There is a newer version: 4.14.2
Show newest version
/*
 * Copyright 2012-2015 Johns Hopkins University HLTCOE. All rights reserved.
 * See LICENSE in the project root directory.
 */
package edu.jhu.hlt.concrete.ingesters.bolt;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.TextSpan;
import edu.jhu.hlt.concrete.communications.WritableCommunication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester;
import edu.jhu.hlt.concrete.metadata.tools.SafeTooledAnnotationMetadata;
import edu.jhu.hlt.concrete.metadata.tools.TooledMetadataConverter;
import edu.jhu.hlt.concrete.section.SectionFactory;
import edu.jhu.hlt.concrete.util.ConcreteException;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
import edu.jhu.hlt.utilt.ex.LoggedUncaughtExceptionHandler;
import edu.jhu.hlt.utilt.io.ExistingNonDirectoryFile;
import edu.jhu.hlt.utilt.io.NotFileException;

/**
 * Class representing a Concrete ingester for BOLT forum post data.
 *
 * <p>Currently only extracts the headline and posts from the document. The
 * headline becomes a section of kind {@code "headline"} numbered {@code [0]};
 * every non-blank run of character data found after the headline becomes a
 * section of kind {@code "post"} numbered {@code [post, chunk]}, where
 * {@code post} starts at 1 and is incremented on each closing {@code post}
 * tag, and {@code chunk} restarts at 0 for each post. Quote, image and link
 * elements are stepped over by dedicated handlers.
 *
 * <p>All text spans are character offsets into the raw file content, which is
 * stored unmodified as the communication text.
 */
public class BoltForumPostIngester implements SafeTooledAnnotationMetadata, UTF8FileIngester {

  private static final Logger LOGGER = LoggerFactory.getLogger(BoltForumPostIngester.class);

  public static final String POST_LOCAL_NAME = "post";
  public static final String IMG_LOCAL_NAME = "img";
  public static final String QUOTE_LOCAL_NAME = "quote";
  public static final String LINK_LOCAL_NAME = "a";

  /** Size, in bytes, of the buffer used when reading the input file. */
  private static final int READ_BUFFER_BYTES = 1024 * 8 * 8;

  /** Number of characters shown on either side of an offset in debug output. */
  private static final int DEBUG_CONTEXT_CHARS = 20;

  private final XMLInputFactory inF;

  /**
   * Creates an ingester backed by a StAX input factory that refuses DTDs and
   * external entities. Coalescing and validation are left at the factory
   * defaults.
   */
  public BoltForumPostIngester() {
    this.inF = XMLInputFactory.newInstance();
    // Input files are not trusted: do not resolve DTDs or external entities (XXE).
    this.inF.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    this.inF.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
  }

  /* (non-Javadoc)
   * @see edu.jhu.hlt.concrete.safe.metadata.SafeAnnotationMetadata#getTimestamp()
   */
  @Override
  public long getTimestamp() {
    return Timing.currentLocalTime();
  }

  /* (non-Javadoc)
   * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolName()
   */
  @Override
  public String getToolName() {
    return BoltForumPostIngester.class.getSimpleName();
  }

  /* (non-Javadoc)
   * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolVersion()
   */
  @Override
  public String getToolVersion() {
    return ProjectConstants.VERSION;
  }

  /* (non-Javadoc)
   * @see edu.jhu.hlt.concrete.ingesters.base.Ingester#getKind()
   */
  @Override
  public String getKind() {
    return "forum-post";
  }

  /**
   * Consumes the leading events of the document up to and including the
   * whitespace that follows the headline, and builds the headline section.
   *
   * <p>The expected event order is: document start, one event for the document
   * element, whitespace, headline start tag, headline text, headline end tag,
   * whitespace.
   *
   * @param rdr event reader positioned at the very start of the document
   * @param content the full text of the document; offsets are resolved against it
   * @return a section of kind {@code "headline"} numbered {@code [0]} whose span
   *         excludes leading and trailing spaces / newlines
   * @throws XMLStreamException on a parse error, or if the headline start tag or
   *         headline text is not where it is expected
   * @throws ConcreteException declared for callers; not raised directly here
   */
  private static Section handleHeadline(final XMLEventReader rdr, final String content) throws XMLStreamException, ConcreteException {
    // The first type is always a document start event. Skip it.
    rdr.nextEvent();

    // The second type is a document ID block. Skip it.
    rdr.nextEvent();

    // The third type is a whitespace block. Skip it.
    rdr.nextEvent();

    // The next type is a headline start tag.
    final XMLEvent hl = rdr.nextEvent();
    if (!hl.isStartElement()) {
      // Report as a parse problem instead of letting asStartElement() throw ClassCastException.
      throw new XMLStreamException("Expected a headline start element.", hl.getLocation());
    }
    final StartElement hlse = hl.asStartElement();
    LOGGER.debug("QN: {}", hlse.getName().getLocalPart());
    LOGGER.debug("HL part offset: {}", hlse.getLocation().getCharacterOffset());

    // Text of the headline. Only its offset and length are used; the text
    // itself is re-read from content so that offsets line up with the raw file.
    final XMLEvent hlText = rdr.nextEvent();
    if (!hlText.isCharacters()) {
      throw new XMLStreamException("Expected headline text.", hlText.getLocation());
    }
    final Characters cc = hlText.asCharacters();
    final int charOff = cc.getLocation().getCharacterOffset();
    final int charOffPlusLen = charOff + cc.getData().length();

    // The next part is the headline end element. Skip.
    rdr.nextEvent();

    // Whitespace. Skip.
    rdr.nextEvent();

    // Reader is now pointing at the first post.
    final String rawHeadline = content.substring(charOff, charOffPlusLen);
    final SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(rawHeadline);
    final TextSpan ts = new TextSpan(charOff + pads.getKey(), charOffPlusLen - pads.getValue());

    final Section s = new Section();
    s.setKind("headline");
    s.setTextSpan(ts);
    s.setNumberList(numberList(0));
    return s;
  }

  /**
   * Builds a mutable list holding the given section numbers, in order.
   */
  private static List<Integer> numberList(final int... numbers) {
    final List<Integer> list = new ArrayList<>(numbers.length);
    for (final int n : numbers) {
      list.add(n);
    }
    return list;
  }

  /**
   * Counts the padding (spaces and {@code '\n'}) at both ends of a string.
   *
   * @param str the string to inspect
   * @return an entry whose key is the number of leading padding characters and
   *         whose value is the number of trailing padding characters; if the
   *         string consists only of padding, both counts equal its length
   */
  private static SimpleImmutableEntry<Integer, Integer> trimSpacing(final String str) {
    final int leftPadding = getLeftSpacesPaddingCount(str);
    LOGGER.trace("Left padding: {}", leftPadding);
    final int rightPadding = getRightSpacesPaddingCount(str);
    LOGGER.trace("Right padding: {}", rightPadding);
    return new SimpleImmutableEntry<>(leftPadding, rightPadding);
  }

  /**
   * Consumes the remainder of a link whose start tag has already been read.
   *
   * @param rdr event reader positioned just after the link start tag
   * @return the character offset reported for the link's end tag
   * @throws XMLStreamException on a parse error
   * @throws RuntimeException if the start tag is followed by anything other
   *         than character data or an end tag
   */
  private int handleLink(final XMLEventReader rdr) throws XMLStreamException {
    // Links have a start element, characters, and end element.
    // Alternatively, they have a start and end element.
    final XMLEvent linkContent = rdr.nextEvent();
    if (linkContent.isEndElement()) {
      return linkContent.getLocation().getCharacterOffset();
    }
    if (linkContent.isCharacters()) {
      // Skip end of link.
      return rdr.nextEvent().getLocation().getCharacterOffset();
    }
    throw new RuntimeException("Characters did not follow link.");
  }

  /**
   * Moves the rdr "iterator" past an img, quote or link element.
   *
   * @param rdr event reader whose next event must be a start element
   * @return the offset returned by the handler for that element
   * @throws XMLStreamException on a parse error
   * @throws IllegalArgumentException if the start element is not an img, quote or link
   */
  private int handleNonPostStartElement(final XMLEventReader rdr) throws XMLStreamException {
    // Next is a start element. Throw if not.
    final StartElement se = rdr.nextEvent().asStartElement();
    final String part = se.getName().getLocalPart();

    if (part.equals(QUOTE_LOCAL_NAME)) {
      return this.handleQuote(rdr);
    }
    if (part.equals(IMG_LOCAL_NAME)) {
      return this.handleImg(rdr);
    }
    if (part.equals(LINK_LOCAL_NAME)) {
      return this.handleLink(rdr);
    }
    throw new IllegalArgumentException("Unhandled tag: " + part);
  }

  /**
   * Move the iterator so that a call to nextEvent will return the beginning of a post tag.
   *
   * <p>Returns without error if the stream ends before any post tag is seen.
   *
   * @param rdr event reader positioned after the headline
   * @throws XMLStreamException on a parse error
   */
  private void iterateToPosts(final XMLEventReader rdr) throws XMLStreamException {
    // Iterative rather than recursive, and guarded against end of stream:
    // peek() returns null once the events are exhausted.
    while (rdr.hasNext()) {
      final XMLEvent upcoming = rdr.peek();
      if (upcoming == null) {
        return;
      }

      if (!upcoming.isStartElement()) {
        // Drop.
        rdr.nextEvent();
        continue;
      }

      if (upcoming.asStartElement().getName().getLocalPart().equals(POST_LOCAL_NAME)) {
        return;
      }

      // Churn through non-post start tags.
      this.handleNonPostStartElement(rdr);
    }
  }

  /**
   * Consumes the remainder of a quote whose start tag has already been read.
   *
   * @param rdr event reader positioned just after the quote start tag
   * @return the character offset reported for the end element at which
   *         consumption stopped
   * @throws XMLStreamException on a parse error
   * @throws RuntimeException if the start tag is not followed by character data
   */
  private int handleQuote(final XMLEventReader rdr) throws XMLStreamException {
    // For quotes, there will be character contents - skip for now...
    final XMLEvent quoteContent = rdr.nextEvent();
    if (!quoteContent.isCharacters()) {
      throw new RuntimeException("Characters did not follow quote.");
    }

    // Advance to the next end element.
    // NOTE(review): this stops at the first end element of ANY name, not
    // specifically a closing quote tag, so an element nested inside a quote
    // ends the skip early. Behaviour is kept as-is; confirm whether nested
    // markup inside quotes should be skipped entirely.
    XMLEvent next = rdr.nextEvent();
    while (!next.isEndElement()) {
      next = rdr.nextEvent();
    }

    return next.getLocation().getCharacterOffset();
  }

  /**
   * Consumes the end tag of an img element whose start tag has already been read.
   *
   * @param rdr event reader whose next event must be an end element
   * @return the character offset reported for that end element
   * @throws XMLStreamException on a parse error
   */
  private int handleImg(final XMLEventReader rdr) throws XMLStreamException {
    final XMLEvent n = rdr.nextEvent();
    return n.asEndElement().getLocation().getCharacterOffset();
  }

  /**
   * Logs, at debug level, the offset and the text surrounding it.
   *
   * @param content full document text
   * @param offset a positive character offset into {@code content}
   */
  private static void logSurroundingText(final String content, final int offset) {
    LOGGER.debug("Offset: {}", offset);
    final int ctxEnd = offset + DEBUG_CONTEXT_CHARS;
    if (ctxEnd < content.length()) {
      // Clamp the start: offsets smaller than the context width would
      // otherwise produce a negative index.
      final int ctxStart = Math.max(0, offset - DEBUG_CONTEXT_CHARS);
      LOGGER.debug("Surrounding text: {}", content.substring(ctxStart, ctxEnd));
    }
  }

  /* (non-Javadoc)
   * @see edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester#fromCharacterBasedFile(java.nio.file.Path)
   */
  /**
   * Reads a UTF-8 BOLT forum post file and converts it to a Communication.
   *
   * <p>The communication id is the file name up to its first {@code '.'}; the
   * communication text is the unmodified file content.
   *
   * @param path path to the file to ingest
   * @return the populated communication, with a headline section followed by
   *         zero or more post sections
   * @throws IngestException if the path does not exist or is not a regular
   *         file, on any I/O or XML parse error, or if a computed offset falls
   *         outside the document text
   */
  @Override
  public Communication fromCharacterBasedFile(final Path path) throws IngestException {
    if (!Files.exists(path)) {
      throw new IngestException("No file at: " + path.toString());
    }

    final AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
    final AnalyticUUIDGenerator gen = f.create();
    final Communication c = new Communication();
    c.setUuid(gen.next());
    c.setType(this.getKind());
    c.setMetadata(TooledMetadataConverter.convert(this));

    try {
      final ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path);
      c.setId(ef.getName().split("\\.")[0]);
    } catch (NoSuchFileException | NotFileException e) {
      // might throw if path is a directory.
      throw new IngestException(path.toString() + " is not a file, or is a directory.");
    }

    // First pass: load the raw text, against which all offsets are resolved.
    final String content;
    try (InputStream is = Files.newInputStream(path);
        BufferedInputStream bin = new BufferedInputStream(is, READ_BUFFER_BYTES);) {
      content = IOUtils.toString(bin, StandardCharsets.UTF_8);
      c.setText(content);
    } catch (IOException e) {
      throw new IngestException(e);
    }

    // Second pass: walk the XML events to locate section boundaries.
    try (InputStream is = Files.newInputStream(path);
        BufferedInputStream bin = new BufferedInputStream(is, READ_BUFFER_BYTES);
        BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) {
      XMLEventReader rdr = null;
      try {
        rdr = inF.createXMLEventReader(reader);

        // Below method moves the reader
        // to the first post element.
        final Section headline = handleHeadline(rdr, content);
        headline.setUuid(gen.next());
        c.addToSectionList(headline);
        final String htxt = c.getText().substring(headline.getTextSpan().getStart(), headline.getTextSpan().getEnding());
        LOGGER.debug("headline text: {}", htxt);

        // Section indices.
        int sectNumber = 1;
        int subSect = 0;

        // Move iterator to post start element.
        this.iterateToPosts(rdr);

        final SectionFactory sf = new SectionFactory(gen);

        // First post element.
        while (rdr.hasNext()) {
          final XMLEvent nextEvent = rdr.nextEvent();
          final int currOff = nextEvent.getLocation().getCharacterOffset();
          if (currOff > 0 && LOGGER.isDebugEnabled()) {
            logSurroundingText(content, currOff);
          }

          // First: see if document is going to end.
          // If yes: exit.
          if (nextEvent.isEndDocument()) {
            break;
          }

          if (nextEvent.isStartElement()) {
            final String localName = nextEvent.asStartElement().getName().getLocalPart();
            LOGGER.debug("Hit start element: {}", localName);

            // Move past quotes, images, and links.
            if (localName.equals(QUOTE_LOCAL_NAME)) {
              this.handleQuote(rdr);
            } else if (localName.equals(IMG_LOCAL_NAME)) {
              this.handleImg(rdr);
            } else if (localName.equals(LINK_LOCAL_NAME)) {
              this.handleLink(rdr);
            }
          } else if (nextEvent.isCharacters()) {
            final Characters chars = nextEvent.asCharacters();
            if (chars.isWhiteSpace()) {
              continue;
            }

            // content to be captured
            final String fpContent = chars.getData();
            LOGGER.debug("Character offset: {}", currOff);
            LOGGER.debug("Character based data: {}", fpContent);

            final SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent);
            final int tsb = currOff + pads.getKey();
            final int tse = currOff + fpContent.length() - pads.getValue();
            final String subs = content.substring(tsb, tse);
            if (subs.replaceAll("\\p{Zs}", "").replaceAll("\\n", "").isEmpty()) {
              LOGGER.info("Found empty section: skipping.");
              continue;
            }

            LOGGER.debug("Section text: {}", subs);
            final Section s = sf.fromTextSpan(new TextSpan(tsb, tse), "post");
            s.setNumberList(numberList(sectNumber, subSect));
            c.addToSectionList(s);

            subSect++;
          } else if (nextEvent.isEndElement()) {
            final EndElement ee = nextEvent.asEndElement();
            final String localName = ee.getName().getLocalPart();
            LOGGER.debug("Hit end element: {}", localName);
            // A closing post tag starts the numbering of the next post.
            if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) {
              sectNumber++;
              subSect = 0;
            }
          }
        }
        return c;
      } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException x) {
        throw new IngestException(x);
      } finally {
        if (rdr != null) {
          try {
            rdr.close();
          } catch (XMLStreamException e) {
            // not likely.
            LOGGER.info("Error closing XMLReader.", e);
          }
        }
      }
    } catch (IOException e) {
      throw new IngestException(e);
    }
  }

  /**
   * Counts the leading padding characters of a string.
   *
   * @param str the string to inspect
   * @return the index of the first character that is neither a space nor
   *         {@code '\n'}, or the string length if there is none
   */
  private static int getLeftSpacesPaddingCount(final String str) {
    final int len = str.length();
    for (int i = 0; i < len; i++) {
      if (!isSpaceOrUnixNewline(str.charAt(i))) {
        return i;
      }
    }

    return len;
  }

  /**
   * Tests whether a character is a plain space or a Unix newline.
   *
   * @param c the character to test; must not be null
   * @return true if {@code c} is {@code ' '} or {@code '\n'}
   */
  public static boolean isSpaceOrUnixNewline(final Character c) {
    final char ch = c;
    return ch == ' ' || ch == '\n';
  }

  /**
   * Counts the trailing padding characters of a string.
   *
   * @param str the string to inspect
   * @return the number of characters after the last character that is neither
   *         a space nor {@code '\n'}, or the string length if there is none
   */
  private static int getRightSpacesPaddingCount(final String str) {
    final int len = str.length();
    // Walk from the last character back to index 0 inclusive; stopping one
    // short would miscount strings whose only non-padding character is first.
    for (int i = 0; i < len; i++) {
      if (!isSpaceOrUnixNewline(str.charAt(len - 1 - i))) {
        return i;
      }
    }

    return len;
  }

  /**
   * Command-line entry point: ingests each given BOLT file and writes one
   * {@code <id>.comm} file per input into the output folder, overwriting any
   * existing file. The output folder is created if it does not exist. A failure
   * on one input is logged and does not stop the remaining inputs.
   *
   * @param args the output folder followed by one or more input file paths
   */
  public static void main(String... args) {
    Thread.setDefaultUncaughtExceptionHandler(new LoggedUncaughtExceptionHandler());
    if (args.length < 2) {
      LOGGER.info("Usage: {} {} {} {}", BoltForumPostIngester.class.getName(), "/path/to/output/folder", "/path/to/bolt/.xml/file", "");
      System.exit(1);
    }

    final Path outPath = Paths.get(args[0]);
    // Create the output folder itself (and any missing parents), not just its parent.
    if (!Files.exists(outPath)) {
      try {
        Files.createDirectories(outPath);
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }

    if (!Files.isDirectory(outPath)) {
      LOGGER.error("Output path must be a directory.");
      System.exit(1);
    }

    final BoltForumPostIngester ing = new BoltForumPostIngester();
    for (int i = 1; i < args.length; i++) {
      final Path lp = Paths.get(args[i]);
      LOGGER.info("On path: {}", lp.toString());
      try {
        final Communication c = ing.fromCharacterBasedFile(lp);
        new WritableCommunication(c).writeToFile(outPath.resolve(c.getId() + ".comm"), true);
      } catch (IngestException | ConcreteException e) {
        LOGGER.error("Caught exception during ingest on file: " + args[i], e);
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy