All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.masc.MascDocumentStream Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.masc;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import javax.xml.parsers.SAXParser;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.XmlUtil;

/**
 * A stream of {@link MascDocument}s read from a MASC corpus directory.
 * Each document is discovered via its header (.hdr) file, which names the
 * primary text and the per-layer annotation files that belong to it.
 */
public class MascDocumentStream implements ObjectStream<MascDocument> {

  private static final Logger logger = LoggerFactory.getLogger(MascDocumentStream.class);

  /**
   * A helper class to parse the header (.hdr) files.
   * Collects a mapping from the annotation-layer id (the {@code f.id}
   * attribute, e.g. "f.seg") to the relative file location ({@code loc}).
   */
  private static class HeaderHandler extends DefaultHandler {
    private HashMap<String, String> annotationFiles = null;
    private String file = null;
    private String fType = null;

    protected HashMap<String, String> getPathList() {
      return annotationFiles;
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {

      // Both <annotation> and <primaryData> elements name a file belonging
      // to this document; remember its location and layer id until the
      // matching endElement.
      if (qName.equalsIgnoreCase("annotation") ||
          qName.equalsIgnoreCase("primaryData")) {
        file = attributes.getValue("loc");
        fType = attributes.getValue("f.id");

        // lazily initialize the map on the first relevant element
        if (annotationFiles == null) {
          annotationFiles = new HashMap<>();
        }
      }

    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {

      // commit the layer-id -> location pair captured in startElement
      if (qName.equalsIgnoreCase("annotation") ||
          qName.equalsIgnoreCase("primaryData")) {
        annotationFiles.put(fType, file);
      }

    }

  }

  // all successfully parsed documents, in discovery order
  private final List<MascDocument> documents = new LinkedList<>();
  private Iterator<MascDocument> documentIterator;
  private final SAXParser saxParser;

  /**
   * Creates a MascDocumentStream that accepts every file in the given
   * directory and its subdirectories.
   *
   * @param mascCorpusDirectory the directory containing all the MASC files
   * @throws IOException if any stage of the stream creation fails
   */
  public MascDocumentStream(File mascCorpusDirectory) throws IOException {
    // the contains("") filter accepts any file name
    this(mascCorpusDirectory, true, pathname -> pathname.getName().contains(""));
  }

  /**
   * Creates a MascDocumentStream to read the documents from a given directory.
   * Works iff all annotation files mentioned in the headers are present.
   *
   * @param mascCorpusDirectory the directory containing all the MASC files
   * @param searchRecursive     whether the search should go through subdirectories
   * @param fileFilter          a custom file filter to filter out some files or
   *                            null to accept anything
   * @throws IOException if any stage of the stream creation fails
   */
  public MascDocumentStream(File mascCorpusDirectory,
                            boolean searchRecursive, FileFilter fileFilter) throws IOException {

    saxParser = XmlUtil.createSaxParser();

    if (!mascCorpusDirectory.isDirectory()) {
      throw new IOException("Input corpus directory must be a directory " +
          "according to File.isDirectory()!");
    }

    int failedLoads = 0;
    Deque<File> directoryStack = new ArrayDeque<>();
    directoryStack.push(mascCorpusDirectory);

    while (!directoryStack.isEmpty()) {
      // listFiles returns null on I/O error or permission failure; skip such dirs
      File[] entries = directoryStack.pop().listFiles(fileFilter);
      if (entries == null) {
        continue;
      }
      for (File file : entries) {
        if (file.isFile()) {
          String hdrFilePath = file.getAbsolutePath();

          // only the header files are entry points; they name the rest
          if (hdrFilePath.endsWith(".hdr")) {

            HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
            // f.text (the primary data) is mandatory; the annotation layers are optional
            InputStream f_primary = new BufferedInputStream(
                new FileInputStream(fileGroup.get("f.text")));
            InputStream f_seg = (fileGroup.containsKey("f.seg")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.seg"))) : null;
            InputStream f_penn = (fileGroup.containsKey("f.penn")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.penn"))) : null;
            InputStream f_s = (fileGroup.containsKey("f.s")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.s"))) : null;
            InputStream f_ne = (fileGroup.containsKey("f.ne")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.ne"))) : null;

            try {
              documents.add(MascDocument.parseDocument(hdrFilePath, f_primary, f_seg,
                  f_penn, f_s, f_ne));
            } catch (IOException e) {
              logger.error("Failed to parse the file: {}", hdrFilePath, e);
              failedLoads++;
              // don't leak the streams of a document we gave up on
              // (closing an already-closed stream is a harmless no-op)
              closeQuietly(f_primary, f_seg, f_penn, f_s, f_ne);
            }
          }

        } else if (searchRecursive && file.isDirectory()) {
          directoryStack.push(file);
        }
      }
    }

    logger.info("Documents loaded: {}", documents.size());
    if (failedLoads > 0) {
      logger.info("Failed loading {} documents.", failedLoads);
    }
    reset();

  }

  /**
   * Best-effort close of the given streams; null entries are skipped and
   * close failures are ignored (we are already on an error path).
   */
  private static void closeQuietly(InputStream... streams) {
    for (InputStream stream : streams) {
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException ignored) {
          // nothing sensible to do during cleanup
        }
      }
    }
  }

  /**
   * Check that all annotation files mentioned in the header are present.
   *
   * @param path The path to header
   * @return a map from annotation-layer id (e.g. "f.seg") to its existing file
   * @throws IOException If corpus integrity is violated
   */
  private HashMap<String, File> checkAnnotations(String path) throws IOException {
    HeaderHandler handler = new HeaderHandler();
    HashMap<String, File> fileGroup = new HashMap<>();
    File hdrFile = new File(path);
    try {
      saxParser.parse(hdrFile, handler);
    } catch (SAXException e) {
      // keep the SAX cause so the underlying parse error is not lost
      throw new IOException("Invalid corpus format. " +
          "Could not parse the header: " + path, e);
    }
    HashMap<String, String> annotationFiles = handler.getPathList();

    // header locations are relative to the header's own directory
    String pathToFolder = hdrFile.getParentFile().getAbsolutePath();
    for (Map.Entry<String, String> annotation : annotationFiles.entrySet()) {
      File file = new File(pathToFolder, annotation.getValue());
      // isFile() already implies existence
      if (!file.isFile()) {
        throw new IOException("Corpus integrity violated. " +
            "Annotation file " + file.getAbsolutePath() + " is missing.");
      }

      fileGroup.put(annotation.getKey(), file);

    }

    return fileGroup;

  }

  /**
   * Reset the reading of all documents to the first sentence.
   * Reset the corpus to the first document.
   */
  @Override
  public void reset() {
    for (MascDocument doc : documents) {
      doc.reset();
    }
    documentIterator = documents.iterator();
  }

  /**
   * Return the next document. Client needs to check if this document has the necessary annotations.
   *
   * @return A corpus document with all its annotations, or {@code null} when exhausted.
   * @throws IOException if anything goes wrong.
   */
  @Override
  public MascDocument read() throws IOException {

    MascDocument doc = null;

    if (documentIterator.hasNext()) {
      doc = documentIterator.next();
    }

    return doc;
  }

  /**
   * Remove the corpus from the memory.
   */
  @Override
  public void close() {
    documentIterator = null;
  }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy