// opennlp.tools.formats.nkjp.NKJPTextDocument (Maven / Gradle / Ivy)
//
// Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.nkjp;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import opennlp.tools.util.XmlUtil;

/**
 * The National corpus of Polish (NKJP) format.
 * 
 * Information about the format are found on this
 * web site.
 * 

 * A 1-million word corpus can be found on this
 * 
 *   web site.
 * 
 * The NKJP schema can be found
 * here.
 */
public class NKJPTextDocument {

  private Map divtypes;

  private Map>> texts;

  NKJPTextDocument() {
    divtypes = new HashMap<>();
    texts = new HashMap<>();
  }

  NKJPTextDocument(Map divtypes, Map>> texts) {
    this();
    this.divtypes = divtypes;
    this.texts = texts;
  }

  public static NKJPTextDocument parse(InputStream is) throws IOException {
    Map divtypes = new HashMap<>();
    Map>> texts = new HashMap<>();

    try {
      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
      Document doc = docBuilder.parse(is);

      XPathFactory xPathfactory = XPathFactory.newInstance();
      XPath xpath = xPathfactory.newXPath();

      final XPathExpression TEXT_NODES_EXAMPLE = xpath.compile("/teiCorpus/TEI/text/group/text");
      final XPathExpression TEXT_NODES_SAMPLE = xpath.compile("/teiCorpus/TEI/text");
      final XPathExpression DIV_NODES = xpath.compile("./body/div");
      final XPathExpression PARA_NODES = xpath.compile("./p|./ab");

      doc.getDocumentElement().normalize();
      String root = doc.getDocumentElement().getNodeName();

      if (!root.equalsIgnoreCase("teiCorpus")) {
        throw new IOException("Expected root node " + root);
      }

      String current_text = "";
      NodeList textnl = (NodeList) TEXT_NODES_EXAMPLE.evaluate(doc, XPathConstants.NODESET);
      if (textnl.getLength() == 0) {
        textnl = (NodeList) TEXT_NODES_SAMPLE.evaluate(doc, XPathConstants.NODESET);
      }

      for (int i = 0; i < textnl.getLength(); i++) {
        Node textnode = textnl.item(i);
        current_text = attrib(textnode, "xml:id", true);

        Map> current_divs = new HashMap<>();
        NodeList divnl = (NodeList) DIV_NODES.evaluate(textnode, XPathConstants.NODESET);
        for (int j = 0; j < divnl.getLength(); j++) {
          Node divnode = divnl.item(j);
          String divtype = attrib(divnode, "type", false);
          String divid = attrib(divnode, "xml:id", true);
          divtypes.put(divid, divtype);

          Map current_paras = new HashMap<>();
          NodeList paranl = (NodeList) PARA_NODES.evaluate(divnode, XPathConstants.NODESET);

          for (int k = 0; k < paranl.getLength(); k++) {
            Node pnode = paranl.item(k);
            String pid = attrib(pnode, "xml:id", true);

            if (pnode.getChildNodes().getLength() != 1
                && !pnode.getFirstChild().getNodeName().equals("#text")) {
              throw new IOException("Unexpected content in p element " + pid);
            }

            String ptext = pnode.getTextContent();
            current_paras.put(pid, ptext);
          }

          current_divs.put(divid, current_paras);
        }

        texts.put(current_text, current_divs);
      }

    } catch (SAXException | XPathExpressionException | IOException e) {
      throw new IOException("Failed to parse NKJP document", e);
    }
    return new NKJPTextDocument(divtypes, texts);
  }

  static NKJPTextDocument parse(File file) throws IOException {
    try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
      return parse(in);
    }
  }

  Map getDivtypes() {
    return Collections.unmodifiableMap(this.divtypes);
  }

  Map>> getTexts() {
    return Collections.unmodifiableMap(this.texts);
  }

  /**
   * Segmentation etc. is done only in relation to the paragraph,
   * which are unique within a document. This is to simplify
   * working with the paragraphs within the document
   * 
   * @return A map of paragraph IDs and their text.
   */
  Map getParagraphs() {
    Map paragraphs = new HashMap<>();
    for (String dockey : texts.keySet()) {
      for (String divkey : texts.get(dockey).keySet()) {
        for (String pkey : texts.get(dockey).get(divkey).keySet()) {
          paragraphs.put(pkey, texts.get(dockey).get(divkey).get(pkey));
        }
      }
    }
    return paragraphs;
  }

  /**
   * Helper method to get the value of an attribute.
   * 
   * @param n The {@link Node} to be processed.
   * @param attrib The name of the attribute.
   * @param required Whether the attribute is required or not.
   *
   * @return The value of the attribute, or null if not required and not present
   * @throws IOException Thrown if IO errors occurred.
   */
  private static String attrib(Node n, String attrib, boolean required) throws IOException {
    if (required && (n.getAttributes() == null || n.getAttributes().getLength() == 0)) {
      throw new IOException("Missing required attributes in node " + n.getNodeName());
    }
    if (n.getAttributes().getNamedItem(attrib) != null) {
      return n.getAttributes().getNamedItem(attrib).getTextContent();
    } else {
      if (required) {
        throw new IOException("Required attribute \"" + attrib + "\" missing in node " + n.getNodeName());
      } else {
        return null;
      }
    }
  }
}