All downloads are free. The search and download functionality uses the official Maven repository.

opennlp.tools.formats.letsmt.LetsmtDocument Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.letsmt;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import javax.xml.parsers.SAXParser;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import opennlp.tools.util.XmlUtil;

/**
 * A structure to hold the letsmt document. The documents contain sentences and depending on the
 * source it either contains tokenized text (words) or an un-tokenized sentence string.
 * 

* The format specification can be found * * here. */ public class LetsmtDocument { private static final String ORG_XML_FEATURES_DISALLOW_DOCTYPE_DECL = "http://apache.org/xml/features/disallow-doctype-decl"; public static class LetsmtSentence { private String nonTokenizedText; private String[] tokens; public String getNonTokenizedText() { return nonTokenizedText; } public String[] getTokens() { if (tokens != null) { return Arrays.copyOf(tokens, tokens.length); } return null; } } /** * A {@link DefaultHandler content handler} to receive and process SAX events. */ public static class LetsmtDocumentHandler extends DefaultHandler { private final List sentences = new ArrayList<>(); private final StringBuilder chars = new StringBuilder(); private List tokens = new ArrayList<>(); @Override public void characters(char[] ch, int start, int length) throws SAXException { chars.append(ch, start, length); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); // Note: // words are optional in sentences, if there are no words just the chars have to be captured switch (qName) { case "w": tokens.add(chars.toString().trim()); chars.setLength(0); break; // TODO: The sentence should contain the id, so it can be tracked back to the // place it came from case "s": LetsmtSentence sentence = new LetsmtSentence(); if (tokens.size() > 0) { sentence.tokens = tokens.toArray(new String[0]); tokens = new ArrayList<>(); } else { sentence.nonTokenizedText = chars.toString().trim(); } sentences.add(sentence); chars.setLength(0); } } } private final List sentences; private LetsmtDocument(List sentences) { this.sentences = sentences; } /** * @return Retrieves the sentences of a {@link LetsmtDocument}. */ public List getSentences() { return Collections.unmodifiableList(sentences); } /** * @param letsmtXmlIn The {@link InputStream} referencing the document to parse. 
* * @return A valid {@link LetsmtDocument} instance. * @throws IOException Thrown if IO errors occurred during loading or parsing. */ static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException { SAXParser saxParser = XmlUtil.createSaxParser(); try { XMLReader xmlReader = saxParser.getXMLReader(); LetsmtDocumentHandler docHandler = new LetsmtDocumentHandler(); xmlReader.setContentHandler(docHandler); xmlReader.setFeature(ORG_XML_FEATURES_DISALLOW_DOCTYPE_DECL, true); xmlReader.parse(new InputSource(letsmtXmlIn)); return new LetsmtDocument(docHandler.sentences); } catch (SAXException e) { throw new IOException("Failed to parse letsmt xml!", e); } } /** * @param file The {@link File} referencing the document to parse. * * @return A valid {@link LetsmtDocument} instance. * @throws IOException Thrown if IO errors occurred during loading or parsing. */ static LetsmtDocument parse(File file) throws IOException { try (InputStream in = new BufferedInputStream(new FileInputStream(file))) { return parse(in); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy