All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.letsmt.LetsmtDocument Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.letsmt;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import javax.xml.parsers.SAXParser;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import opennlp.tools.util.XmlUtil;

/**
 * A structure to hold the letsmt document. The documents contains sentences and depending on the
 * source it either contains tokenized text (words) or an un-tokenized sentence string.
 * 

* The format specification can be found * here. */ public class LetsmtDocument { public static class LetsmtSentence { private String nonTokenizedText; private String[] tokens; public String getNonTokenizedText() { return nonTokenizedText; } public String[] getTokens() { if (tokens != null) { return Arrays.copyOf(tokens, tokens.length); } return null; } } // define a content handler to receive the sax events ... public static class LetsmtDocumentHandler extends DefaultHandler { private List sentences = new ArrayList<>(); private StringBuilder chars = new StringBuilder(); private List tokens = new ArrayList<>(); @Override public void characters(char[] ch, int start, int length) throws SAXException { chars.append(ch, start, length); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); // Note: // words are optional in sentences, if there are no words just the chars have to be captured switch (qName) { case "w": tokens.add(chars.toString().trim()); chars.setLength(0); break; // TODO: The sentence should contain the id, so it can be tracked back to the // place it came from case "s": LetsmtSentence sentence = new LetsmtSentence(); if (tokens.size() > 0) { sentence.tokens = tokens.toArray(new String[tokens.size()]); tokens = new ArrayList<>(); } else { sentence.nonTokenizedText = chars.toString().trim(); } sentences.add(sentence); chars.setLength(0); } } } private List sentences = new ArrayList<>(); private LetsmtDocument(List sentences) { this.sentences = sentences; } public List getSentences() { return Collections.unmodifiableList(sentences); } static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException { SAXParser saxParser = XmlUtil.createSaxParser(); try { XMLReader xmlReader = saxParser.getXMLReader(); LetsmtDocumentHandler docHandler = new LetsmtDocumentHandler(); xmlReader.setContentHandler(docHandler); xmlReader.parse(new InputSource(letsmtXmlIn)); return new 
LetsmtDocument(docHandler.sentences); } catch (SAXException e) { throw new IOException("Failed to parse letsmt xml!", e); } } static LetsmtDocument parse(File file) throws IOException { try (InputStream in = new FileInputStream(file)) { return parse(in); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy