net.sf.okapi.steps.ttxsplitter.TTXSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of okapi-lib Show documentation
There is a newer version: 1.47.0
/*===========================================================================
  Copyright (C) 2014 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.ttxsplitter;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Stack;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import javax.xml.transform.stream.StreamSource;

import net.sf.okapi.common.LocaleId;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TTXSplitter {

	private final Logger logger = LoggerFactory.getLogger(getClass());
	
	private final TTXSplitterParameters params;
	
	private String srcLang;
	private WordCounter counter;
	private long wordCount;
	private long partWC;
	private XMLEventReader reader;
	private int outputState;
	private XMLOutputFactory outFact;
	private XMLEventFactory  evtFact;
	private XMLEventWriter writer;
	private int part;
	private String outputPath;
	private Stack topStack;
	private Stack stack;
	private int level;
	private ArrayList endEvents;
	
	public TTXSplitter (TTXSplitterParameters params) {
		this.params = params;
	}
	
	public void initialize (String srcLang) {
		this.srcLang = srcLang;
		counter = new WordCounter(LocaleId.fromString(srcLang));
	}
	
	public void split (URI inputURI) {
		try {
			outputState = -1;
			process(inputURI);
			if ( wordCount <= params.getPartCount() ) {
				logger.warn("Word count for this document is {} (less than the {} files requested): No split was perfomed.",
					wordCount, params.getPartCount());
				return;
			}
			//TODO: have a param threshold under which we do not split
			
			partWC = wordCount / params.getPartCount();
			logger.info("Word count = {}: {} files of about {} words each.", wordCount, params.getPartCount(), partWC);
			
			// Generate the output files
			outputState = 0;
			part = 0;
			outputPath = new File(inputURI).getAbsolutePath();
			stack = new Stack<>();
			topStack = new Stack<>();
			process(inputURI);
		}
		catch ( Throwable e ) {
			e.printStackTrace();
			logger.error("Could not split the TTX document:\n{}", e.getLocalizedMessage());
		}
	}
	
	private void process (URI inputURI)
		throws XMLStreamException, MalformedURLException, IOException
	{
		try {
			XMLInputFactory inpFact = XMLInputFactory.newInstance();
			inpFact.setProperty(XMLInputFactory.IS_COALESCING, true);
			// security concern. Turn off DTD processing
			// https://www.owasp.org/index.php/XML_External_Entity_%28XXE%29_Processing
			inpFact.setProperty(XMLInputFactory.SUPPORT_DTD, false);			
			
			StreamSource source = new StreamSource(new BufferedInputStream(inputURI.toURL().openStream()));
			reader = inpFact.createXMLEventReader(source);
	
			if ( outputState > -1 ) {
				outFact = XMLOutputFactory.newInstance();
				evtFact = XMLEventFactory.newInstance();
				// Create the closing events
				endEvents = new ArrayList<>();
				endEvents.add(evtFact.createEndElement(new QName("", "Raw"), null));
				endEvents.add(evtFact.createEndElement(new QName("", "Body"), null));
				endEvents.add(evtFact.createEndElement(new QName("", "TRADOStag"), null));
			}
			
			level = 0;
			wordCount = 0;
			boolean inText = false;
			boolean skip = false;
			StringBuilder sb = new StringBuilder();
			String name;
			
			while ( reader.hasNext() ) {
				XMLEvent event = reader.nextEvent();
				switch ( event.getEventType() ) {
				
				case XMLEvent.START_ELEMENT:
					level++;
					StartElement elem = event.asStartElement();
					switch ( elem.getName().getLocalPart() ) {
					case "Tuv":
						String lang = getAttributeValue(elem, "Lang");
						if ( srcLang.equals(lang) ) skip = false;
						else skip = true;
						break;
					case "UserSettings":
						initialize(getAttributeValue(elem, "SourceLanguage"));
						break;
					case "ut":
						if ( writer != null ) writer.add(event);
						skipElement("ut"); level--;
						continue;
					case "Raw":
						inText = true;
						if ( outputState == 0 ) {
							topStack.push(event);
							outputState = 1; // We are done collecting the top events
						}
						if ( outputState > -1 ) {
							startNewOutput();
							continue;
						}
						break;
					case "df":
						if ( writer != null ) stack.push(event);
						break;
					}
					break;
					
				case XMLEvent.END_ELEMENT:
					level--;
					name = event.asEndElement().getName().getLocalPart();
					if ( name.equals("df") ) {
						if ( writer != null ) stack.pop();
					}
					if (( level == 3 ) && inText ) {
						if ( sb.length() > 0 ) {
							wordCount += counter.getWordCount(sb.toString());
							sb.setLength(0);
						}
						// If we went over the part count we need to start the next output file
						if (( wordCount >= partWC ) && ( writer != null ) && ( part < params.getPartCount() )) {
							writer.add(event);
							endOutput();
							startNewOutput();
							continue;
						}
					}
					break;
				
				case XMLEvent.CHARACTERS:
					if ( inText && !skip ) {
						sb.append(event.asCharacters().getData());
					}
				}
				
				if ( outputState == 0 ) {
					topStack.push(event);
				}
				// In all cases: if we have the writer set we output the event
				if ( writer != null ) {
					writer.add(event);
				}
				
			}
		}
		finally {
			close();
		}
	}
	
	private String getAttributeValue (StartElement elem,
		String name)
	{
		Attribute attr = elem.getAttributeByName(new QName("", name));
		if ( attr == null ) return null;
		return attr.getValue();
	}
	
	private void endOutput ()
		throws XMLStreamException
	{
		if ( writer != null ) {
			// Do we have un-closed elements like df
			if ( !stack.isEmpty() ) {
				for ( int i=0; i