All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.okapi.steps.xmlcharfixing.XMLCharFixingStep Maven / Gradle / Ivy

/*===========================================================================
  Copyright (C) 2010-2011 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.xmlcharfixing;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.util.IllegalFormatException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.okapi.common.BOMNewlineEncodingDetector;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.exceptions.OkapiIOException;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.RawDocument;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@UsingParameters(Parameters.class)
public class XMLCharFixingStep extends BasePipelineStep {

	private final Logger LOGGER = LoggerFactory.getLogger(getClass());
	private final Pattern pattern = Pattern.compile("&#(x?)([0-9a-fA-F]+);");
	
	private Parameters params;
	private URI outputURI;
	private int count;

	public XMLCharFixingStep () {
		params = new Parameters();
	}
	
	public String getDescription () {
		return "Fixes invalid characters in XML documents."
			+ " Expects: raw document. Sends back: raw document.";
	}

	public String getName () {
		return "XML Characters Fixing";
	}

	@Override
	public Parameters getParameters () {
		return params;
	}

	@Override
	public void setParameters (IParameters params) {
		this.params = (Parameters)params;
	}
 
	@StepParameterMapping(parameterType = StepParameterType.OUTPUT_URI)
	public void setOutputURI (URI outputURI) {
		this.outputURI = outputURI;
	}
	
	public URI getOutputURI() {
		return outputURI;
	}
	
	@Override
	protected Event handleStartBatch (Event event) {
		count = 0;
		return event;
	}
	
	@Override
	protected Event handleEndBatch (Event event) {
		LOGGER.info("Number of invalid characters replaced = {}", count);
		return event;
	}
	
	@Override
	protected Event handleRawDocument (Event event) {
		FileOutputStream output = null;
		BufferedReader reader = null;
		OutputStreamWriter writer = null;
		
		try {
			RawDocument rd = event.getRawDocument();

			// Open the input
			BOMNewlineEncodingDetector detector = new BOMNewlineEncodingDetector(rd.getStream(), rd.getEncoding());
			detector.detectAndRemoveBom();
			rd.setEncoding(detector.getEncoding());
			String lineBreak = detector.getNewlineType().toString();
			reader = new BufferedReader(new InputStreamReader(detector.getInputStream(), rd.getEncoding()));
			
			// Open the output
			File outFile;
			if ( isLastOutputStep() ) {
				outFile = rd.createOutputFile(outputURI);
			}
			else {
				try {
					outFile = File.createTempFile("~okapi-72_okp-xcf_", ".tmp");
				}
				catch ( Throwable e ) {
					throw new OkapiIOException("Cannot create temporary output.", e);
				}
			}
			output = new FileOutputStream(outFile);
			writer = new OutputStreamWriter(new BufferedOutputStream(output), rd.getEncoding());
			// Write BOM if there was one
			Util.writeBOMIfNeeded(writer, detector.hasUtf8Bom(), rd.getEncoding());

			// In XML 1.0 the valid characters are:
			// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

			// Process
			StringBuilder tmp = new StringBuilder();
			String line;
			Matcher m = null;
			
			while ( (line = reader.readLine()) != null ) {;
				// Process that line
				tmp.setLength(0);
				tmp.append(line);
				int ch;
				for ( int i=0; i= 0x0020 ) && ( ch <= 0xD7FF )) {
							continue; // Valid
						}
						if (( ch >= 0xE000 ) && ( ch <= 0xFFFF )) {
							continue; // Valid
						}
						if (( ch >= 0x10000 ) && ( ch <= 0x10FFFF )) {
							i++; // Skip extra character for the code-point
							continue; // Valid
						}
						// Else: it's an invalid character
						String repl = String.format(params.getReplacement(), ch);
						tmp.replace(i, i+(ch>0xFFFF ? 2 : 1), repl);
						i += (repl.length()-1); // Move cursor just before next
						count++;
						continue;
					}
				}
				
				// Check for decimal NCRs
				int start = 0;
				do {
					m = pattern.matcher(tmp.toString());
					if ( !m.find(start) ) break; // Done
					try {
						int n = Integer.parseInt(m.group(2), m.group(1).isEmpty() ? 10 : 16);
						start = m.start();
						if ( !isValid(n) ) {
							String repl = String.format(params.getReplacement(), n);
							tmp.replace(start, m.end(), repl);
							start += (repl.length()-m.group().length()); // Move cursor on next
							count++;
						}
						else start = m.end();
					}
					catch ( NumberFormatException e ) {
						LOGGER.error("Invalid NCR: '{}'", m.group());
					}
					
				}
				while ( true );
				
				// Line has been processed, write it back
				writer.write(tmp.toString()+lineBreak);
			}

			// Done: close the files
			reader.close(); reader = null;
			writer.close(); writer = null;
			rd.finalizeOutput();
			
			// Creates the new RawDocument
			event.setResource(new RawDocument(outFile.toURI(), rd.getEncoding(), 
				rd.getSourceLocale(), rd.getTargetLocale()));
		}
		catch ( IllegalFormatException e ) {
			LOGGER.error("Invalid replacement format: '{}'", params.getReplacement());
		}
		catch ( Exception e ) {
			LOGGER.error("Error while processing XML for invalid characters.");
		}
		finally {
			try {
				if ( writer != null ) {
					writer.close();
				}
				if ( reader != null ) {
					reader.close();
				}
			}
			catch ( IOException e ) {
				throw new OkapiIOException("IO error while closing.", e);
			}
		}

		return event;
	}
	
	private boolean isValid (int value ) {
		switch ( value ) {
		case 0x0009: // Tab is allowed
		case 0x000A: // Line-feed is allowed
		case 0x000D: // Carriage-return is allowed
			return true;
		default:
			if (( value >= 0x0020 ) && ( value <= 0xD7FF )) {
				return true; // Valid
			}
			if (( value >= 0xE000 ) && ( value <= 0xFFFF )) {
				return true; // Valid
			}
			if (( value >= 0x10000 ) && ( value <= 0x10FFFF )) {
				return true; // Valid
			}
			// Else: it's an invalid character
			return false;
		}

	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy