
// net.sf.okapi.steps.encodingconversion.EncodingConversionStep Maven / Gradle / Ivy
/*===========================================================================
Copyright (C) 2009-2011 by the Okapi Framework contributors
-----------------------------------------------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
===========================================================================*/
package net.sf.okapi.steps.encodingconversion;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.okapi.common.BOMNewlineEncodingDetector;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.HTMLCharacterEntities;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.exceptions.OkapiIOException;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.RawDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@UsingParameters(Parameters.class)
public class EncodingConversionStep extends BasePipelineStep {
// Logger bound to the runtime class of this step.
private final Logger logger = LoggerFactory.getLogger(getClass());
// Initial capacity, in characters, of the working buffer allocated at start of batch.
private static final int MAXBUF = 1024;
// Step parameters; replaced through setParameters().
private Parameters params;
// Format string selected at start of batch from the escape notation parameter.
private String outFormat;
// Encoder for the output encoding, created when a raw document is handled.
private CharsetEncoder outputEncoder;
// True when the selected escape notation is Parameters.ESCAPE_CER.
private boolean useCER;
// Working buffer shared by the read, declaration-check, split-check and un-escape code.
private CharBuffer buffer;
// Compiled un-escaping pattern; null when no un-escape option is enabled.
private Pattern pattern;
// Patterns used to locate XML and HTML declarations in the first block of the document.
private Pattern xmlEncDecl;
private Pattern xmlDecl;
private Pattern htmlEncDecl;
private Pattern htmlDecl;
private Pattern htmlHead;
// Tail of the previous block held back by checkSplitSequence(); re-injected before the next read.
private String prevBuf;
// File-type flags; initially derived from the extension of the input URI path.
private boolean isXML;
private boolean isHTML;
// Values injected through the @StepParameterMapping setters.
private URI outputURI;
private URI inputURI;
private String outputEncoding;
// Character entity table used when escaping to, or un-escaping from, entity references.
private HTMLCharacterEntities entities;
/**
 * Creates a new step holding a fresh {@link Parameters} object and a fresh
 * {@link HTMLCharacterEntities} table.
 */
public EncodingConversionStep () {
    this.params = new Parameters();
    this.entities = new HTMLCharacterEntities();
}
/**
 * Sets the URI of the output document (mapped to the OUTPUT_URI step parameter).
 *
 * @param outputURI the URI of the output document.
 */
@StepParameterMapping(parameterType = StepParameterType.OUTPUT_URI)
public void setOutputURI (URI outputURI) {
    this.outputURI = outputURI;
}
/**
 * Sets the URI of the input document (mapped to the INPUT_URI step parameter).
 * The extension of its path is used to guess whether the document is XML or HTML.
 *
 * @param inputURI the URI of the input document.
 */
@StepParameterMapping(parameterType = StepParameterType.INPUT_URI)
public void setInputURI (URI inputURI) {
    this.inputURI = inputURI;
}
/**
 * Sets the name of the encoding to convert to (mapped to the OUTPUT_ENCODING
 * step parameter).
 *
 * @param outputEncoding the name of the output encoding.
 */
@StepParameterMapping(parameterType = StepParameterType.OUTPUT_ENCODING)
public void setOutputEncoding (String outputEncoding) {
    this.outputEncoding = outputEncoding;
}
/**
 * Gets a short, human-readable description of what this step does.
 *
 * @return the description text.
 */
public String getDescription () {
    final String description = "Convert the character set encoding of a text-based file."
        + " Expects: raw document. Sends back: raw document.";
    return description;
}
/**
 * Gets the display name of this step.
 *
 * @return the name of the step.
 */
public String getName () {
    final String name = "Encoding Conversion";
    return name;
}
/**
 * Gets the parameters object currently used by this step.
 *
 * @return the current parameters.
 */
@Override
public IParameters getParameters () {
    return this.params;
}
/**
 * Replaces the parameters object used by this step.
 *
 * @param params the new parameters; it is cast to {@link Parameters}, so any
 * other implementation results in a ClassCastException.
 */
@Override
public void setParameters (IParameters params) {
    final Parameters typed = (Parameters)params;
    this.params = typed;
}
/**
 * Prepares the per-batch state: allocates the working buffer, compiles the
 * patterns used to find XML/HTML declarations, builds the un-escaping pattern
 * from the parameters, and selects the format used to escape characters on output.
 *
 * @param event the start-batch event.
 * @return the same event, unchanged.
 */
@Override
protected Event handleStartBatch (Event event) {
    buffer = CharBuffer.allocate(MAXBUF);

    // Pre-compile the patterns for declaration detection
    xmlEncDecl = Pattern.compile("((<\\?xml)(.*?)(encoding(\\s*?)=(\\s*?)(\\'|\\\")))", Pattern.DOTALL);
    xmlDecl = Pattern.compile("((<\\?xml)(.*?)(version(\\s*?)=(\\s*?)(\\'|\\\")))", Pattern.DOTALL);
    // The declaration check reads the charset value from group 11, so the
    // pattern must start with the two groups "(<meta)" and "([^>]*?)".
    htmlEncDecl = Pattern.compile("(<meta)([^>]*?)(content)(\\s*?)=(\\s*?)[\\'|\\\"](\\s*?)text/html(\\s*?);(\\s*?)charset(\\s*?)=(\\s*?)([^\\s]+?)(\\s|\\\"|\\')",
        Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    // Start of the <html> tag: the declaration check then searches for the next '>' itself.
    htmlDecl = Pattern.compile("(<html)", Pattern.CASE_INSENSITIVE);
    // Whole <head> tag: the declaration check inserts right after the end of the match.
    htmlHead = Pattern.compile("<head>", Pattern.CASE_INSENSITIVE);

    // Pre-compile pattern for un-escaping
    String tmp = "";
    if ( params.getUnescapeNCR() ) {
        // Decimal "&#DDD;" and hexadecimal "&#xHHH;" forms; unescape() relies
        // on the '#' being at index 1 and the 'x' at index 2 of the match.
        tmp += "&#([0-9]*?);|&#[xX]([0-9a-fA-F]*?);";
    }
    if ( params.getUnescapeCER() ) {
        if ( tmp.length() > 0 ) tmp += "|";
        tmp += "(&\\w*?;)";
    }
    if ( params.getUnescapeJava() ) {
        if ( tmp.length() > 0 ) tmp += "|";
        tmp += "(\\\\[Uu]([0-9a-fA-F]{1,4}))";
    }
    if ( tmp.length() > 0 ) {
        pattern = Pattern.compile(tmp, Pattern.CASE_INSENSITIVE);
        entities.ensureInitialization(false);
    }
    else pattern = null;

    // Select the notation used to escape characters on output
    useCER = false;
    switch ( params.getEscapeNotation() ) {
    case Parameters.ESCAPE_CER:
        useCER = true;
        entities.ensureInitialization(false);
        outFormat = "&#x%X;"; // Here outFormat is used only if no CER can be used
        break;
    case Parameters.ESCAPE_JAVAL:
        outFormat = "\\u%04x";
        break;
    case Parameters.ESCAPE_JAVAU:
        outFormat = "\\u%04X";
        break;
    case Parameters.ESCAPE_NCRDECI:
        outFormat = "&#%d;";
        break;
    case Parameters.ESCAPE_NCRHEXAL:
        outFormat = "&#x%x;";
        break;
    case Parameters.ESCAPE_USERFORMAT:
        outFormat = params.getUserFormat();
        break;
    case Parameters.ESCAPE_NCRHEXAU:
    default:
        outFormat = "&#x%X;";
        break;
    }

    return event;
}
/**
 * Handles a raw document: guesses the file type from the extension of the input
 * URI, detects the input encoding (from a BOM first, then from a declaration
 * check when the BOM detector did not auto-detect it), opens a writer on the
 * output file using the output encoding, and then processes the input block by
 * block through the shared buffer.
 * NOTE(review): in this copy the text is damaged from the output loop onward
 * (the 'for' statement there is not valid Java), and the end of this method
 * together with the start of the following declaration-check method are
 * missing. The code must be restored from the upstream source before it can
 * compile; the lines below are left exactly as found.
 */
@Override
protected Event handleRawDocument (Event event) {
RawDocument rawDoc = (RawDocument)event.getResource();
BufferedReader reader = null;
OutputStreamWriter writer = null;
try {
// Try to detect the type of file from extension
isXML = false;
isHTML = false;
String ext = Util.getExtension(inputURI.getPath());
if ( !Util.isEmpty(ext) ) {
isHTML = (ext.toLowerCase().indexOf(".htm")==0);
isXML = ext.equalsIgnoreCase(".xml");
}
//=== Try to detect the encoding
InputStream is = rawDoc.getStream();
// First: guess from a possible BOM
BOMNewlineEncodingDetector detector = new BOMNewlineEncodingDetector(is, rawDoc.getEncoding());
detector.detectAndRemoveBom();
rawDoc.setEncoding(detector.getEncoding());
String inputEncoding = rawDoc.getEncoding();
// Then try internal detection for XML/HTML type files
if ( !detector.isAutodetected() ) {
reader = new BufferedReader(rawDoc.getReader());
// Load the first block into the shared buffer before the declaration check
reader.read(buffer);
String detectedEncoding = checkDeclaration(inputEncoding);
if ( !detectedEncoding.equalsIgnoreCase(inputEncoding) ) {
inputEncoding = detectedEncoding;
}
reader.close();
}
// Open the input document
//TODO: Where did we reset the reader - can't call this twice unless we reset it
reader = new BufferedReader(rawDoc.getReader());
logger.info("Input encoding: {}", inputEncoding);
// Open the output document
File outFile;
// The last output step writes to the output URI; any other step writes to a temporary file
if ( isLastOutputStep() ) {
outFile = rawDoc.createOutputFile(outputURI);
}
else {
try {
outFile = File.createTempFile("~okapi-40_okp-enc_", ".tmp");
}
catch ( Throwable e ) {
throw new OkapiIOException("Cannot create temporary output.", e);
}
}
writer = new OutputStreamWriter(new BufferedOutputStream(
new FileOutputStream(outFile)), outputEncoding);
outputEncoder = Charset.forName(outputEncoding).newEncoder();
logger.info("Output encoding: {}", outputEncoding);
Util.writeBOMIfNeeded(writer, params.getBOMonUTF8(), outputEncoding);
int n;
CharBuffer tmpBuf = CharBuffer.allocate(1);
ByteBuffer encBuf;
boolean canEncode;
boolean checkDeclaration = true;
while ( true ) {
buffer.clear();
// Start with previous buffer remains if needed
if ( prevBuf != null ) {
buffer.append(prevBuf);
}
// Read the next block
n = reader.read(buffer);
// Check if we need to stop here
boolean needSplitCheck = true;
if ( n == -1 ) {
// Make sure we do not start an endless loop by
// re-checking the last previous buffer
if ( prevBuf != null ) {
needSplitCheck = false;
prevBuf = null;
buffer.limit(buffer.position());
}
else break; // No previous, no read: Done
}
// The declaration check runs only once, on the first block
if ( checkDeclaration ) {
checkDeclaration(inputEncoding);
checkDeclaration = false;
}
// Un-escape if requested
if ( pattern != null ) {
if ( needSplitCheck ) checkSplitSequence();
unescape();
}
// Output
n = buffer.position();
buffer.position(0);
// NOTE(review): the text is damaged from the next line onward (see the header comment)
for ( int i=0; i 127 )) || !canEncode ) {
boolean fallBack = false;
// Write escape form
if ( useCER ) {
String tmp = entities.getName(buffer.get(i));
if ( tmp == null ) fallBack = true;
else writer.write("&"+tmp+";");
}
else {
if ( params.getUseBytes() ) { // Escape bytes
if ( canEncode ) {
tmpBuf.put(0, buffer.get(i));
tmpBuf.position(0);
encBuf = outputEncoder.encode(tmpBuf);
for ( int j=0; j");
}
}
}
// Look for HTML declarations
m = htmlEncDecl.matcher(text);
if ( m.find() ) {
isHTML = true;
// Group 11 contains the encoding name
encoding = m.group(11);
// Replace it by the new encoding
int n = text.indexOf(encoding, m.start());
text.replace(n, n+encoding.length(), outputEncoding);
}
else if ( isHTML ) { // No HTML encoding found, but try to update if it was seen as HTML from extension
// Try to place it after
m = htmlHead.matcher(text);
if ( m.find() ) {
text.insert(m.end(), String.format(
"",
outputEncoding));
}
else { // If no , try
m = htmlDecl.matcher(text);
if ( m.find() ) {
int n = text.indexOf(">", m.end());
if ( n != -1 ) {
text.insert(n+1, String.format(
"",
outputEncoding));
}
}
}
}
// Convert the string back to a CharBuffer
int len = text.length();
// Make sure we have room for added characters
if ( len > buffer.capacity() ) {
buffer = CharBuffer.allocate(len);
}
else {
buffer.clear();
}
buffer.append(text.toString());
buffer.limit(len);
return encoding;
}
/**
 * Holds back the tail of the current block when it may contain the beginning
 * of an escape sequence that continues in the next block.
 * The last 10 (or fewer) characters are scanned backward for an ampersand or
 * a backslash. When one is found, the text from that character to the end is
 * stored in prevBuf and the buffer position and limit are moved back to that
 * character. When none is found, prevBuf is null and position and limit are
 * both set to the current content length.
 */
private void checkSplitSequence () {
    final int end = buffer.position();
    buffer.position(0);
    prevBuf = null;

    // Lowest index to inspect: at most 10 characters from the end
    final int lowest = Math.max(0, end - 10);
    int cut = end;
    int idx = end - 1;
    while ( idx >= lowest ) {
        if (( buffer.charAt(idx) == '&' ) || ( buffer.charAt(idx) == '\\' )) {
            prevBuf = buffer.subSequence(idx, end).toString();
            cut = idx;
            break;
        }
        idx--;
    }

    buffer.position(cut);
    buffer.limit(cut);
}
/**
 * Replaces, in the working buffer, the escape sequences matched by the
 * un-escaping pattern with the characters they denote.
 * The content is taken from index 0 to the buffer position on entry; on exit
 * the position is the length of the resulting content.
 * A sequence is left untouched when it cannot be resolved, when it denotes one
 * of the markup-significant characters (double quote, apostrophe, ampersand,
 * less-than, greater-than), or when its value does not fit in a single char.
 */
private void unescape () {
    final int len = buffer.position();
    buffer.position(0);
    final Matcher m = pattern.matcher(buffer);
    final StringBuilder tmp = new StringBuilder(len);
    int pos = 0;
    boolean found = false;

    while ( m.find(pos) ) {
        found = true;
        // Copy the text between the previous match and this one
        if ( m.start() > pos ) {
            tmp.append(buffer.subSequence(pos, m.start()));
        }
        pos = m.end();

        final String seq = m.group();
        final int value = parseEscape(seq);
        switch ( value ) {
        case 0x22: // "
        case 0x27: // '
        case 0x26: // &
        case 0x3C: // <
        case 0x3E: // >
            tmp.append(seq); // Keep those escaped
            break;
        default:
            if (( value < 0 ) || ( value > Character.MAX_VALUE )) {
                // Unidentified, or outside the range of a char: a cast would
                // silently produce a different character, so keep the escape
                tmp.append(seq);
            }
            else {
                tmp.append((char)value);
            }
            break;
        }
    }

    if ( !found ) { // Nothing to un-escape: restore the position
        buffer.position(len);
        return;
    }
    // Copy the text after the last match and re-build the buffer
    if ( pos < len ) {
        tmp.append(buffer.subSequence(pos, len));
    }
    buffer.clear();
    buffer.append(tmp.toString(), 0, tmp.length());
}

/**
 * Resolves one matched escape sequence to a character value.
 * The kind of sequence is decided from its prefix (not from the position of a
 * letter inside it), so entity names such as "&exist;" are not mistaken for
 * hexadecimal references, and upper-case "&#X" and "\U" prefixes are accepted.
 *
 * @param seq the matched text.
 * @return the character value, or -1 if the sequence cannot be resolved.
 */
private int parseEscape (String seq) {
    try {
        if ( seq.regionMatches(true, 0, "&#x", 0, 3) ) {
            // Hexadecimal NCR "&#xHHH;"
            return Integer.parseInt(seq.substring(3, seq.length()-1), 16);
        }
        if ( seq.startsWith("&#") ) {
            // Decimal NCR "&#DDD;"
            return Integer.parseInt(seq.substring(2, seq.length()-1));
        }
        if ( seq.regionMatches(true, 0, "\\u", 0, 2) ) {
            // Java style: backslash, 'u', then hexadecimal digits
            return Integer.parseInt(seq.substring(2), 16);
        }
    }
    catch ( NumberFormatException e ) {
        // No digits, or a value too large for an int: leave the text as it is
        return -1;
    }
    if ( seq.length() < 2 ) {
        return -1; // Too short to be "&NAME;"
    }
    // Character entity reference "&NAME;" (the lookup is expected to give -1 for an unknown name)
    return entities.lookupName(seq.substring(1, seq.length()-1));
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy