// net.sf.okapi.steps.msbatchtranslation.MSBatchTranslationStep (Maven / Gradle / Ivy)
/*===========================================================================
Copyright (C) 2011-2018 by the Okapi Framework contributors
-----------------------------------------------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
===========================================================================*/
package net.sf.okapi.steps.msbatchtranslation;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.okapi.common.Base64;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
import net.sf.okapi.common.annotation.GenericAnnotation;
import net.sf.okapi.common.annotation.GenericAnnotationType;
import net.sf.okapi.common.annotation.GenericAnnotations;
import net.sf.okapi.common.exceptions.OkapiIOException;
import net.sf.okapi.common.filterwriter.TMXWriter;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.query.QueryResult;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.MultiEvent;
import net.sf.okapi.common.resource.PipelineParameters;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.connectors.microsoft.MicrosoftMTConnector;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Pipeline step that sends the translatable content of text units to the
 * Microsoft Translator connector in batches.
 * <p>Filter events are buffered until either {@code maxEvents} events are
 * stored or an event that forces processing arrives. The buffered text units
 * are then translated with one batch query and, depending on the parameters,
 * the results are written to a TMX document, attached as alternate-translation
 * annotations and/or copied into empty targets. The buffered events are finally
 * sent down the pipeline wrapped in a single {@link MultiEvent}.
 * <p>When both "make TMX" and "send TMX" are set, the step sends no-operation
 * events instead of the filter events and, once the last input document has
 * ended, emits the events needed to process the generated TMX as a new raw
 * document.
 */
@UsingParameters(Parameters.class)
public class MSBatchTranslationStep extends BasePipelineStep {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Variable of the category setting that is replaced by the first ITS Domain value found. */
    public static final String DOMAINVAR = "${domain}";

    /** Number of events buffered before a query when the parameter value is out of range. */
    private static final int MAXEVENTS = 20;

    /** Predefined-engine key written as {@code @@@key@@@}. */
    private static final Pattern AT_KEY_PATTERN = Pattern.compile("@@@(.+?)@@@");

    /** Predefined-engine key written as {@code ***key***}. */
    private static final Pattern STAR_KEY_PATTERN = Pattern.compile("\\*\\*\\*(.+?)\\*\\*\\*");

    private Parameters params;
    private TMXWriter tmxWriter;
    private LinkedList<Event> events;
    private int maxEvents = MAXEVENTS;
    private MicrosoftMTConnector conn;
    private LocaleId sourceLocale;
    private LocaleId targetLocale;
    private String rootDir;
    private String inputRootDir;
    private Map<String, String> attributes;
    private boolean needReset;
    private boolean sendTmx;
    private String tmxOutputPath;
    private int batchInputCount;
    private int count;
    private String tempCategory;
    private String computedCategory;

    /**
     * Creates the step with a default set of parameters.
     */
    public MSBatchTranslationStep () {
        params = new Parameters();
    }

    /**
     * Finishes and closes the TMX output (if one is open) and drops the
     * buffered events. Safe to call more than once.
     */
    private void closeAndClean () {
        if ( tmxWriter != null ) {
            tmxWriter.writeEndDocument();
            tmxWriter.close();
            tmxWriter = null;
        }
        if ( events != null ) {
            events.clear();
            events = null;
        }
    }

    @Override
    public String getDescription () {
        return "Annotates text units with Microsoft Translator matches or/and creates a TM from them."
            + " Expects: filter events. Sends back: filter events.";
    }

    @Override
    public String getName () {
        return "Microsoft Batch Translation";
    }

    /**
     * Sets the source locale used for the queries and the TMX output.
     * @param sourceLocale the source locale.
     */
    @StepParameterMapping(parameterType = StepParameterType.SOURCE_LOCALE)
    public void setSourceLocale (LocaleId sourceLocale) {
        this.sourceLocale = sourceLocale;
    }

    /**
     * Sets the target locale used for the queries and the TMX output.
     * @param targetLocale the target locale.
     */
    @StepParameterMapping(parameterType = StepParameterType.TARGET_LOCALE)
    public void setTargetLocale (LocaleId targetLocale) {
        this.targetLocale = targetLocale;
    }

    /**
     * Sets the value substituted for the root-directory variable in paths.
     * @param rootDir the root directory.
     */
    @StepParameterMapping(parameterType = StepParameterType.ROOT_DIRECTORY)
    public void setRootDirectory (String rootDir) {
        this.rootDir = rootDir;
    }

    /**
     * Sets the value substituted for the input-root-directory variable in paths.
     * @param inputRootDir the input root directory.
     */
    @StepParameterMapping(parameterType = StepParameterType.INPUT_ROOT_DIRECTORY)
    public void setInputRootDirectory (String inputRootDir) {
        this.inputRootDir = inputRootDir;
    }

    /**
     * Sets the number of input documents in the batch. It is compared with the
     * number of batch items seen to detect the last document.
     * @param batchInputCount the number of input documents.
     */
    @StepParameterMapping(parameterType = StepParameterType.BATCH_INPUT_COUNT)
    public void setBatchInputCount (int batchInputCount) {
        this.batchInputCount = batchInputCount;
    }

    @Override
    public Parameters getParameters () {
        return params;
    }

    @Override
    public void setParameters (IParameters params) {
        this.params = (Parameters)params;
    }

    /**
     * Replaces the root-directory, input-root-directory and locale variables
     * of a path with their current values.
     * @param path the path to expand.
     * @return the expanded path.
     */
    private String expandPathVariables (String path) {
        String expanded = Util.fillRootDirectoryVariable(path, rootDir);
        expanded = Util.fillInputRootDirectoryVariable(expanded, inputRootDir);
        return LocaleId.replaceVariables(expanded, sourceLocale, targetLocale);
    }

    /**
     * Computes the category to send to the engine.
     * <p>A value without a {@code @@@key@@@} or {@code ***key***} marker is used
     * as-is (null meaning empty). Otherwise the key is looked up in the
     * properties file given by the configuration path, suffixed with a dot and
     * the upper-cased language code of the target locale. If the key is not
     * found, its last dot-separated part is removed and the lookup is repeated
     * until a value is found or no dot is left.
     * @param initialValue the category setting, possibly null.
     * @return the category to use, or an empty string if the key cannot be
     * parsed or no entry is found.
     * @throws OkapiIOException if no properties file is specified or if it
     * cannot be loaded.
     */
    private String computeCategory (String initialValue) {
        boolean predefined = ( initialValue != null )
            && ( initialValue.contains("@@@") || initialValue.contains("***") );
        if ( !predefined ) {
            // Use the specified category (null means empty)
            String plain = ( initialValue == null ) ? "" : initialValue;
            logger.info("Using category '{}'.", plain);
            return plain;
        }

        // Parse the key
        Pattern pattern = initialValue.contains("@@@") ? AT_KEY_PATTERN : STAR_KEY_PATTERN;
        Matcher matcher = pattern.matcher(initialValue);
        if ( !matcher.find() ) {
            logger.error("Not able to parse predefined engine string '{}'. Using empty category.", initialValue);
            return "";
        }
        String parsedKey = matcher.group(1);

        // Locate the properties file
        String propFile = params.getConfigPath();
        if ( propFile != null ) {
            propFile = expandPathVariables(propFile.trim());
        }
        if ( Util.isEmpty(propFile) ) {
            throw new OkapiIOException("No engine mapping property file specified. Using empty category.");
        }

        Properties prop = new Properties();
        // try-with-resources so the stream is closed on every exit path
        try ( FileInputStream input = new FileInputStream(propFile) ) {
            prop.load(input);
            // Locale.ROOT keeps the key stable whatever the default locale is
            // (e.g. "it" must give "IT" even under a Turkish default locale)
            String langSuffix = "." + targetLocale.toJavaLocale().getLanguage().toUpperCase(Locale.ROOT);

            String keyWithLoc = parsedKey + langSuffix;
            String category = prop.getProperty(keyWithLoc);
            if ( category != null ) {
                logger.info("Found engine '{}'. Using category '{}'.", keyWithLoc, category);
                return Base64.decodePassword(category);
            }

            logger.warn("Cannot find engine '{}'. Try fallback.", keyWithLoc);
            // Shorten the key one dot-separated part at a time
            int index = parsedKey.lastIndexOf('.');
            while ( index != -1 ) {
                parsedKey = parsedKey.substring(0, index);
                keyWithLoc = parsedKey + langSuffix;
                category = prop.getProperty(keyWithLoc);
                if ( category != null ) {
                    logger.info("Found fallback engine '{}'. Using category '{}'.", keyWithLoc, category);
                    return Base64.decodePassword(category);
                }
                index = parsedKey.lastIndexOf('.');
            }
        }
        catch ( IOException ex ) {
            // Keep the cause so the underlying I/O problem is not lost
            throw new OkapiIOException("Can't load: "+propFile+".", ex);
        }

        logger.warn("No engine found. Using empty category.");
        return "";
    }

    /**
     * Prepares the step for a new batch: resets the counters and the event
     * buffer, configures the connector and, if requested, opens the TMX output.
     * @param event the start-batch event.
     * @return the same event.
     */
    @Override
    protected Event handleStartBatch (Event event) {
        count = 0;
        events = new LinkedList<>();
        maxEvents = params.getMaxEvents();
        if (( maxEvents < 1 ) || ( maxEvents > 1000 )) {
            maxEvents = MAXEVENTS;
        }

        // Initialize the engine
        conn = new MicrosoftMTConnector();
        net.sf.okapi.connectors.microsoft.Parameters prm = conn.getParameters();
        prm.setAzureKey(params.getAzureKey());

        // Initialize the category: if it has the domain variable it is
        // computed later, when the first Domain annotation is found
        tempCategory = params.getCategory();
        if (( tempCategory != null ) && tempCategory.contains(DOMAINVAR) ) {
            computedCategory = null;
            // Use an empty category for any entries before the batch with the first Domain annotation
            prm.setCategory("");
        }
        else {
            // Otherwise: set it with the real value now (a null category is treated as empty)
            computedCategory = computeCategory(tempCategory);
            prm.setCategory(computedCategory);
        }

        conn.setLanguages(sourceLocale, targetLocale);
        conn.setMaximumHits(params.getMaxMatches());
        conn.setThreshold(params.getThreshold());

        // The TMX can only be passed along if it is created
        sendTmx = params.getSendTmx() && params.getMakeTmx();

        // Create the TMX output if requested
        if ( params.getMakeTmx() ) {
            tmxOutputPath = expandPathVariables(params.getTmxPath());
            tmxWriter = new TMXWriter(tmxOutputPath);
            tmxWriter.writeStartDocument(sourceLocale, targetLocale, getClass().getCanonicalName(),
                "1", "sentence", null, "unknown");
            // Set the attributes to write in the TMX
            attributes = new Hashtable<>();
            if ( params.getMarkAsMT() ) {
                attributes.put("creationid", Util.MTFLAG);
            }
            attributes.put("Txt::Origin", "Microsoft-Translator");
        }
        return event;
    }

    /**
     * Dispatches an incoming event: buffers it, forces the processing of the
     * buffer, cleans up, or lets it pass, depending on its type.
     * @param event the event to handle.
     * @return the event to send down the pipeline. When the TMX is passed
     * along, this is a no-operation event for everything except the start of
     * the batch and the end of the last document.
     */
    @Override
    public Event handleEvent (Event event) {
        Event tempEvent;
        switch ( event.getEventType() ) {
        case START_BATCH:
            return handleStartBatch(event);

        // Events to store until the next trigger
        case TEXT_UNIT:
        case DOCUMENT_PART:
        case START_GROUP:
        case END_GROUP:
        case START_SUBFILTER:
        case END_SUBFILTER:
            tempEvent = storeAndPossiblyProcess(event, false);
            if ( !sendTmx ) {
                return tempEvent;
            }
            break; // A NOOP is sent when passing the TMX

        // Events that force the trigger if needed
        case CUSTOM:
        case MULTI_EVENT:
        case START_SUBDOCUMENT: // Could have text units between start document and sub-document
        case END_SUBDOCUMENT:
            tempEvent = storeAndPossiblyProcess(event, true);
            if ( !sendTmx ) {
                return tempEvent;
            }
            break; // A NOOP is sent when passing the TMX

        // Events that should clean up
        case CANCELED:
        case END_BATCH:
            closeAndClean();
            break;

        case START_BATCH_ITEM:
            // Keep track of files processed
            count++;
            break;

        case END_DOCUMENT:
            tempEvent = storeAndPossiblyProcess(event, true);
            if ( !sendTmx ) {
                return tempEvent;
            }
            if ( count >= batchInputCount ) {
                // Last document done: close the TMX and send it as the new input
                closeAndClean();
                return generateAltOutput(tmxOutputPath, "UTF-8", sourceLocale, targetLocale, "okf_tmx");
            }
            break;

        // Events before any storing or after triggers
        case END_BATCH_ITEM:
        case RAW_DOCUMENT:
        case START_DOCUMENT:
        case NO_OP:
        case PIPELINE_PARAMETERS:
        default:
            break; // Do nothing special
        }

        // Send NOOPs if passing the TMX
        if ( sendTmx ) {
            return Event.createNoopEvent();
        }
        return event;
    }

    /**
     * Translates the buffered text units and wraps all buffered events in a
     * multi-event.
     * @return the multi-event holding the buffered events.
     */
    private Event processEvents () {
        getTranslations();
        // The buffer itself is handed over, so it is only cleared the next
        // time an event is stored
        needReset = true;
        return new Event(EventType.MULTI_EVENT, new MultiEvent(events));
    }

    /**
     * Adds an event to the buffer and processes the buffer if requested or if
     * it is full. If the category is not computed yet, the first ITS Domain
     * value found on a text unit is used to compute it.
     * @param event the event to store.
     * @param mustProcess true to process the buffer regardless of its size.
     * @return the multi-event of the processed buffer, or a no-operation event
     * if the event was only stored.
     */
    private Event storeAndPossiblyProcess (Event event,
        boolean mustProcess)
    {
        // Reset if needed
        if ( needReset ) {
            needReset = false;
            events.clear();
        }
        events.add(event);

        // If needed: compute the category on the first domain found
        if (( computedCategory == null ) && event.isTextUnit() ) {
            ITextUnit tu = event.getTextUnit();
            // Get the ITS Domain information from the text unit
            GenericAnnotations anns = tu.getAnnotation(GenericAnnotations.class);
            GenericAnnotation ann = ( anns == null )
                ? null
                : anns.getFirstAnnotation(GenericAnnotationType.DOMAIN);
            String domain = ( ann == null )
                ? null
                : ann.getString(GenericAnnotationType.DOMAIN_VALUE);
            if ( domain != null ) {
                logger.info("First domain value ('{}') detected on text unit id='{}'.", domain, tu.getId());
                // tempCategory holds the initial value, with the variable
                computedCategory = computeCategory(tempCategory.replace(DOMAINVAR, domain));
                net.sf.okapi.connectors.microsoft.Parameters prm = conn.getParameters();
                prm.setCategory(computedCategory);
            }
        }

        // Trigger the process if needed
        if ( mustProcess || ( events.size() >= maxEvents )) {
            return processEvents();
        }
        // Else, the event is just stored: pass a no-operation event down for now
        return Event.createNoopEvent();
    }

    /**
     * Queries the engine for all the eligible source content of the buffered
     * text units and places the results (TMX entries, annotations, targets).
     */
    private void getTranslations () {
        if ( events.isEmpty() ) {
            return; // Nothing to do
        }

        // Parallel lists: entry i of each list describes the same fragment
        List<TextFragment> fragments = new ArrayList<>();
        List<String> segIds = new ArrayList<>();
        List<String> tuIds = new ArrayList<>();
        collectFragments(fragments, segIds, tuIds);
        if ( fragments.isEmpty() ) {
            return; // Nothing to translate
        }

        // Call the translation engine
        List<List<QueryResult>> list = conn.batchQuery(fragments);
        if ( Util.isEmpty(list) ) {
            logger.warn("No translation generated.");
            return;
        }

        // Place the translations, walking the text units in the same order
        // as when the fragments were gathered
        int entryIndex = 0;
        for ( Event event : events ) {
            if ( !event.isTextUnit() ) {
                continue;
            }
            ITextUnit tu = event.getTextUnit();
            if ( !tu.isTranslatable() ) {
                continue;
            }
            if ( tu.getSource().hasBeenSegmented() ) {
                entryIndex = placeSegmentResults(tu, segIds, list, entryIndex);
            }
            else {
                entryIndex = placeContainerResults(tu, tuIds, list, entryIndex);
            }
        }
        // We are done: the translations are annotations
        // and/or written out in the TMX output
    }

    /**
     * Gathers the source fragments to translate from the buffered text units.
     * For each fragment added, the text unit id is added to {@code tuIds} and
     * either the "text-unit-id, form-feed, segment-id" key (segmented source)
     * or null (unsegmented source) is added to {@code segIds}.
     * @param fragments receives the fragments to translate.
     * @param segIds receives the segment keys (null for unsegmented entries).
     * @param tuIds receives the text unit ids.
     */
    private void collectFragments (List<TextFragment> fragments,
        List<String> segIds,
        List<String> tuIds)
    {
        for ( Event event : events ) {
            if ( !event.isTextUnit() ) {
                continue;
            }
            ITextUnit tu = event.getTextUnit();
            // Skip non-translatable entries
            if ( !tu.isTranslatable() ) {
                continue;
            }
            TextContainer trgCont = tu.getTarget(targetLocale);

            if ( tu.getSource().hasBeenSegmented() ) {
                ISegments trgSegs = ( trgCont == null ) ? null : trgCont.getSegments();
                for ( Segment srcSeg : tu.getSourceSegments() ) {
                    // Skip segments without text
                    if ( !srcSeg.text.hasText() ) {
                        continue;
                    }
                    // Skip segments with existing candidate if requested
                    if ( params.getOnlyWhenWithoutCandidate() && hasExistingCandidate(srcSeg, trgSegs) ) {
                        continue;
                    }
                    fragments.add(srcSeg.text);
                    tuIds.add(tu.getId());
                    segIds.add(tu.getId()+"\f"+srcSeg.getId());
                }
            }
            else { // Not segmented: look at container-level
                TextFragment srcFrag = tu.getSource().getFirstContent();
                if ( !srcFrag.hasText() ) {
                    continue;
                }
                // Skip entry with existing candidate if required
                if ( params.getOnlyWhenWithoutCandidate() && hasExistingCandidate(trgCont) ) {
                    continue;
                }
                // Add the text to the list of source (it may be more than one sentence)
                fragments.add(srcFrag);
                tuIds.add(tu.getId());
                segIds.add(null);
            }
        }
    }

    /**
     * Places the results for the segments of a text unit with a segmented source.
     * @param tu the text unit to update.
     * @param segIds the segment keys of the translated entries.
     * @param results the results, one list per translated entry.
     * @param entryIndex the index of the next entry to consume.
     * @return the index of the next entry to consume after this text unit.
     */
    private int placeSegmentResults (ITextUnit tu,
        List<String> segIds,
        List<List<QueryResult>> results,
        int entryIndex)
    {
        TextContainer trgCont = tu.getTarget(targetLocale);
        for ( Segment srcSeg : tu.getSourceSegments() ) {
            if ( !srcSeg.text.hasText() ) {
                continue;
            }
            // There may be no more entries
            if ( entryIndex >= segIds.size() ) {
                continue;
            }
            // Skip until we reach the next segment that was translated.
            // The key is on the left because the stored id is null when the
            // next entry belongs to an unsegmented text unit.
            String key = tu.getId()+"\f"+srcSeg.getId();
            if ( !key.equals(segIds.get(entryIndex)) ) {
                continue;
            }

            // Go through the results for that source
            List<QueryResult> resList = results.get(entryIndex);
            entryIndex++; // For next time
            boolean firstMatch = true;
            for ( QueryResult res : resList ) {
                // Write to TMX if needed, skipping low scores (e.g. error)
                if (( tmxWriter != null ) && ( res.getCombinedScore() > 9 )) {
                    tmxWriter.writeTU(res.source, res.target, null, attributes);
                }
                boolean fill = shouldFill(firstMatch, res);
                if ( fill || params.getAnnotate() ) {
                    // Create the target container and segment if needed
                    if ( trgCont == null ) {
                        trgCont = tu.createTarget(targetLocale, false, IResource.COPY_SEGMENTATION);
                    }
                    Segment trgSeg = trgCont.getSegments().get(srcSeg.id);
                    if ( trgSeg == null ) {
                        trgSeg = new Segment(srcSeg.id);
                        trgCont.getSegments().append(trgSeg);
                    }
                    if ( params.getAnnotate() ) {
                        TextUnitUtil.addAltTranslation(trgSeg,
                            res.toAltTranslation(srcSeg.text, sourceLocale, targetLocale));
                    }
                    if ( fill ) {
                        fillIfEmpty(trgSeg.text, res);
                    }
                }
                // Next entry will not be the first
                firstMatch = false;
            }
        }
        return entryIndex;
    }

    /**
     * Places the results for a text unit with an unsegmented source.
     * @param tu the text unit to update.
     * @param tuIds the text unit ids of the translated entries.
     * @param results the results, one list per translated entry.
     * @param entryIndex the index of the next entry to consume.
     * @return the index of the next entry to consume after this text unit.
     */
    private int placeContainerResults (ITextUnit tu,
        List<String> tuIds,
        List<List<QueryResult>> results,
        int entryIndex)
    {
        TextContainer trgCont = tu.getTarget(targetLocale);
        TextFragment srcFrag = tu.getSource().getFirstContent();
        if ( !srcFrag.hasText() ) {
            return entryIndex;
        }
        // There may be no more entries
        if ( entryIndex >= tuIds.size() ) {
            return entryIndex;
        }
        // Skip TUs until we reach the next one that was translated
        if ( !tuIds.get(entryIndex).equals(tu.getId()) ) {
            return entryIndex;
        }

        // Go through the results for that source
        List<QueryResult> resList = results.get(entryIndex);
        boolean firstMatch = true;
        for ( QueryResult res : resList ) {
            // Write to TMX if needed
            // NOTE(review): unlike the segment-level path, no minimum score
            // is required here -- confirm whether this difference is intended.
            if ( tmxWriter != null ) {
                tmxWriter.writeTU(res.source, res.target, null, attributes);
            }
            boolean fill = shouldFill(firstMatch, res);
            if ( fill || params.getAnnotate() ) {
                // Create the target container if needed
                if ( trgCont == null ) {
                    trgCont = tu.createTarget(targetLocale, false, IResource.COPY_SEGMENTATION);
                }
                if ( params.getAnnotate() ) {
                    TextUnitUtil.addAltTranslation(trgCont,
                        res.toAltTranslation(srcFrag, sourceLocale, targetLocale));
                }
                if ( fill ) {
                    fillIfEmpty(trgCont.getFirstContent(), res);
                }
            }
            // Next entry will not be the first
            firstMatch = false;
        }
        return entryIndex + 1;
    }

    /**
     * Indicates if a result should be copied into the target: only the first
     * result qualifies, when filling is enabled and its score reaches the
     * fill threshold.
     * @param firstMatch true if the result is the first one for its source.
     * @param res the result to check.
     * @return true if the target should be filled with this result.
     */
    private boolean shouldFill (boolean firstMatch,
        QueryResult res)
    {
        return firstMatch
            && params.getFillTarget()
            && ( res.getCombinedScore() >= params.getFillTargetThreshold() );
    }

    /**
     * Copies the translation of a result into a target fragment, with the
     * target prefix if one is requested. A fragment that is not empty is left
     * untouched (no overwriting).
     * @param trgFrag the target fragment to fill.
     * @param res the result providing the translation.
     */
    private void fillIfEmpty (TextFragment trgFrag,
        QueryResult res)
    {
        if ( !trgFrag.isEmpty() ) {
            return;
        }
        String codedText = res.target.getCodedText();
        if ( params.getUseTargetPrefix() ) {
            codedText = params.getTargetPrefix() + codedText;
        }
        trgFrag.setCodedText(codedText, res.target.getClonedCodes());
    }

    /**
     * Indicates if the target segment matching a source segment already has
     * at least one alternate translation.
     * @param srcSeg the source segment.
     * @param trgSegs the target segments, or null if there is no target.
     * @return true if an alternate translation exists.
     */
    private boolean hasExistingCandidate (Segment srcSeg,
        ISegments trgSegs)
    {
        if ( trgSegs == null ) {
            return false; // No target at all
        }
        Segment trgSeg = trgSegs.get(srcSeg.getId());
        if ( trgSeg == null ) {
            return false; // No target segment
        }
        AltTranslationsAnnotation ann = trgSeg.getAnnotation(AltTranslationsAnnotation.class);
        // True only with an annotation that has at least one entry
        return ( ann != null ) && ( ann.getFirst() != null );
    }

    /**
     * Indicates if a target container already has at least one alternate
     * translation.
     * @param frag the target container, or null if there is no target.
     * @return true if an alternate translation exists.
     */
    private boolean hasExistingCandidate (TextContainer frag) {
        if ( frag == null ) {
            return false; // No target at all
        }
        AltTranslationsAnnotation ann = frag.getAnnotation(AltTranslationsAnnotation.class);
        // True only with an annotation that has at least one entry
        return ( ann != null ) && ( ann.getFirst() != null );
    }

    /**
     * Generates the alternative output used by several Trados Steps:
     * a multi-event made of a pipeline-parameters event describing the new
     * raw document, followed by the start-batch-item, raw-document and
     * end-batch-item events for it.
     * @param outFilePath path of the new input file.
     * @param defaultEncoding default encoding of the new file.
     * @param sourceLocale source locale.
     * @param targetLocale target locale.
     * @param filterConfigId filter configuration id of the new file.
     * @return the event created.
     */
    public static Event generateAltOutput (String outFilePath,
        String defaultEncoding,
        LocaleId sourceLocale,
        LocaleId targetLocale,
        String filterConfigId)
    {
        List<Event> list = new ArrayList<>();

        // Change the pipeline parameters for the raw-document-related data
        PipelineParameters pp = new PipelineParameters();
        RawDocument rawDoc = new RawDocument(new File(outFilePath).toURI(), defaultEncoding,
            sourceLocale, targetLocale, filterConfigId);
        pp.setInputURI(rawDoc.getInputURI());
        pp.setOutputURI(rawDoc.getInputURI()); // Use same name as this output for now
        pp.setSourceLocale(rawDoc.getSourceLocale());
        pp.setTargetLocale(rawDoc.getTargetLocale());
        pp.setOutputEncoding(rawDoc.getEncoding()); // Use same as the output document
        pp.setInputRawDocument(rawDoc);
        pp.setFilterConfigurationId(rawDoc.getFilterConfigId());
        pp.setBatchInputCount(1);
        list.add(new Event(EventType.PIPELINE_PARAMETERS, pp));

        // Add raw-document related events
        list.add(new Event(EventType.START_BATCH_ITEM));
        list.add(new Event(EventType.RAW_DOCUMENT, rawDoc));
        list.add(new Event(EventType.END_BATCH_ITEM));

        // Return the list as a multiple-event event
        return new Event(EventType.MULTI_EVENT, new MultiEvent(list));
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy