
/*===========================================================================
Copyright (C) 2011-2014 by the Okapi Framework contributors
-----------------------------------------------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
===========================================================================*/
package net.sf.okapi.steps.rainbowkit.postprocess;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.MimeTypeMapper;
import net.sf.okapi.common.Range;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.exceptions.OkapiBadFilterInputException;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.filters.IFilterConfigurationMapper;
import net.sf.okapi.common.filterwriter.IFilterWriter;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.MultiEvent;
import net.sf.okapi.common.resource.PipelineParameters;
import net.sf.okapi.common.resource.Property;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.filters.rainbowkit.Manifest;
import net.sf.okapi.filters.rainbowkit.MergingInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
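/**
* Merges the translations from a translation-kit document (e.g. an XLIFF file)
* back into the corresponding original file listed in the kit's manifest.
* <p>A minimal usage sketch (obtaining the manifest, the filter configuration
* mapper and each {@link MergingInfo} entry is the caller's responsibility,
* typically the Rainbow kit merging step, and is not shown here):
* <pre>{@code
* Merger merger = new Merger(manifest, fcMapper, true, null, false, null);
* // Start with the start-document event of the translated document:
* Event out = merger.startMerging(info, startDocumentEvent);
* // Then feed each remaining event of the translated document:
* out = merger.handleEvent(nextEvent);
* // After the END_DOCUMENT event has been handled:
* int errors = merger.getErrorCount();
* merger.close();
* }</pre>
*/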
public class Merger {
private final Logger logger = LoggerFactory.getLogger(getClass());
private IFilter filter;
private IFilterWriter writer;
private Manifest manifest;
private LocaleId trgLoc;
private IFilterConfigurationMapper fcMapper;
private boolean skipEmptySourceEntries;
private boolean useSource;
private boolean preserveSegmentation;
private boolean forceSegmentationInMerge;
private boolean returnRawDocument;
private String overrideOutputPath;
private RawDocument rawDoc;
private boolean useSubDoc;
private int errorCount;
/**
* Creates a Merger object.
* @param manifest the manifest to process.
* @param fcMapper the filter configuration mapper to use.
* @param preserveSegmentation true to preserve the segmentation after the merge is done.
* @param forcedTargetLocale the target locale to merge, or null to use the target locale from the manifest.
* @param returnRawDocument true to return a raw document rather than filter events.
* @param overrideOutputPath output path to use instead of the default one, or null to use the default.
*/
public Merger (Manifest manifest,
IFilterConfigurationMapper fcMapper,
boolean preserveSegmentation,
LocaleId forcedTargetLocale,
boolean returnRawDocument,
String overrideOutputPath)
{
this.fcMapper = fcMapper;
this.manifest = manifest;
if ( forcedTargetLocale != null ) {
trgLoc = forcedTargetLocale;
}
else {
trgLoc = manifest.getTargetLocale();
}
this.preserveSegmentation = preserveSegmentation;
this.returnRawDocument = returnRawDocument;
this.overrideOutputPath = overrideOutputPath;
}
public void close () {
if ( writer != null ) {
writer.close();
writer = null;
}
if ( filter != null ) {
filter.close();
filter = null;
}
}
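/**
* Handles an event from the translated document.
* Text units are merged into the original file; END_DOCUMENT flushes the
* remaining events of the original file; other events are passed through,
* or replaced by no-op events when a raw document is to be returned.
* @param event the event to process.
* @return the resulting event (possibly a no-op or a multi-event).
*/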
public Event handleEvent (Event event) {
switch ( event.getEventType() ) {
case TEXT_UNIT:
processTextUnit(event);
if ( returnRawDocument ) {
event = Event.createNoopEvent();
}
break;
case START_SUBDOCUMENT:
if ( returnRawDocument ) {
useSubDoc = true;
event = Event.createNoopEvent();
}
break;
case END_DOCUMENT:
flushFilterEvents();
close();
if ( returnRawDocument && !useSubDoc ) {
event = createMultiEvent();
}
break;
default:
if ( returnRawDocument ) {
event = Event.createNoopEvent();
}
}
return event;
}
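// For XLIFF input: checks whether the output is segmented, in which case the
// segmentation of the translated file is used when merging (see processTextUnit()).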
private void processStartDocument (StartDocument sd) {
if ( sd.getMimeType().equals(MimeTypeMapper.XLIFF_MIME_TYPE) ) {
net.sf.okapi.filters.xliff.Parameters prm = (net.sf.okapi.filters.xliff.Parameters)sd.getFilterParameters();
forceSegmentationInMerge = (prm.getOutputSegmentationType()
== net.sf.okapi.filters.xliff.Parameters.SegmentationType.SEGMENTED);
}
else {
forceSegmentationInMerge = false;
}
}
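// Wraps the merged output document into PIPELINE_PARAMETERS and RAW_DOCUMENT
// events, so the following steps of the pipeline can process it as a raw document.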
private Event createMultiEvent () {
List<Event> list = new ArrayList<>();
// Change the pipeline parameters for the raw-document-related data
PipelineParameters pp = new PipelineParameters();
pp.setOutputURI(rawDoc.getInputURI()); // Use the same name as this output for now
pp.setSourceLocale(rawDoc.getSourceLocale());
pp.setTargetLocale(rawDoc.getTargetLocale());
pp.setOutputEncoding(rawDoc.getEncoding()); // Use the same encoding as the output document
pp.setInputRawDocument(rawDoc);
// Add the event to the list
list.add(new Event(EventType.PIPELINE_PARAMETERS, pp));
// Add the raw-document event
list.add(new Event(EventType.RAW_DOCUMENT, rawDoc));
// Return the list as a multiple-event event
return new Event(EventType.MULTI_EVENT, new MultiEvent(list));
}
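/**
* Starts merging one document of the manifest: creates the filter and filter
* writer for the original file, opens it, and processes its start-document event.
* @param info the merging information for the document to merge.
* @param event the start-document event of the translated document.
* @return the start-document event of the original document, or a no-op event
* when a raw document is to be returned instead of filter events.
*/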
public Event startMerging (MergingInfo info,
Event event)
{
errorCount = 0;
useSubDoc = false;
logger.info("Merging: {}", info.getRelativeInputPath());
// Create the filter for this original file
filter = fcMapper.createFilter(info.getFilterId(), filter);
if ( filter == null ) {
throw new OkapiBadFilterInputException(String.format("Filter cannot be created (%s).", info.getFilterId()));
}
IParameters fprm = filter.getParameters();
if ( fprm != null ) {
fprm.fromString(info.getFilterParameters());
}
File file = new File(manifest.getTempOriginalDirectory() + info.getRelativeInputPath());
RawDocument rd = new RawDocument(file.toURI(), info.getInputEncoding(),
manifest.getSourceLocale(), trgLoc);
filter.open(rd);
writer = filter.createFilterWriter();
writer.setOptions(trgLoc, info.getTargetEncoding());
String outPath = this.getOutputPath(info);
writer.setOutput(outPath);
// Skip entries with an empty source for PO, Transifex and Table extractions
skipEmptySourceEntries = ( info.getExtractionType().equals(Manifest.EXTRACTIONTYPE_PO)
|| info.getExtractionType().equals(Manifest.EXTRACTIONTYPE_TRANSIFEX)
|| info.getExtractionType().equals(Manifest.EXTRACTIONTYPE_TABLE) );
// Use the source of the input as the translation for ONTRAM (XINI) packages
useSource = info.getExtractionType().equals(Manifest.EXTRACTIONTYPE_ONTRAM);
// Process the start-document event of the document we just opened
Event internalEvent = null;
if ( filter.hasNext() ) {
// Should be the start-document event
internalEvent = filter.next();
}
if (( internalEvent == null ) || ( internalEvent.getEventType() != EventType.START_DOCUMENT )) {
errorCount++;
logger.error("The start document event is missing when parsing the original file.");
}
else {
writer.handleEvent(internalEvent);
processStartDocument(internalEvent.getStartDocument());
}
// Compute what event to return
if ( returnRawDocument ) {
if ( event.getStartDocument().isMultilingual() ) {
rawDoc = new RawDocument(new File(outPath).toURI(), info.getTargetEncoding(),
manifest.getSourceLocale(), manifest.getTargetLocale());
}
else {
// Otherwise: the previous target is now the source (and still the target)
rawDoc = new RawDocument(new File(outPath).toURI(), info.getTargetEncoding(),
manifest.getTargetLocale(), manifest.getTargetLocale());
}
event = Event.createNoopEvent();
}
else {
event = internalEvent;
}
return event;
}
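// Computes the output path: under the merge directory of the manifest by
// default, or under the override output path when one was provided.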
private String getOutputPath (MergingInfo info) {
if ( Util.isEmpty(this.overrideOutputPath) ) {
return manifest.getMergeDirectory() + info.getRelativeTargetPath();
}
else {
return Util.ensureSeparator(overrideOutputPath, false) + info.getRelativeTargetPath();
}
}
private void flushFilterEvents () {
// Finish going through the original file
while ( filter.hasNext() ) {
writer.handleEvent(filter.next());
}
writer.close();
}
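// Merges one translated text unit into the original document: finds the
// corresponding text unit in the original file, checks IDs and the approved
// property, transfers the inline-code data of the original source to the
// translated target, re-applies segmentation when needed, and sends the
// updated original text unit to the writer.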
private void processTextUnit (Event event) {
// Get the unit from the translation file
ITextUnit traTu = event.getTextUnit();
// If it's not translatable:
if ( !traTu.isTranslatable() ) {
return; // Do nothing: the original will be handled by processUntilTextUnit()
}
// Search for the corresponding event in the original file
Event oriEvent = processUntilTextUnit();
if ( oriEvent == null ) {
errorCount++;
logger.error("No corresponding text unit for id='{}' in the original file.", traTu.getId());
return;
}
// Get the actual text unit object of the original
ITextUnit oriTu = oriEvent.getTextUnit();
// Check the IDs
if ( !traTu.getId().equals(oriTu.getId()) ) {
errorCount++;
logger.error("De-synchronized files: translated TU id='{}', Original TU id='{}'.",
traTu.getId(), oriTu.getId());
return;
}
// Check if we have a translation
TextContainer trgTraCont;
if ( useSource ) trgTraCont = traTu.getSource();
else trgTraCont = traTu.getTarget(trgLoc);
if ( trgTraCont == null ) {
if ( oriTu.getSource().hasText() ) {
// Warn only if the source is not empty
logger.warn("No translation found for TU id='{}'. Using source instead.", traTu.getId());
}
writer.handleEvent(oriEvent); // Use the source
return;
}
// Process the "approved" property
boolean isTransApproved = false;
Property traProp;
if ( useSource ) traProp = traTu.getSourceProperty(Property.APPROVED);
else traProp = traTu.getTargetProperty(trgLoc, Property.APPROVED);
if ( traProp != null ) {
isTransApproved = traProp.getValue().equals("yes");
}
if ( !isTransApproved && manifest.getUseApprovedOnly() ) {
// Not approved: use the source
logger.warn("Item id='{}': Target is not approved. Using source instead.", traTu.getId());
writer.handleEvent(oriEvent); // Use the source
return;
}
// Do we need to preserve the segmentation for merging (e.g. the TTX and XLIFF cases)?
boolean mergeAsSegments = false;
if ( oriTu.getMimeType() != null ) {
if ( oriTu.getMimeType().equals(MimeTypeMapper.TTX_MIME_TYPE)
|| oriTu.getMimeType().equals(MimeTypeMapper.XLIFF_MIME_TYPE) ) {
mergeAsSegments = true;
}
}
// Set the container for the source
TextContainer srcOriCont = oriTu.getSource();
TextContainer srcTraCont = traTu.getSource();
if ( forceSegmentationInMerge ) {
// Use the source of the translated file to get the proper segmentation
if ( !srcOriCont.getUnSegmentedContentCopy().getCodedText().equals(
srcTraCont.getUnSegmentedContentCopy().getCodedText()) ) {
logger.warn("Item id='{}': Original source and source in the translated file are different.\n"
+ "Cannot use the source of the translation as the new segmented source.",
traTu.getId());
}
}
// If we do not need to merge segments then we must join them all for the merge
// We also remember the ranges to set them back after merging
List<Range> srcRanges = null;
List<Range> trgRanges = null;
// Merge the segments together for the code transfer
// This allows codes to be moved anywhere in the text unit, not just within each part.
// We remember the ranges because some formats need to be merged segment by segment
if ( !srcOriCont.contentIsOneSegment() ) {
if ( mergeAsSegments ) {
// Make sure we remember the source segment ranges if we merge as segments later
// Otherwise the segment IDs of the source and target will differ
srcRanges = srcOriCont.getSegments().getRanges();
}
srcOriCont.joinAll();
}
if ( forceSegmentationInMerge ) {
// Get the source segmentation from the translated file if it's
// a forced segmentation
if ( !srcTraCont.contentIsOneSegment() ) {
srcRanges = srcTraCont.getSegments().getRanges();
}
}
else {
// Else: take from the original source
if ( !srcOriCont.contentIsOneSegment() ) {
srcRanges = srcOriCont.getSegments().getRanges();
}
}
if ( !trgTraCont.contentIsOneSegment() ) {
trgRanges = trgTraCont.getSegments().getRanges();
trgTraCont.joinAll();
}
// Perform the transfer of the inline codes
// At this point we have a single segment/part
TextUnitUtil.copySrcCodeDataToMatchingTrgCodes(srcOriCont.getFirstContent(),
trgTraCont.getFirstContent(), true, true, null, oriTu);
// Resegment for the special formats
if ( mergeAsSegments ) {
if ( srcRanges != null ) {
srcOriCont.getSegments().create(srcRanges, true);
}
if ( trgRanges != null ) {
trgTraCont.getSegments().create(trgRanges, true);
}
}
// Check if the target has more segments
if ( srcOriCont.getSegments().count() < trgTraCont.getSegments().count() ) {
logger.warn("Item id='{}': There is at least one extra segment in the translation file.\n"
+ "Extra segments are not merged into the translated output.",
traTu.getId());
}
// Assign the translated target to the target text unit
oriTu.setTarget(trgLoc, trgTraCont);
// Update/add the 'approved' flag of the entry to merge if available
// (for example to remove the fuzzy flag in POs or set the approved attribute in XLIFF)
if ( manifest.getUpdateApprovedFlag() ) {
Property oriProp = oriTu.createTargetProperty(trgLoc, Property.APPROVED, false, IResource.CREATE_EMPTY);
if ( traProp != null ) {
oriProp.setValue(traProp.getValue());
}
else {
oriProp.setValue("yes");
}
}
// Output the translation
writer.handleEvent(oriEvent);
// Set back the segmentation if we modified it for the merge
// (Already done if it's a special format)
if ( preserveSegmentation && !mergeAsSegments ) {
if ( srcRanges != null ) {
srcOriCont.getSegments().create(srcRanges, true);
}
if ( trgRanges != null ) {
trgTraCont.getSegments().create(trgRanges, true);
}
}
}
/**
* Gets events from the original document until the next text unit to merge.
* Any event before it is passed to the writer.
* @return the event of the next text unit, or null if no next text unit is found.
*/
private Event processUntilTextUnit () {
while ( filter.hasNext() ) {
Event event = filter.next();
if ( event.getEventType() == EventType.TEXT_UNIT ) {
ITextUnit tu = event.getTextUnit();
if ( !tu.isTranslatable() ) {
// Do not merge the translation for non-translatable entries
writer.handleEvent(event);
continue;
}
if ( skipEmptySourceEntries && tu.isEmpty() ) {
// For some types of packages: do not merge entries with an empty source
writer.handleEvent(event);
continue;
}
return event;
}
// Else: write out the event
writer.handleEvent(event);
}
// No more text units in the original file: the translated text unit being merged is extra
return null;
}
/**
* Gets the number of errors since the last call to {@link #startMerging(MergingInfo, Event)}.
* @return the number of errors.
*/
public int getErrorCount () {
return errorCount;
}
}