
net.sf.okapi.steps.diffleverage.DiffLeverageStep

/*===========================================================================
  Copyright (C) 2010-2011 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.diffleverage;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.annotation.AltTranslation;
import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
import net.sf.okapi.common.annotation.Annotations;
import net.sf.okapi.common.exceptions.OkapiBadStepInputException;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.filters.IFilterConfigurationMapper;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.query.MatchType;
import net.sf.okapi.common.resource.CodeMatchStrategy;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.MultiEvent;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragmentUtil;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.lib.search.lucene.analysis.NgramAnalyzer;
import net.sf.okapi.lib.search.lucene.scorer.Util;
import org.apache.lucene.analysis.Analyzer;
import org.incava.diff.LCS;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;

/**
 * Contextually match source "paragraphs" (full content of the TextUnit source) between two documents using a standard diff algorithm
 * (http://en.wikipedia.org/wiki/Diff). The result is a new document with the translations from the old document copied
 * into it. This allows translations between different document versions to be preserved while still maintaining the
 * newer source document modifications.
 * <p>
 * Adds these {@link Annotations}:
 * <ul>
 * <li>{@link AltTranslationsAnnotation} on the target container.</li>
 * <li>{@link DiffMatchAnnotation} on the target container (only applied if diffOnly is true).</li>
 * </ul>
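 * <p>
 * A minimal pipeline sketch, shown here only as an illustration (the wiring below uses the
 * standard Okapi {@code PipelineDriver}, {@code RawDocumentToFilterEventsStep} and
 * {@code BatchItemContext} helpers; the exact {@code BatchItemContext} constructor may differ
 * across Okapi versions, and {@code newSourceDoc}/{@code oldSourceDoc} are hypothetical
 * {@code RawDocument} instances where the second is the old, previously translated version):
 * <pre>{@code
 * PipelineDriver driver = new PipelineDriver();
 * driver.setFilterConfigurationMapper(fcMapper);
 * driver.addStep(new RawDocumentToFilterEventsStep()); // raw document -> filter events
 * driver.addStep(new DiffLeverageStep());              // diff + leverage old translations
 * driver.addBatchItem(new BatchItemContext(newSourceDoc, outputUri, "UTF-8", oldSourceDoc));
 * driver.processBatch();
 * }</pre>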
 *
 * @author HARGRAVEJE
 */
@UsingParameters(Parameters.class)
public class DiffLeverageStep extends BasePipelineStep {
    private static final int NGRAM_SIZE = 3;

    private Parameters params;
    private IFilterConfigurationMapper fcMapper;
    private RawDocument oldSource;
    private RawDocument oldTarget;
    private List<ITextUnit> newTextUnits;
    private List<ITextUnit> oldTextUnits;
    private List<Event> newDocumentEvents;
    private LocaleId sourceLocale;
    private LocaleId targetLocale;
    private boolean done = true;
    private Comparator<ITextUnit> sourceComparator;
    private Analyzer triGramAnalyzer; // trigram analyzer used for the FM-score calculation

    public DiffLeverageStep() {
        params = new Parameters();
    }

    /**
     * @param fcMapper
     */
    @StepParameterMapping(parameterType = StepParameterType.FILTER_CONFIGURATION_MAPPER)
    public void setFilterConfigurationMapper(final IFilterConfigurationMapper fcMapper) {
        this.fcMapper = fcMapper;
    }

    /**
     * @param sourceLocale
     */
    @StepParameterMapping(parameterType = StepParameterType.SOURCE_LOCALE)
    public void setSourceLocale(final LocaleId sourceLocale) {
        this.sourceLocale = sourceLocale;
    }

    /**
     * Target locale.
     *
     * @param targetLocale
     */
    @StepParameterMapping(parameterType = StepParameterType.TARGET_LOCALE)
    public void setTargetLocale(final LocaleId targetLocale) {
        this.targetLocale = targetLocale;
    }

    /**
     * This is the old document (previously translated).
     *
     * @param secondInput
     */
    @StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
    public void setSecondInput(final RawDocument secondInput) {
        oldSource = secondInput;
    }

    /**
     * If set, this is the old target that will be paragraph aligned with the old source.
     *
     * @param tertiaryInput
     */
    @StepParameterMapping(parameterType = StepParameterType.THIRD_INPUT_RAWDOC)
    public void setTertiaryInput(RawDocument tertiaryInput) {
        oldTarget = tertiaryInput;
    }

    @Override
    public String getDescription() {
        return "Compare two source documents (i.e., different versions) and "
            + "copy the old target content when we find a match. Can be a monolingual "
            + "and bi-lingual input or three monolingual inputs. Paragraphs (TextUnits) "
            + "must align in all cases";
    }

    @Override
    public String getName() {
        return "Diff Leverage";
    }

    @Override
    public Parameters getParameters() {
        return params;
    }

    @Override
    public void setParameters(IParameters params) {
        this.params = (Parameters) params;
    }

    @Override
    protected Event handleStartBatch(final Event event) {
        done = true;
        if (params.getFuzzyThreshold() >= 100) {
            // exact match
            sourceComparator = new TextUnitComparator(params.isCodesensitive());
        } else {
            // fuzzy match
            triGramAnalyzer = new NgramAnalyzer(NGRAM_SIZE);
            sourceComparator = new FuzzyTextUnitComparator(params.isCodesensitive(),
                params.getFuzzyThreshold(), sourceLocale);
        }
        return event;
    }

    @Override
    protected Event handleEndBatch(final Event event) {
        return event;
    }

    @Override
    protected Event handleRawDocument(final Event event) {
        throw new OkapiBadStepInputException(
            "Encountered a RAW_DOCUMENT event. Expected a filtered event stream.");
    }

    @Override
    protected Event handleStartDocument(final Event event) {
        // test if we have an alignment at the document level
        if (oldSource != null) {
            done = false;
            // initialize buffers for a new document
            newTextUnits = new ArrayList<>();
            oldTextUnits = new ArrayList<>();
            newDocumentEvents = new LinkedList<>();
            // open the secondary input file (this is our old document)
            getOldDocumentTextUnits();
        }
        return event;
    }

    @Override
    protected Event handleEndDocument(final Event event) {
        done = true;
        if (oldSource != null) {
            // diff and leverage (copy target segments) the old and new lists of TextUnits
            diffLeverage();
            // the diff leverage is over; now send the cached events down the
            // pipeline as a MULTI_EVENT
            // add the end document event so it's not eaten
            newDocumentEvents.add(event);
            // create a multi event and pass it on to the other steps
            Event multiEvent = new Event(EventType.MULTI_EVENT, new MultiEvent(newDocumentEvents));
            // help the Java GC
            newTextUnits = null;
            oldTextUnits = null;
            newDocumentEvents = null;
            return multiEvent;
        } else {
            return event;
        }
    }

    @Override
    protected Event handleStartSubDocument(final Event event) {
        if (oldSource != null) {
            newDocumentEvents.add(event);
            return Event.createNoopEvent();
        } else {
            return event;
        }
    }

    @Override
    protected Event handleEndSubDocument(final Event event) {
        if (oldSource != null) {
            newDocumentEvents.add(event);
            return Event.createNoopEvent();
        } else {
            return event;
        }
    }

    @Override
    protected Event handleStartGroup(final Event event) {
        if (oldSource != null) {
            newDocumentEvents.add(event);
            return Event.createNoopEvent();
        } else {
            return event;
        }
    }

    @Override
    protected Event handleEndGroup(final Event event) {
        if (oldSource != null) {
            newDocumentEvents.add(event);
            return Event.createNoopEvent();
        } else {
            return event;
        }
    }

    @Override
    protected Event handleTextUnit(final Event event) {
        if (event.getTextUnit().getSource().hasBeenSegmented()) {
            throw new OkapiBadStepInputException("DiffLeverageStep only aligns unsegmented TextUnits");
        }
        if (oldSource != null) {
            newTextUnits.add(event.getTextUnit());
            newDocumentEvents.add(event);
            return Event.createNoopEvent();
        } else {
            return event;
        }
    }

    @Override
    protected Event handleDocumentPart(final Event event) {
        if (oldSource != null) {
            newDocumentEvents.add(event);
            return Event.createNoopEvent();
        } else {
            return event;
        }
    }

    @Override
    public boolean isDone() {
        return done;
    }

    private void getOldDocumentTextUnits() {
        @SuppressWarnings("resource")
        IFilter trgFilter = null;
        // initialize the filter to read the translation to compare
        try (IFilter srcFilter = fcMapper.createFilter(oldSource.getFilterConfigId(), null)) {
            if (oldTarget != null) {
                trgFilter = fcMapper.createFilter(oldSource.getFilterConfigId(), null);
                // open the tertiary input for this batch item (old target)
                trgFilter.open(oldTarget);
            }
            // open the second input for this batch item (old source)
            srcFilter.open(oldSource);
            while (srcFilter.hasNext()) {
                final Event event = srcFilter.next();
                if (event.getEventType() == EventType.TEXT_UNIT) {
                    ITextUnit tu = event.getTextUnit();
                    if (oldTarget != null) {
                        Event e = synchronize(trgFilter, EventType.TEXT_UNIT);
                        tu.setTarget(targetLocale, e.getTextUnit().getSource());
                    }
                    oldTextUnits.add(tu);
                }
            }
        } finally {
            if (trgFilter != null) {
                trgFilter.close();
            }
        }
    }

    private Event synchronize(IFilter filter, EventType untilType) {
        boolean found = false;
        Event event = null;
        while (!found && filter.hasNext()) {
            event = filter.next();
            found = (event.getEventType() == untilType);
            if (event.isTextUnit()) {
                if (event.getTextUnit().getSource().hasBeenSegmented()) {
                    throw new OkapiBadStepInputException("DiffLeverageStep only aligns unsegmented TextUnits");
                }
            }
        }
        if (!found) {
            throw new OkapiException(
                "Different number of source or target TextUnits. "
                    + "The source and target documents are not paragraph aligned.");
        }
        return event;
    }

    private void diffLeverage() {
        LCS<ITextUnit> diffTextUnits = new LCS<>(oldTextUnits, newTextUnits, sourceComparator);
        List<Integer> matches = diffTextUnits.getMatches();

        // loop through the matches and copy over the old target to the new TextUnit
        int n = -1;
        for (Integer m : matches) {
            n++;
            if (m == null) continue;
            ITextUnit oldTu = oldTextUnits.get(n);
            ITextUnit newTu = newTextUnits.get(m);
            int score = 100;

            // copy the old translation to the new TextUnit
            TextContainer otc = null;
            if ((otc = oldTu.getTarget(targetLocale)) != null) {
                // only copy the old target if diffOnly is false
                if (!params.isDiffOnly()) {
                    if (params.getFuzzyThreshold() < 100) {
                        score = (int) Util.calculateNgramDiceCoefficient(
                            oldTu.getSource().getFirstContent().toString(),
                            newTu.getSource().getFirstContent().toString(), triGramAnalyzer);
                    }

                    // We force the source to be a paragraph! We use getUnSegmentedContentCopy
                    // to make sure we get *all* TextParts (just in case segmentation has been
                    // applied or somehow extra TextParts were added in an external process).

                    // copy codes from the source so that the leveraged target matches the source
                    TextFragmentUtil.alignAndCopyCodeMetadata(newTu.getSource().getFirstContent(),
                        otc.getFirstContent(), true, true, CodeMatchStrategy.STRICT);

                    if (params.isCopyToTarget()) {
                        newTu.setTarget(targetLocale, otc);
                    }

                    // make an AltTranslation and attach it to the target container
                    AltTranslation alt = new AltTranslation(sourceLocale, targetLocale,
                        newTu.getSource().getUnSegmentedContentCopy(),
                        oldTu.getSource().getUnSegmentedContentCopy(),
                        otc.getUnSegmentedContentCopy(),
                        params.getFuzzyThreshold() >= 100 ? MatchType.EXACT_PREVIOUS_VERSION
                            : MatchType.FUZZY_PREVIOUS_VERSION,
                        score, getName());

                    // add the annotation to the target container since we are diffing
                    // paragraphs only; we may need to create the target if it doesn't exist
                    TextContainer ntc = newTu.createTarget(targetLocale, false, IResource.COPY_PROPERTIES);
                    AltTranslationsAnnotation alta = TextUnitUtil.addAltTranslation(ntc, alt);
                    // re-sort the AltTranslations in case we already had some in the list
                    alta.sort();
                }

                // set the DiffMatchAnnotation;
                // we may need to create the target if it doesn't exist
                TextContainer tc = newTu.createTarget(targetLocale, false, IResource.COPY_PROPERTIES);
                tc.setAnnotation(new DiffMatchAnnotation());
            }
        }
    }
}
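/*
 * Reading back the results -- a sketch, not part of this class, assuming the step ran with
 * diffOnly=false and that `tu` and `targetLocale` are a leveraged ITextUnit and its target
 * locale from the downstream pipeline:
 *
 *   TextContainer tc = tu.getTarget(targetLocale);
 *   AltTranslationsAnnotation alts = (tc == null) ? null
 *       : tc.getAnnotation(AltTranslationsAnnotation.class);
 *   if (alts != null) {
 *       // entries are sorted, so the first is the best previous-version match
 *       AltTranslation best = alts.getFirst();
 *   }
 */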



