net.sf.okapi.steps.paraaligner.ParagraphAlignerStep Maven / Gradle / Ivy
/*===========================================================================
Copyright (C) 2009-2011 by the Okapi Framework contributors
-----------------------------------------------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
===========================================================================*/
package net.sf.okapi.steps.paraaligner;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.incava.diff.LCS;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.exceptions.OkapiBadStepInputException;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.filters.IFilterConfigurationMapper;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.MultiEvent;
import net.sf.okapi.common.resource.PipelineParameters;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.simplifier.ResourceSimplifier;
import net.sf.okapi.steps.gcaligner.AlignmentScorer;
import net.sf.okapi.steps.gcaligner.GaleAndChurch;
/**
* Align paragraphs (TextUnits) between a source and target document. Uses inter-paragraph
* formatting and other heuristics to align paragraphs. TextUnits from this step can be sent the the
* {@link SentenceAlignerStep} for more fine grained alignment. TextUnits should not be
* segmented.
*
* @author HARGRAVEJE
*/
@UsingParameters(Parameters.class)
public class ParagraphAlignerStep extends BasePipelineStep {
// private final Logger LOGGER = LoggerFactory.getLogger(getClass());
private Parameters params;
private IFilter filter = null;
private IFilterConfigurationMapper fcMapper;
private LocaleId targetLocale;
private LocaleId sourceLocale;
private List srcEvents;
private List trgEvents;
private List textUnitEvents;
private RawDocument targetInput = null;
private EventComparator comparator;
private ParagraphAligner paragraphAligner;
ResourceSimplifier sourceSimplifier;
public ParagraphAlignerStep() {
params = new Parameters();
List> scorerList = new LinkedList<>();
scorerList.add(new GaleAndChurch<>());
paragraphAligner = new ParagraphAligner(scorerList);
}
@StepParameterMapping(parameterType = StepParameterType.FILTER_CONFIGURATION_MAPPER)
public void setFilterConfigurationMapper(IFilterConfigurationMapper fcMapper) {
this.fcMapper = fcMapper;
}
@StepParameterMapping(parameterType = StepParameterType.SOURCE_LOCALE)
public void setSourceLocale(LocaleId sourceLocale) {
this.sourceLocale = sourceLocale;
}
@StepParameterMapping(parameterType = StepParameterType.TARGET_LOCALE)
public void setTargetLocale(LocaleId targetLocale) {
this.targetLocale = targetLocale;
}
@StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
public void setSecondInput(RawDocument secondInput) {
this.targetInput = secondInput;
}
@Override
public String getName() {
return "Paragraph Alignment";
}
@Override
public String getDescription() {
return "Align paragraphs (text units) between a source and a target document. Only TextUnit events are passed along - all other events are lost";
}
@Override
public Parameters getParameters() {
return params;
}
@Override
public void setParameters(IParameters params) {
this.params = (Parameters) params;
}
@Override
protected Event handleStartBatch(Event event) {
return event;
}
@Override
protected Event handleEndBatch(Event event) {
return event;
}
@Override
protected Event handleStartDocument(Event event) {
if (targetInput != null) {
trgEvents = new ArrayList<>();
initializeFilter();
}
srcEvents = new ArrayList<>();
textUnitEvents = new ArrayList<>();
comparator = new EventComparator();
sourceSimplifier = new ResourceSimplifier(sourceLocale);
return eventIndicatingTargetWasConsumed(event);
}
@Override
protected Event handleEndDocument(Event event) {
srcEvents.addAll(sourceSimplifier.convertToList(event));
// align skeleton chunks
LCS skeletonAlignments = skeletonAlign();
// align paragraphs (TextUnits) between aligned skeleton using G&C
paragraphAlign(skeletonAlignments);
// the diff leverage is over now send the cached events down the
// pipeline as a MULTI_EVENT
// add the end document event so its not eaten
// create a multi event and pass it on to the other steps
textUnitEvents.add(event);
Event multiEvent = new Event(EventType.MULTI_EVENT, new MultiEvent(textUnitEvents));
if (filter != null) {
filter.close();
}
srcEvents.clear();
srcEvents = null;
trgEvents.clear();
trgEvents = null;
return multiEvent;
}
@Override
protected Event handleDocumentPart(final Event event) {
srcEvents.addAll(sourceSimplifier.convertToList(event));
return Event.createNoopEvent();
}
@Override
protected Event handleStartSubDocument(final Event event) {
srcEvents.addAll(sourceSimplifier.convertToList(event));
return Event.createNoopEvent();
}
@Override
protected Event handleEndSubDocument(final Event event) {
srcEvents.addAll(sourceSimplifier.convertToList(event));
return Event.createNoopEvent();
}
@Override
protected Event handleStartGroup(final Event event) {
srcEvents.addAll(sourceSimplifier.convertToList(event));
return Event.createNoopEvent();
}
@Override
protected Event handleEndGroup(final Event event) {
srcEvents.addAll(sourceSimplifier.convertToList(event));
return Event.createNoopEvent();
}
@Override
protected Event handleTextUnit(Event sourceEvent) {
srcEvents.addAll(sourceSimplifier.convertToList(sourceEvent));
return Event.createNoopEvent();
}
private void initializeFilter() {
if (targetInput == null) {
throw new OkapiBadStepInputException("No target document found.");
}
// Initialize the filter to read the translation to compare
filter = fcMapper.createFilter(targetInput.getFilterConfigId(), null);
// Open the second input for this batch item
filter.open(targetInput);
// populate target Event list
filterTarget();
}
private void filterTarget() {
Event event = null;
ResourceSimplifier simplifier = new ResourceSimplifier(targetLocale);
while (filter.hasNext()) {
event = filter.next();
trgEvents.addAll(simplifier.convertToList(event));
}
}
private LCS skeletonAlign() {
LCS diffEvents = null;
if (!params.isUseSkeletonAlignment()) {
// don't use skeleton diffing to anchor alignments, return empty diffEvents
return diffEvents;
}
// diff the two Event lists based on the provided Comparator
// find matching skeleton pairs
diffEvents = new LCS<>(srcEvents, trgEvents, comparator);
return diffEvents;
}
private void paragraphAlign(LCS skeletonAlignments) {
int srcStartMatchIndex = 0;
int trgStartMatchIndex = 0;
int srcEndMatchIndex = srcEvents.size();
int trgEndMatchIndex = trgEvents.size();
// loop through the skeleton matches and align TU's between them
if (skeletonAlignments != null) {
int n = -1;
for (Integer m : skeletonAlignments.getMatches()) {
n++;
if (m == null) continue;
srcEndMatchIndex = n;
trgEndMatchIndex = m;
addAlignedTextUnits(srcEvents.subList(srcStartMatchIndex, srcEndMatchIndex),
trgEvents.subList(trgStartMatchIndex, trgEndMatchIndex));
srcStartMatchIndex = srcEndMatchIndex;
trgStartMatchIndex = trgEndMatchIndex;
}
}
// handle the remaining TU's after the last skeleton match
srcEndMatchIndex = srcEvents.size();
trgEndMatchIndex = trgEvents.size();
addAlignedTextUnits(srcEvents.subList(srcStartMatchIndex, srcEndMatchIndex),
trgEvents.subList(trgStartMatchIndex, trgEndMatchIndex));
}
private void addAlignedTextUnits(List ses, List tes) {
List stus = filterOutNonTextUnit(ses);
List ttus = filterOutNonTextUnit(tes);
if (!stus.isEmpty() && !ttus.isEmpty()) {
textUnitEvents.addAll(textUnitsToEvents(alignTus(stus, ttus)));
}
}
private List filterOutNonTextUnit(List events) {
List tus = new LinkedList<>();
// pull out any ITextUnits into a separate list
for (Event e : events) {
if (e.isTextUnit()) {
tus.add(e.getTextUnit());
}
}
return tus;
}
private List alignTus(List stus, List ttus) {
AlignedParagraphs preAlignedTus = paragraphAligner.align(stus, ttus, sourceLocale, targetLocale, params.isOutputOneToOneMatchesOnly());
List alignedTus = preAlignedTus.align();
return alignedTus;
}
private Event eventIndicatingTargetWasConsumed(Event startDocEvent)
{
List list = new ArrayList<>();
// Change the pipeline parameters for the raw-document-related data
PipelineParameters pp = new PipelineParameters();
pp.setSecondInputRawDocument(null);
// Add the PipelineParameters event to the list
list.add(new Event(EventType.PIPELINE_PARAMETERS, pp));
// Add the original StartDocument event to the list
list.add(startDocEvent);
srcEvents.addAll(sourceSimplifier.convertToList(startDocEvent));
// Return the list as a multiple-event event
// now the StartDocument event in subsequent steps won't try to read the target
return new Event(EventType.MULTI_EVENT, new MultiEvent(list));
}
private List textUnitsToEvents(List tus) {
Event event;
Listevents = new ArrayList<>();
Iterator it = tus.iterator();
while(it.hasNext()) {
event = new Event(EventType.TEXT_UNIT, it.next());
events.add(event);
}
return events;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy