All Downloads are FREE. Search and download functionality uses the official Maven repository.

net.sf.okapi.steps.sentencealigner.SentenceAligner Maven / Gradle / Ivy

There is a newer version: 1.47.0
Show newest version
/**===========================================================================
 Additional changes Copyright (C) 2009-2011 by the Okapi Framework contributors
 ===========================================================================*/
/*  Copyright 2009 Welocalize, Inc. 
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  
 *  You may obtain a copy of the License at 
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 */

package net.sf.okapi.steps.sentencealigner;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.resource.AlignedPair;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.steps.gcaligner.AlignmentFunction;
import net.sf.okapi.steps.gcaligner.AlignmentScorer;
import net.sf.okapi.steps.gcaligner.DpMatrix;
import net.sf.okapi.steps.gcaligner.DpMatrixCell;
import net.sf.okapi.steps.gcaligner.Penalties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * SentenceAligner aligns source and target (paragraph) {@link TextUnit}s and returns a list of aligned sentence-based
 * {@link TextUnit} objects.
 */

public class SentenceAligner {
	private final Logger LOGGER = LoggerFactory.getLogger(getClass());
	
	private static final long MAX_CELL_SIZE = 80000L;
	private List> scorerList;
	
	public SentenceAligner(List> scorerList) {
		this.scorerList = scorerList;
	}

	/*
	 * TODO: set value for what we consider low scoring matches 
	 */
	//private static final int LOW_SCORE_THRESHOLD = 0;

	public ITextUnit align(ITextUnit sourceParagraph, ITextUnit targetParagraph, LocaleId srcLocale,
			LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
		return alignWithoutSkeletonAlignment(sourceParagraph, targetParagraph, srcLocale, trgLocale, outputOneTOneMatchesOnly);
	}

	public ITextUnit align(ITextUnit bilingualParagraph, LocaleId srcLocale, LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
		return alignWithoutSkeletonAlignment(bilingualParagraph, srcLocale, trgLocale, outputOneTOneMatchesOnly);
	}

	private ITextUnit alignWithoutSkeletonAlignment(ITextUnit sourceParagraph,
			ITextUnit targetParagraph, LocaleId srcLocale, LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
		AlignmentFunction alignmentFunction = new AlignmentFunction<>(srcLocale,
				trgLocale, scorerList, new Penalties());
		return alignSegments(sourceParagraph, targetParagraph, srcLocale, trgLocale,
				alignmentFunction, outputOneTOneMatchesOnly);
	}

	private ITextUnit alignWithoutSkeletonAlignment(ITextUnit bilingualParagraph, LocaleId srcLocale,
			LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
		AlignmentFunction alignmentFunction = new AlignmentFunction<>(srcLocale,
				trgLocale, scorerList, new Penalties());
		return alignSegments(bilingualParagraph, srcLocale, trgLocale, alignmentFunction, outputOneTOneMatchesOnly);
	}

	private ITextUnit alignSegments(ITextUnit sourceParagraph, ITextUnit targetParagraph,
			LocaleId srcLocale, LocaleId trgLocale, AlignmentFunction alignmentFunction, boolean outputOneTOneMatchesOnly) {

		// To prevent OutOfMemory exception, simply don't perform the
		// alignment for a block with a lot of segments. TEMPORARY FIX
		if (sourceParagraph.getSource().getSegments().count()
				* targetParagraph.getSource().getSegments().count() > MAX_CELL_SIZE) {
			throw new IllegalArgumentException("Too many segments. Can only align "
					+ MAX_CELL_SIZE
					+ ". Where the number equals the source segments times the target segments.");
		}

		DpMatrix matrix = new DpMatrix<>(sourceParagraph.getSource().getSegments().asList(),
				targetParagraph.getSource().getSegments().asList(), alignmentFunction);

		List result = matrix.align();

		// record the result in a list of AlignedPairs
		List alignedPairs = new LinkedList<>();

		String srcTuid = sourceParagraph.getName() == null ? "unknown" : sourceParagraph.getName();				
		Iterator it = result.iterator();
		while (it.hasNext()) {
			DpMatrixCell cell = it.next();
			
			if (outputOneTOneMatchesOnly) {
				if (cell.getState() == DpMatrixCell.MATCH) {
					Segment sourceSegment = matrix.getAlignmentElementX(cell.getXindex());
					Segment targetSegment = matrix.getAlignmentElementY(cell.getYindex());
					alignedPairs.add(new AlignedPair(sourceSegment, targetSegment, trgLocale));
				} 				
				continue;
			}			
			
			if (cell.getState() == DpMatrixCell.DELETED) {
				Segment sourceSegment = matrix.getAlignmentElementX(cell.getXindex());
				alignedPairs.add(new AlignedPair(sourceSegment, null, trgLocale));
				LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						sourceSegment.toString(), srcTuid);
			} else if (cell.getState() == DpMatrixCell.INSERTED) {
				Segment targetSegment = matrix.getAlignmentElementY(cell.getYindex());
				alignedPairs.add(new AlignedPair(null, targetSegment, trgLocale));
				LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						targetSegment.toString(), srcTuid);
			} else if (cell.getState() == DpMatrixCell.MATCH) {
				Segment sourceSegment = matrix.getAlignmentElementX(cell.getXindex());
				Segment targetSegment = matrix.getAlignmentElementY(cell.getYindex());
				alignedPairs.add(new AlignedPair(sourceSegment, targetSegment, trgLocale));
			} else if (cell.getState() == DpMatrixCell.MULTI_MATCH) {
				List sourceSegments = matrix.getAlignmentElementsX(
						cell.getMultiMatchXIndexBegin(), cell.getMultiMatchXIndexEnd());
				List targetSegments = matrix.getAlignmentElementsY(
						cell.getMultiMatchYIndexBegin(), cell.getMultiMatchYIndexEnd());
				alignedPairs.add(new AlignedPair(new LinkedList<>(sourceSegments),
						new LinkedList<>(targetSegments), trgLocale));
				Segment s = null;
				try {
					s = sourceSegments.get(0);
				} catch (IndexOutOfBoundsException e) {
					s = targetSegments.get(0);
				}
				LOGGER.warn("{}\nMulti-Segment Match (TU ID: {}): Non 1-1 match. Please confirm alignment.", s.toString(), srcTuid);
			}
		}
		
		sourceParagraph.getAlignedSegments().align(alignedPairs, trgLocale);
		return sourceParagraph;
	}

	private ITextUnit alignSegments(ITextUnit bilingualParagraph, LocaleId srcLocale,
			LocaleId trgLocale, AlignmentFunction alignmentFunction, boolean outputOneTOneMatchesOnly) {

		// To prevent OutOfMemory exception, simply don't perform the
		// alignment for a block with a lot of segments. TEMPORARY FIX
		if (bilingualParagraph.getSource().getSegments().count()
				* bilingualParagraph.getTarget(trgLocale).getSegments().count() > MAX_CELL_SIZE) {
			throw new IllegalArgumentException("Too many segments. Can only align "
					+ MAX_CELL_SIZE
					+ ". Where the number equals the source segments times the target segments.");
		}

		DpMatrix matrix = new DpMatrix<>(bilingualParagraph.getSource().getSegments().asList(),
				bilingualParagraph.getTarget(trgLocale).getSegments().asList(), alignmentFunction);

		List result = matrix.align();

		// record the result in a list of AlignedPairs
		List alignedPairs = new LinkedList<>();

		String srcTuid = bilingualParagraph.getName() == null ? "unknown" : bilingualParagraph.getName(); 
		Iterator it = result.iterator();
		while (it.hasNext()) {
			DpMatrixCell cell = it.next();
			
			if (outputOneTOneMatchesOnly) {
				if (cell.getState() == DpMatrixCell.MATCH) {
					Segment sourceSegment = matrix.getAlignmentElementX(cell.getXindex());
					Segment targetSegment = matrix.getAlignmentElementY(cell.getYindex());
					alignedPairs.add(new AlignedPair(sourceSegment, targetSegment, trgLocale));
				} 				
				continue;
			}			

			if (cell.getState() == DpMatrixCell.DELETED) {
				Segment sourceSegment = matrix.getAlignmentElementX(cell.getXindex());
				alignedPairs.add(new AlignedPair(sourceSegment, null, trgLocale));
				LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						sourceSegment.toString(), srcTuid);
			} else if (cell.getState() == DpMatrixCell.INSERTED) {
				Segment targetSegment = matrix.getAlignmentElementY(cell.getYindex());
				alignedPairs.add(new AlignedPair(null, targetSegment, trgLocale));
				LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						targetSegment.toString(), srcTuid);
			} else if (cell.getState() == DpMatrixCell.MATCH) {
				Segment sourceSegment = matrix.getAlignmentElementX(cell.getXindex());
				Segment targetSegment = matrix.getAlignmentElementY(cell.getYindex());
				alignedPairs.add(new AlignedPair(sourceSegment, targetSegment, trgLocale));
			} else if (cell.getState() == DpMatrixCell.MULTI_MATCH) {
				List sourceSegments = matrix.getAlignmentElementsX(
						cell.getMultiMatchXIndexBegin(), cell.getMultiMatchXIndexEnd());
				List targetSegments = matrix.getAlignmentElementsY(
						cell.getMultiMatchYIndexBegin(), cell.getMultiMatchYIndexEnd());
				alignedPairs.add(new AlignedPair(new LinkedList<>(sourceSegments),
						new LinkedList<>(targetSegments), trgLocale));
				Segment s = null;
				try {
					s = sourceSegments.get(0);
				} catch (IndexOutOfBoundsException e) {
					s = targetSegments.get(0);
				}
				LOGGER.warn("{}\nMulti-Segment Match (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						s.toString(), srcTuid);
			}
		}

		bilingualParagraph.getAlignedSegments().align(alignedPairs, trgLocale);
		return bilingualParagraph;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy