All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.okapi.steps.paraaligner.ParagraphAligner Maven / Gradle / Ivy

/*  Copyright 2009 Welocalize, Inc. 
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  
 *  You may obtain a copy of the License at 
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 */
/*===========================================================================
 Additional changes Copyright (C) 2009-2011 by the Okapi Framework contributors
 ===========================================================================*/

package net.sf.okapi.steps.paraaligner;

import java.util.Iterator;
import java.util.List;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.steps.gcaligner.AlignmentFunction;
import net.sf.okapi.steps.gcaligner.AlignmentScorer;
import net.sf.okapi.steps.gcaligner.DpMatrix;
import net.sf.okapi.steps.gcaligner.DpMatrixCell;
import net.sf.okapi.steps.gcaligner.Penalties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * SentenceAligner aligns source and target (paragraph) {@link TextUnit}s and returns a list of aligned sentence-based
 * {@link TextUnit} objects.
 */

public class ParagraphAligner {
	private final Logger LOGGER = LoggerFactory.getLogger(getClass());
	
	private static final long MAX_CELL_SIZE = 80000L;
	private List> scorerList;
	
	public ParagraphAligner(List> scorerList) {
		this.scorerList = scorerList;
	}

	public AlignedParagraphs align(List sourceParagraphs, List targetParagraphs, LocaleId srcLocale,
			LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
		return alignWithoutSkeletonAlignment(sourceParagraphs, targetParagraphs, srcLocale, trgLocale, outputOneTOneMatchesOnly);
	}

	private AlignedParagraphs alignWithoutSkeletonAlignment(List sourceParagraphs,
			List targetParagraphs, LocaleId srcLocale, LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
		AlignmentFunction alignmentFunction = new AlignmentFunction<>(srcLocale,
				trgLocale, scorerList, new Penalties());
		return alignSegments(sourceParagraphs, targetParagraphs, srcLocale, trgLocale,
				alignmentFunction, outputOneTOneMatchesOnly);
	}

	private AlignedParagraphs alignSegments(List sourceParagraphs, List targetParagraphs,
			LocaleId srcLocale, LocaleId trgLocale, AlignmentFunction alignmentFunction, boolean outputOneTOneMatchesOnly) {

		// To prevent OutOfMemory exception, simply don't perform the
		// alignment for a block with a lot of segments. TEMPORARY FIX
		if (sourceParagraphs.size()
				* targetParagraphs.size() > MAX_CELL_SIZE) {
			throw new IllegalArgumentException("Too many segments. Can only align "
					+ MAX_CELL_SIZE
					+ ". Where the number equals the source segments times the target segments.");
		}

		DpMatrix matrix = new DpMatrix<>(sourceParagraphs, targetParagraphs, alignmentFunction);

		List result = matrix.align();
		AlignedParagraphs alignedParas = new AlignedParagraphs(trgLocale);
		
		Iterator it = result.iterator();
		while (it.hasNext()) {
			DpMatrixCell cell = it.next(); 
			
			if (outputOneTOneMatchesOnly) {
				if (cell.getState() == DpMatrixCell.MATCH) {
					ITextUnit sourcePara = matrix.getAlignmentElementX(cell.getXindex());
					ITextUnit targetPara = matrix.getAlignmentElementY(cell.getYindex());
					alignedParas.addAlignment(sourcePara, targetPara);
				}
				continue;
			}			
			
			if (cell.getState() == DpMatrixCell.DELETED) {
				ITextUnit sourcePara = matrix.getAlignmentElementX(cell.getXindex());				
				alignedParas.addAlignment(sourcePara, null);
				LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						sourcePara.toString(), sourcePara.getName());
			} else if (cell.getState() == DpMatrixCell.INSERTED) {
				ITextUnit targetPara = matrix.getAlignmentElementY(cell.getYindex());
				alignedParas.addAlignment(null, targetPara);
				LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						targetPara.toString(), targetPara.getName());
			} else if (cell.getState() == DpMatrixCell.MATCH) {
				ITextUnit sourcePara = matrix.getAlignmentElementX(cell.getXindex());
				ITextUnit targetPara = matrix.getAlignmentElementY(cell.getYindex());
				alignedParas.addAlignment(sourcePara, targetPara);
			} else if (cell.getState() == DpMatrixCell.MULTI_MATCH) {
				List sourceParas = matrix.getAlignmentElementsX(
						cell.getMultiMatchXIndexBegin(), cell.getMultiMatchXIndexEnd());
				List targetParas = matrix.getAlignmentElementsY(
						cell.getMultiMatchYIndexBegin(), cell.getMultiMatchYIndexEnd());
				alignedParas.addAlignment(sourceParas, targetParas);
				ITextUnit p = null;
				try {
					p = sourceParas.get(0);
				} catch (IndexOutOfBoundsException e) {
					p = targetParas.get(0);
				}
				LOGGER.warn("{}\nMulti-ITextUnit Match (TU ID: {}): Non 1-1 match. Please confirm alignment.",
						p.getSource().toString(), p.getName());
			}
		}
		
		return alignedParas;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy