/*
 * All Downloads are FREE. Search and download functionalities are using the official Maven repository.
 *
 * net.sf.okapi.steps.paraaligner.AlignedParagraphs Maven / Gradle / Ivy
 *
 * There is a newer version: 1.47.0
 * Show newest version
 */
package net.sf.okapi.steps.paraaligner;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;

class AlignedParagraphs {
	private List> sourceParas;
	private List> targetParas;
	private List alignedParas;
	private LocaleId targetLocale;

	public AlignedParagraphs(LocaleId targetLocale) {
		this.sourceParas = new LinkedList<>();
		this.targetParas = new LinkedList<>();
		this.alignedParas = new LinkedList<>();
		this.targetLocale = targetLocale;
	}

	public void addAlignment(ITextUnit srcTu, ITextUnit trgTu) {
		List srcParas = new LinkedList<>();
		if (srcTu != null) {
			srcParas.add(srcTu);
		}
		List trgParas = new LinkedList<>();
		if (trgTu != null) {
			trgParas.add(trgTu);
		}
		sourceParas.add(srcParas);
		targetParas.add(trgParas);
	}

	public void addAlignment(List srcTus, List trgTus) {
		sourceParas.add(srcTus);
		targetParas.add(trgTus);
	}

	public List align() {
		// source and target lists are guaranteed to have the same number of elements, though some will be null
		for (List stus : sourceParas) {
			ITextUnit stu = null;
			List ttus = targetParas.remove(0);
			if (stus != null) {
				stu = combineTextUnits(stus);
				if (ttus != null) {
					stu = addTargetTextUnitAsTarget(stu, combineTextUnits(ttus), targetLocale);
				}
			} else {
				if (ttus != null) {
					// source is null, use target tu (target content is source in this TU)
					ITextUnit tempTarget = combineTextUnits(ttus);
					tempTarget.createTarget(targetLocale, true, ITextUnit.COPY_ALL);	
					// remove the source
					tempTarget.setSource(null);
					stu = tempTarget;
				}
			}
			alignedParas.add(stu);
		}
		
		return alignedParas;
	}

	private ITextUnit combineTextUnits(List tus) {		
		Iterator its = tus.iterator();
		ITextUnit tuNew = its.next().clone();
		TextContainer tcAligned = tuNew.getSource();
		while (its.hasNext()) {
			ITextUnit tu2combin = its.next();
			TextContainer tcSource = tu2combin.getSource();
			tcSource.joinAll(); // joins all data parts and segments into one
			ISegments segs2add = tcSource.getSegments();
			Iterator itSeg = segs2add.iterator();
			while (itSeg.hasNext()) {
				String cLast;
				String sCurNoTrim = tcAligned.toString();
				String sCur = sCurNoTrim.trim();
				int lonny = sCur.length();
				if (lonny > 0) {
					cLast = sCur.substring(lonny - 1);
					if (sCur.equals(sCurNoTrim)) {
						if (cLast.equals("!") || cLast.equals(".") || cLast.equals("?")) {
							// append two spaces so previous punc will be a break point
							tcAligned.append(" "); 
						}
							
						else {
							// didn't end with whitespace, so force a sentence break
							tcAligned.append(". "); 
						}
					}
				}
				
				// this combines content from next text unit
				tcAligned.append(itSeg.next().getContent()); 
			}
		}
		tcAligned.joinAll(); // join new parts together as one
		return tuNew;
	}

	private ITextUnit addTargetTextUnitAsTarget(ITextUnit tuSource, ITextUnit tuTarget, LocaleId targetLocale) {
		tuSource.setTarget(targetLocale, tuTarget.getSource());
		return tuSource;
	}
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy