All downloads are free. The search and download features use the official Maven repository.

net.sf.okapi.steps.leveraging.BatchTmLeveragingStep Maven / Gradle / Ivy

/*===========================================================================
  Copyright (C) 2009-2021 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.leveraging;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.annotation.AltTranslation;
import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.query.IQuery;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.MultiEvent;
import net.sf.okapi.common.resource.Property;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.lib.translation.ITMQuery;
import net.sf.okapi.lib.translation.QueryManager;
import net.sf.okapi.steps.diffleverage.DiffMatchAnnotation;

import java.util.LinkedList;
import java.util.List;

/**
 * Pipeline step that leverages text units against a query connector in batches.
 * <p>
 * Events received between START_DOCUMENT and END_DOCUMENT are held back in an internal
 * buffer. Once {@link #BATCH_LEVERAGE_MAX} leverageable text units have been buffered
 * (or the document ends), the buffered text units are sent to the connector in a single
 * {@code batchLeverage} call and all buffered events are released downstream, in their
 * original order, wrapped in one MULTI_EVENT.
 * <p>
 * The connector is instantiated from the class name held in the step parameters when the
 * batch starts and is closed when the batch ends.
 */
public class BatchTmLeveragingStep extends BasePipelineStep {
	/** Number of leverageable text units to buffer before calling the connector. */
	private static final int BATCH_LEVERAGE_MAX = 30;

	/** Maximum number of hits requested from connectors implementing {@link ITMQuery}. */
	private static final int MAX_HITS = 5;

	/** Events held back until the next flush; order of arrival is preserved. */
	private final List<Event> batchedEvents;
	/** Leverageable text units buffered since the last flush. */
	private int tuEventCount;
	private LocaleId sourceLocale;
	private LocaleId targetLocale;
	private Parameters params;
	/** Connector created in {@link #handleStartBatch(Event)}; null outside of a batch. */
	private IQuery connector;

	private String rootDir;

	/** Creates the step with default parameters and an empty event buffer. */
	public BatchTmLeveragingStep() {
		params = new Parameters();
		batchedEvents = new LinkedList<>();
	}

	@Override
    @StepParameterMapping(parameterType = StepParameterType.SOURCE_LOCALE)
	public void setSourceLocale(LocaleId sourceLocale) {
		this.sourceLocale = sourceLocale;
	}
	
	public LocaleId getSourceLocale() {
		return sourceLocale;
	}

	@Override
    @StepParameterMapping(parameterType = StepParameterType.TARGET_LOCALE)
	public void setTargetLocale(LocaleId targetLocale) {
		this.targetLocale = targetLocale;
	}

	@Override
    public LocaleId getTargetLocale() {
		return targetLocale;
	}
	
	@StepParameterMapping(parameterType = StepParameterType.ROOT_DIRECTORY)
	public void setRootDirectory(String rootDir) {
		this.rootDir = rootDir;
	}
	
	public String getRootDirectory() {
		return rootDir;
	}

	@Override
	public String getName() {
		return "Simple Batch Leveraging Step (Beta)";
	}

	@Override
	public String getDescription() {
		return "Simple and fast batch leveraging step that delegates to connectors";
	}

	@Override
	public Parameters getParameters() {
		return params;
	}

	@Override
	public void setParameters(IParameters params) {
		this.params = (Parameters) params;
	}

	/**
	 * Dispatches an incoming event.
	 * <p>
	 * Batch and document boundary events are handled by their dedicated handlers. Text
	 * units and all remaining event types are buffered; a no-op event is returned in their
	 * place until the buffer is flushed.
	 *
	 * @param event the incoming event
	 * @return the event to send downstream: the original event, a no-op event, or a
	 *         MULTI_EVENT carrying the flushed buffer
	 */
	@Override
	public Event handleEvent(Event event) {
		switch (event.getEventType()) {
		case TEXT_UNIT:
			// Every text unit is buffered so event order is preserved, but only
			// leverageable ones count toward the batch size.
			batchedEvents.add(event);
			if (!canLeverageTu(event.getTextUnit())) {
				return Event.createNoopEvent();
			}
			// The result must be returned: when the batch is full it is the MULTI_EVENT
			// holding every buffered event, and the buffer has already been cleared.
			return handleTextUnit(event);
		case START_BATCH_ITEM:
			return handleStartBatchItem(event);
		case END_BATCH_ITEM:
			return handleEndBatchItem(event);
		case START_BATCH:
			return handleStartBatch(event);
		case END_BATCH:
			return handleEndBatch(event);
		case END_DOCUMENT:
			return handleEndDocument(event);
		case START_DOCUMENT:
			return handleStartDocument(event);
		default:
			batchedEvents.add(event);
			break;
		}
		return Event.createNoopEvent();
	}

	/**
	 * Counts a leverageable text unit (already buffered by {@link #handleEvent(Event)})
	 * and flushes the buffer once {@link #BATCH_LEVERAGE_MAX} of them have accumulated.
	 *
	 * @param event the text unit event; not read here because it is already in the buffer
	 * @return a MULTI_EVENT with all buffered events when the batch is full, otherwise a
	 *         no-op event
	 */
	@Override
	protected Event handleTextUnit(Event event) {
		tuEventCount++;
		if (tuEventCount < BATCH_LEVERAGE_MAX) {
			return Event.createNoopEvent();
		}

		tuEventCount = 0;
		batchLeverage();
		return new Event(EventType.MULTI_EVENT, drainBatchedEvents());
	}

	/**
	 * Creates, configures and opens the connector named in the step parameters.
	 *
	 * @param event the START_BATCH event
	 * @return the same event, unchanged
	 * @throws OkapiException if the connector class cannot be loaded or instantiated
	 */
	@Override
	protected Event handleStartBatch(Event event) {
		tuEventCount = 0;

		// Release a connector left over from a batch that never reached END_BATCH.
		closeConnector();

		try {
			connector = (IQuery) Class.forName(params.getResourceClassName())
				.getDeclaredConstructor().newInstance();
		} catch (ReflectiveOperationException e) {
			throw new OkapiException("Error creating connector.", e);
		}

		IParameters connectorParams = connector.getParameters();
		if (connectorParams != null) { // Set the parameters only if the connector takes them
			connectorParams.fromString(params.getResourceParameters());
		}

		connector.setRootDirectory(rootDir); // Before open()
		connector.setParameters(connectorParams);
		connector.open();
		if ((sourceLocale != null) && (targetLocale != null)) {
			connector.setLanguages(sourceLocale, targetLocale);
		}

		if (connector instanceof ITMQuery) {
			ITMQuery tmConnector = (ITMQuery) connector;
			tmConnector.setThreshold(params.getThreshold());
			tmConnector.setMaximumHits(MAX_HITS);
		}

		return event;
	}

	/**
	 * Closes the connector opened at the start of the batch.
	 *
	 * @param event the END_BATCH event
	 * @return the same event, unchanged
	 */
	@Override
	protected Event handleEndBatch(Event event) {
		closeConnector();
		return event;
	}

	/**
	 * Leverages and releases whatever is still buffered for the document that is ending.
	 *
	 * @param event the END_DOCUMENT event
	 * @return a MULTI_EVENT containing the buffered events followed by the END_DOCUMENT
	 *         event, or the END_DOCUMENT event itself when nothing was buffered
	 */
	@Override
	protected Event handleEndDocument(Event event) {
		tuEventCount = 0;

		if (batchedEvents.isEmpty()) {
			return event;
		}

		batchLeverage();
		MultiEvent flushed = drainBatchedEvents();
		flushed.addEvent(event); // END_DOCUMENT goes last
		return new Event(EventType.MULTI_EVENT, flushed);
	}

	/** Moves every buffered event, in order, into a new MultiEvent and empties the buffer. */
	private MultiEvent drainBatchedEvents() {
		MultiEvent drained = new MultiEvent();
		for (Event buffered : batchedEvents) {
			drained.addEvent(buffered);
		}
		batchedEvents.clear();
		return drained;
	}

	/** Closes and forgets the current connector, if there is one. */
	private void closeConnector() {
		if (connector != null) {
			connector.close();
			connector = null;
		}
	}

	/**
	 * Tells whether a text unit should be sent to the connector.
	 *
	 * @param tu the text unit to test
	 * @return false when the unit is not translatable, its source has no text, its target
	 *         carries an "approved" property equal to "yes", or its target carries a
	 *         {@link DiffMatchAnnotation}; true otherwise
	 */
	private boolean canLeverageTu(ITextUnit tu) {
		// Do not leverage non-translatable entries
		if (!tu.isTranslatable()) {
			return false;
		}

		Property approvedProp = tu.getTargetProperty(targetLocale, Property.APPROVED);
		boolean approved = (approvedProp != null) && "yes".equals(approvedProp.getValue());

		// Do not leverage entries without text
		if (!tu.getSource().hasText()) {
			return false;
		}

		// Do not leverage pre-approved entries, nor entries already diff-leveraged
		return !approved && !wasDiffLeveraged(tu);
	}

	/**
	 * Sends all buffered, leverageable text units to the connector in one call and, when
	 * the fill-target option is on, copies qualifying matches into the targets.
	 */
	private void batchLeverage() {
		List<ITextUnit> tus = new LinkedList<>();
		for (Event buffered : batchedEvents) {
			if (buffered.getEventType() != EventType.TEXT_UNIT) {
				continue;
			}
			ITextUnit tu = buffered.getTextUnit();
			if (canLeverageTu(tu)) {
				tus.add(tu);
			}
		}

		if (tus.isEmpty()) {
			return;
		}

		connector.batchLeverage(tus);

		// now copy any matches above our threshold
		if (params.getFillTarget()) {
			for (ITextUnit tu : tus) {
				fillTarget(tu);
			}
		}
	}

	/**
	 * Fills the target of one leveraged text unit from its best alternate translation.
	 * Does nothing when the unit has no target for the target locale.
	 */
	private void fillTarget(ITextUnit tu) {
		// Get and check the target container
		TextContainer tc = tu.getTarget(targetLocale);
		if (tc == null) {
			return;
		}

		ISegments srcSegs = tu.getSource().getSegments();
		ISegments trgSegs = tc.getSegments();

		// Check for entries without text
		if (params.getCopySourceOnNoText()) {
			for (Segment srcSeg : srcSegs) {
				if (srcSeg.text.hasText(false)) {
					continue;
				}
				Segment trgSeg = trgSegs.get(srcSeg.id);
				if (trgSeg != null) {
					trgSeg.text = srcSeg.text.clone();
				}
			}
		}

		if (tu.getSource().hasBeenSegmented()) {
			for (Segment srcSeg : srcSegs) {
				Segment trgSeg = trgSegs.get(srcSeg.id);
				if (trgSeg == null) {
					// No matching target segment: nothing to fill (same guard as above)
					continue;
				}
				AltTranslation best = bestMatchAboveThreshold(
					trgSeg.getAnnotation(AltTranslationsAnnotation.class));
				if ((best != null) && QueryManager.canLeverage(srcSeg, trgSeg,
						params.getFillIfTargetIsEmpty(), params.getFillIfTargetIsSameAsSource())) {
					trgSeg.text = new TextFragment(
						best.getTarget().getCodedText(),
						best.getTarget().getFirstContent().getClonedCodes());
				}
			}
		}
		else {
			AltTranslation best = bestMatchAboveThreshold(
				tc.getAnnotation(AltTranslationsAnnotation.class));
			if ((best != null) && QueryManager.canLeverage(tu.getSource(), tc,
					params.getFillIfTargetIsEmpty(), params.getFillIfTargetIsSameAsSource())) {
				tu.setTargetContent(targetLocale, best.getTarget().getFirstContent());
			}
		}
	}

	/**
	 * Returns the first alternate translation of the annotation when its combined score
	 * reaches the fill-target threshold.
	 *
	 * @param ata the annotation to inspect; may be null
	 * @return the first alternate translation, or null when the annotation is null, has no
	 *         first entry, or the entry scores below the threshold
	 */
	private AltTranslation bestMatchAboveThreshold(AltTranslationsAnnotation ata) {
		if (ata == null) {
			return null;
		}
		AltTranslation first = ata.getFirst(); // First should be best
		if ((first == null) || (first.getCombinedScore() < params.getFillTargetThreshold())) {
			return null;
		}
		return first;
	}

	/**
	 * Tells whether the target of the text unit carries a {@link DiffMatchAnnotation}.
	 *
	 * @param tu the text unit to test
	 * @return true when a target exists for the target locale and has the annotation
	 */
	private boolean wasDiffLeveraged(ITextUnit tu) {
		TextContainer target = tu.getTarget(targetLocale);
		return (target != null) && (target.getAnnotation(DiffMatchAnnotation.class) != null);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy