All downloads are free. The search and download features use the official Maven repository.

net.sf.okapi.steps.textmodification.TextModificationStep Maven / Gradle / Ivy

/*===========================================================================
  Copyright (C) 2009-2012 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.textmodification;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextPart;
import net.sf.okapi.common.resource.TextUnitUtil;

@UsingParameters(Parameters.class)
public class TextModificationStep extends BasePipelineStep {

	// Character placed in front of each segment when segments are marked
	private static final char STARTSEG = '[';
	// Character placed after each segment when segments are marked
	private static final char ENDSEG = ']';
	// Upper bound used when validating the script index for character replacement
	private static final int SCRIPT_MAX = 3;

	// Options of this step (a default set is created by the constructor)
	private Parameters params;
	// Target locale of the text to modify, injected through setTargetLocale()
	private LocaleId targetLocale;
	// Per-script source characters; oldChars[s].charAt(k) is replaced by newChars[s].charAt(k)
	private final String[] oldChars;
	// Per-script replacement characters, parallel to oldChars
	private final String[] newChars;

	/**
	 * Creates a new step with default parameters and builds the character
	 * substitution tables used for the extended-characters replacement.
	 * <p>For each script index <code>s</code>, <code>oldChars[s]</code> and
	 * <code>newChars[s]</code> are parallel strings of the same length: the
	 * character at position <code>k</code> of the first is replaced by the
	 * character at position <code>k</code> of the second.
	 */
	public TextModificationStep () {
		params = new Parameters();
		
		// One table per script index, from 0 to SCRIPT_MAX inclusive
		oldChars = new String[SCRIPT_MAX + 1];
		newChars = new String[SCRIPT_MAX + 1];
		// Latin extended characters
		// Each pair is (capital, small). 'J' maps to U+0134 and 'j' to U+0135:
		// these two used to be swapped, which inverted the case of every J/j.
		oldChars[0] = "AaBbCcDdEeFfGgHhIiJjKkLlNnOoPpQqRrSsTtUuWwYyZz";
		newChars[0] = "\u00c0\u00e0\u00df\u0180\u0106\u0107\u010e\u010f\u0112\u0113\u0191\u0192\u011c\u011d\u0124\u0125"
			+ "\u0128\u0129\u0134\u0135\u0136\u0137\u0139\u013a\u0143\u0144\u014c\u014d\u01a4\u01a5\u01ea\u01eb\u0154\u0155"
			+ "\u015a\u015b\u0162\u0163\u0168\u0169\u0174\u0175\u0176\u0177\u0179\u017a";
		// Cyrillic characters
		oldChars[1] = "AaEeIiOoUuYyBbVvPpKkSsNnDdFfGgHhJjLlMmQqRrTtWwZzCcXx";
		newChars[1] = "\u0410\u0430\u0415\u0435\u0418\u0438\u041e\u043e\u0423\u0443\u042e\u044e\u0411\u0431\u0412\u0432"
			+ "\u041f\u043f\u041a\u043a\u0421\u0441\u041d\u043d\u0414\u0434\u0424\u0444\u0413\u0433\u0425\u0445\u0419\u0439"
			+ "\u041b\u043b\u041c\u043c\u0428\u0448\u0420\u0440\u0422\u0442\u042f\u044f\u0417\u0437\u0426\u0446\u0429\u0449";
		// Arabic characters (the same replacement is used for both cases of a letter;
		// digits and a few punctuation marks are mapped as well)
		oldChars[2] = "AaBbTtGgHhDdRrMmNnLlQqKkSsVvWwXxYyZzCcFfJjPpEeIiOoUu0123456789%?;,";
		newChars[2] = "\u0627\u0627\u0628\u0628\u062a\u062a\u062c\u062c\u062d\u062d\u062f\u062f\u0631\u0631\u0645\u0645"
			+ "\u0646\u0646\u0644\u0644\u0642\u0642\u0643\u0643\u0633\u0633\u062e\u062e\u0648\u0648\u0632\u0632\u064a\u064a"
			+ "\u0638\u0638\u0635\u0635\u0641\u0641\u063a\u063a\u0630\u0630\u0647\u0647\u0639\u0639\u0636\u0636\u0634\u0634"
			+ "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669\u066a\u061f\u061b\u060c";
		// Han characters (Simplified); the same replacement is used for both cases of a letter
		oldChars[3] = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
		newChars[3] = "\u35F2\u35F2\u3737\u3737\u3DFF\u3DFF\u4039\u4039\u4150\u4150\u42D4\u42D4\u6E26\u6E26\u6E88\u6E88"
			+ "\u6EB3\u6EB3\u6F38\u6F38\u6F70\u6F70\u6FAE\u6FAE\u6FF0\u6FF0\u7121\u7121\u7189\u7189\u71D2\u71D2\u721B\u721B"
			+ "\u7258\u7258\u7372\u7372\u73FC\u73FC\u74DA\u74DA\u7587\u7587\u760D\u760D\u93C7\u93C7\u93F7\u93F7\u9F7E\u9F7E";
	}
	
	/**
	 * Sets the target locale whose content this step modifies.
	 * The annotation maps this setter to the TARGET_LOCALE step parameter type.
	 * @param targetLocale the target locale to use.
	 */
	@StepParameterMapping(parameterType = StepParameterType.TARGET_LOCALE)
	public void setTargetLocale (LocaleId targetLocale) {
		this.targetLocale = targetLocale;
	}
	
	/**
	 * Gets the display name of this step.
	 * @return the fixed string "Text Modification".
	 */
	@Override
	public String getName () {
		return "Text Modification";
	}

	/**
	 * Gets a short description of this step, including the kind of events
	 * it expects and sends back.
	 * @return the fixed description text.
	 */
	@Override
	public String getDescription () {
		return "Apply various modifications to the text units content of a document."
			+ " Expects: filter events. Sends back: filter events.";
	}

	/**
	 * Gets the parameters currently used by this step.
	 * @return the current parameters object.
	 */
	@Override
	public Parameters getParameters () {
		return params;
	}

	/**
	 * Sets the parameters to use for this step.
	 * @param params the new parameters. The object is cast to {@link Parameters},
	 * so passing any other implementation fails with a ClassCastException.
	 */
	@Override
	public void setParameters (IParameters params) {
		this.params = (Parameters)params;
	}
 
	/**
	 * Applies the configured modifications to the target of a text unit.
	 * <p>The text unit is left untouched when it is not translatable, when it
	 * already has a target and existing targets are not to be modified, or when
	 * it has no text and blank entries are not to be modified. Otherwise the
	 * target is created (from the source if needed) and the steps run in this
	 * order: main modification, expansion, segment marks, prefix/suffix.
	 * @param event the text unit event to process.
	 * @return the same event, with its text unit possibly modified.
	 */
	@Override
	protected Event handleTextUnit (Event event) {
		final ITextUnit unit = event.getTextUnit();

		// Non-translatable content is never modified
		if ( !unit.isTranslatable() ) {
			return event;
		}
		// An existing translation is modified only when the options allow it
		if ( !params.getApplyToExistingTarget() && unit.hasTarget(targetLocale) ) {
			return event;
		}
		// Entries without text are modified only when the options allow it
		if ( !params.getApplyToBlankEntries() ) {
			TextContainer existing = unit.getTarget(targetLocale);
			TextContainer reference = ( existing != null ) ? existing : unit.getSource();
			if ( !reference.hasText() ) {
				return event;
			}
		}

		// Make sure there is a target to work on
		unit.createTarget(targetLocale, false, IResource.COPY_ALL);
		// A target that is empty, or that holds no text (likely a copy of the
		// inline codes only), is overwritten with the source
		TextContainer target = unit.getTarget(targetLocale);
		if ( target.isEmpty() || !target.hasText() ) {
			unit.createTarget(targetLocale, true, IResource.COPY_ALL);
		}

		// Main modification
		switch ( params.getType() ) {
		case Parameters.TYPE_XNREPLACE:
			replaceWithXN(unit);
			break;
		case Parameters.TYPE_EXTREPLACE:
			replaceWithExtendedChars(unit);
			break;
		case Parameters.TYPE_KEEPINLINE:
			removeText(unit);
			break;
		default:
			// Any other type leaves the text as it is
			break;
		}

		// Optional expansion of the text
		if ( params.getExpand() ) {
			expand(unit);
		}

		// Optional marks around each segment
		if ( params.getMarkSegments() ) {
			addSegmentMarks(unit);
		}

		// Optional text added at the start and/or the end of the paragraph
		boolean wantsExtraText = params.getAddPrefix() || params.getAddSuffix()
			|| params.getAddName() || params.getAddID();
		if ( wantsExtraText ) {
			addText(unit);
		}

		return event;
	}

	/**
	 * Removes the text but leaves the inline code.
	 * @param tu the text unit to process.
	 */
	// NOTE(review): this span is corrupted. It starts as removeText() but ends
	// with what looks like the tail of the character-substitution method; the
	// text in between appears to have been lost when the file was extracted
	// (everything from a '<' up to a later '>' seems to have been stripped).
	// replaceWithXN() and the start of replaceWithExtendedChars(), both called
	// from handleTextUnit(), are not present. Restore this span from the
	// original source before compiling.
	private void removeText (ITextUnit tu) {
		for ( TextPart part : tu.getTarget(targetLocale) ) {
			StringBuilder sb = new StringBuilder();
			// Remove the text inside the part
			String text = part.text.getCodedText();
			// NOTE(review): truncated line: the loop header is cut off after 'i' and
			// resumes inside a comparison against SCRIPT_MAX that presumably
			// validates the script index (charDest) -- TODO confirm against the original
			for ( int i=0; i SCRIPT_MAX ) {
			charDest = 0; // Just making sure
		}
		
		// For each part of the target: work on a mutable copy of its coded text
		for ( TextPart part : tu.getTarget(targetLocale) ) {
			StringBuilder sb = new StringBuilder(part.text.getCodedText());
			// NOTE(review): truncated line: the loop header and the lookup that sets 'n'
			// are missing; presumably 'n' is the position of the current character in
			// oldChars[charDest] -- TODO confirm against the original
			for ( int i=0; i -1 ) {
						// Substitute the character with its counterpart in the selected script
						sb.setCharAt(i, newChars[charDest].charAt(n));
					}
				}
			}
			part.text.setCodedText(sb.toString());
		}
	}

	/**
	 * Encloses the coded text of each segment of the target between the
	 * segment start and end marks.
	 * @param tu the text unit to process.
	 */
	private void addSegmentMarks (ITextUnit tu) {
		for ( Segment segment : tu.getTarget(targetLocale).getSegments() ) {
			String original = segment.text.getCodedText();
			StringBuilder marked = new StringBuilder();
			marked.append(STARTSEG).append(original).append(ENDSEG);
			segment.text.setCodedText(marked.toString());
		}
	}
	
	/**
	 * Adds the optional prefix to the first content of the target and the
	 * optional name, id and suffix (in that order) to its last content.
	 * When the name is requested but the text unit has none, its id is used instead.
	 * This method assumes the main modification has already been applied.
	 * @param tu The text unit to process.
	 */
	private void addText (ITextUnit tu) {
		// Start of the paragraph
		if ( params.getAddPrefix() ) {
			TextFragment head = tu.getTarget(targetLocale).getFirstContent();
			String prefixed = params.getPrefix() + head.getCodedText();
			head.setCodedText(prefixed);
		}

		// End of the paragraph
		TextFragment tail = tu.getTarget(targetLocale).getLastContent();
		if ( params.getAddName() ) {
			String name = tu.getName();
			// Fall back to the id when there is no name
			String label = Util.isEmpty(name) ? tu.getId() : name;
			tail.setCodedText(tail.getCodedText() + "_"+label);
		}
		if ( params.getAddID() ) {
			String withId = tail.getCodedText() + "_"+tu.getId();
			tail.setCodedText(withId);
		}
		if ( params.getAddSuffix() ) {
			String suffixed = tail.getCodedText() + params.getSuffix();
			tail.setCodedText(suffixed);
		}
	}

	/**
	 * Lengthens the target text by an amount derived from the length of the source:
	 * as many characters as the source text for sources longer than 20 characters,
	 * half of that (rounded up) otherwise.
	 * @param tu the text unit to process.
	 */
	private void expand (ITextUnit tu) {
		// Get the total length of the original
		int length = getLength(tu.getSource());
		// Calculate the number of characters to add
		int addition = length; // 100% for long strings
		if ( length <= 20 ) { // 50% (or at least 1 char) for short strings
			addition = (addition+1) / 2; 
		}
		
		// Create the string to add
		StringBuilder extra = new StringBuilder();
		// NOTE(review): truncated line: the loop that fills 'extra' and the
		// declarations of 'frag', 'ct' and 'p' used below are missing (text from
		// a '<' up to a later '>' appears to have been stripped). Presumably 'frag'
		// is a content fragment of the target, 'ct' its coded text and 'p' an
		// insertion position found in 'ct' -- TODO restore from the original source
		for ( int i=0; i -1 ) {
			// A position was found: insert the extra text there
			StringBuilder tmp = new StringBuilder(ct);
			tmp.insert(p, extra);
			frag.setCodedText(tmp.toString());
		}
		else {
			// Otherwise append the extra text at the end
			frag.setCodedText(ct+extra);
		}
	}
	
	/**
	 * Gets the length of the text of a container.
	 * The first content is measured when the container is a single segment,
	 * otherwise a copy of the un-segmented content is measured. The text is
	 * obtained through TextUnitUtil.getText().
	 * @param tc the container to measure.
	 * @return the number of characters of the extracted text.
	 */
	private int getLength (TextContainer tc) {
		TextFragment content = tc.contentIsOneSegment()
			? tc.getFirstContent()
			: tc.getUnSegmentedContentCopy();
		return TextUnitUtil.getText(content).length();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy