All downloads are free. The search and download features use the official Maven repository.

net.sf.okapi.steps.rainbowkit.xliff.XLIFF2PackageWriter Maven / Gradle / Ivy

There is a newer version: 1.47.0
Show newest version
/*===========================================================================
  Copyright (C) 2011-2014 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.rainbowkit.xliff;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.ISkeleton;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.ZipUtil;
import net.sf.okapi.common.annotation.*;
import net.sf.okapi.common.annotation.Note.Annotates;
import net.sf.okapi.common.annotation.Note.Priority;
import net.sf.okapi.common.annotation.NoteAnnotation;
import net.sf.okapi.common.query.MatchType;
import net.sf.okapi.common.query.QueryResult;
import net.sf.okapi.common.resource.Code;
import net.sf.okapi.common.resource.DocumentPart;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.StartGroup;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextPart;
import net.sf.okapi.filters.rainbowkit.Manifest;
import net.sf.okapi.filters.rainbowkit.MergingInfo;
import net.sf.okapi.lib.xliff2.Const;
import net.sf.okapi.lib.xliff2.core.CTag;
import net.sf.okapi.lib.xliff2.core.Fragment;
import net.sf.okapi.lib.xliff2.core.IWithNotes;
import net.sf.okapi.lib.xliff2.core.Note.AppliesTo;
import net.sf.okapi.lib.xliff2.core.Part;
import net.sf.okapi.lib.xliff2.core.StartFileData;
import net.sf.okapi.lib.xliff2.core.StartGroupData;
import net.sf.okapi.lib.xliff2.core.StartXliffData;
import net.sf.okapi.lib.xliff2.core.Store;
import net.sf.okapi.lib.xliff2.core.Unit;
import net.sf.okapi.lib.xliff2.its.Domain;
import net.sf.okapi.lib.xliff2.its.ITSWriter;
import net.sf.okapi.lib.xliff2.matches.Match;
import net.sf.okapi.lib.xliff2.writer.XLIFFWriter;
import net.sf.okapi.steps.rainbowkit.common.BasePackageWriter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class XLIFF2PackageWriter extends BasePackageWriter {

	/** Name of the directory used as the root of the sub-directories of a TIP package. */
	public static final String POBJECTS_DIR = "pobjects";

	/** Prefix added to the stored skeleton of a referent that is a text unit. */
	private static final String TU_PREFIX = "$tu$";
	private final Logger LOGGER = LoggerFactory.getLogger(getClass());
	
	/** Writer of the XLIFF 2 document currently being generated (null between documents). */
	private XLIFFWriter writer;
	/**
	 * Skeleton strings of the referents seen in the current document, keyed by resource ID.
	 * Typed as String-to-String so lookups yield a String without a cast.
	 */
	private LinkedHashMap<String, String> referents;
	/** Writer options read at the start of the batch. */
	private XLIFF2Options options;
	/** Path of the XLIFF 2 file generated for the current document. */
	private String rawDocPath;
	/** Target locale taken from the manifest when a document starts. */
	private LocaleId trgLoc;

	/**
	 * Creates a new package writer set up for the XLIFF 2 extraction type.
	 */
	public XLIFF2PackageWriter () {
		super(Manifest.EXTRACTIONTYPE_XLIFF2);
	}

	/**
	 * Reads the writer options, sets the sub-directories of the package and the TMX
	 * information, then runs the base start-of-batch process.
	 */
	@Override
	protected void processStartBatch () {
		// Start from fresh options and override them if the parameters provide some
		options = new XLIFF2Options();
		String writerOptions = params.getWriterOptions();
		if ( !Util.isEmpty(writerOptions) ) {
			options.fromString(writerOptions);
		}

		final boolean tipPackage = options.getCreateTipPackage();
		if ( !tipPackage ) {
			// Normal package layout
			manifest.setSubDirectories("original", "work", "work", "done", null, "skeleton", false);
		}
		else {
			// TIP package layout: everything goes under the project objects directory
			manifest.setGenerateTIPManifest(true);
			manifest.setSubDirectories(
				POBJECTS_DIR+"/input",
				POBJECTS_DIR+"/bilingual",
				POBJECTS_DIR+"/bilingual",
				POBJECTS_DIR+"/output",
				POBJECTS_DIR+"/tm",
				POBJECTS_DIR+"/skeleton",
				false);
		}

		// A TM is created only when generating a TIP package
		setTMXInfo(tipPackage, null, false, false, false);
		super.processStartBatch();
	}
	
	/**
	 * Indicates if the writer options request the creation of a TIP package.
	 * This is used for the final zip. The spelling of the method name is part of
	 * the public interface and is left as it is.
	 * @return the value of the create-TIP-package option.
	 */
	public boolean getCreeatTipPackage () {
		final boolean createTip = options.getCreateTipPackage();
		return createTip;
	}
	
	/**
	 * Runs the base end-of-batch process and, when a TIP package is requested,
	 * saves the TIP manifest with the names of the non-empty TMs, zips the
	 * project objects directory and deletes the original directory.
	 */
	@Override
	protected void processEndBatch () {
		// Base process
		super.processEndBatch();
		
		// Everything else is TIP-specific
		if ( !options.getCreateTipPackage() ) {
			return;
		}

		// Gather the list of TMs created: only the ones with at least one item are listed
		ArrayList<String> tms = new ArrayList<>();
		if (( tmxWriterApproved != null ) && ( tmxWriterApproved.getItemCount() > 0 )) {
			tms.add(Util.getFilename(tmxPathApproved, true));
		}
		if (( tmxWriterAlternates != null ) && ( tmxWriterAlternates.getItemCount() > 0 )) {
			tms.add(Util.getFilename(tmxPathAlternates, true));
		}
		if (( tmxWriterLeverage != null ) && ( tmxWriterLeverage.getItemCount() > 0 )) {
			tms.add(Util.getFilename(tmxPathLeverage, true));
		}
		if (( tmxWriterUnApproved != null ) && ( tmxWriterUnApproved.getItemCount() > 0 )) {
			tms.add(Util.getFilename(tmxPathUnApproved, true));
		}

		// Save the TIP manifest
		manifest.saveTIPManifest(manifest.getTempPackageRoot(), tms);

		// Zip the project files
		String dir = manifest.getTempPackageRoot()+POBJECTS_DIR;
		ZipUtil.zipDirectory(dir, ".zip");

		// Delete the original
		Util.deleteDirectory(dir, false);
		// The creation of the .tipp file is done at the step level
		// otherwise to be done after the directory is freed from locks
	}
	
	/**
	 * Runs the base start-of-document process, then creates the XLIFF 2 writer
	 * for the document, writes the start of the XLIFF document and prepares the
	 * data of the first file element.
	 * @param event the start-document event.
	 */
	@Override
	protected void processStartDocument (Event event) {
		super.processStartDocument(event);
		
		writer = new XLIFFWriter();
		referents = new LinkedHashMap<>();

		// The output goes in the temporary source directory, with an extra .xlf extension
		MergingInfo info = manifest.getItem(docId);
		rawDocPath = manifest.getTempSourceDirectory() + info.getRelativeInputPath() + ".xlf";

		// Writer's options
		writer.setWithOriginalData(options.getwithOriginalData());
		writer.setUseIndentation(true);

		// Creation of the output
		trgLoc = manifest.getTargetLocale();
		Util.createDirectories(rawDocPath); //TODO: This should be done by the writer. To change when it's implemented properly.
		File output = new File(rawDocPath);
		writer.create(output, manifest.getSourceLocale().toBCP47(), trgLoc.toBCP47());

		// Start of the XLIFF document, with the ITS declaration if requested
		StartXliffData xliffData = new StartXliffData(null);
		if ( options.getIncludeIts() ) {
			ITSWriter.addDeclaration(xliffData);
		}
		writer.writeStartDocument(xliffData, null);

		// Original: use the document name if there is one (null is allowed)
		// For now we don't set ID for the files, the writer will generate them
		StartFileData fileData = new StartFileData(null);
		fileData.setOriginal(event.getStartDocument().getName());
		writer.setStartFileData(fileData);
	}
	
	/**
	 * Ends and closes the XLIFF 2 document, and releases the per-document data.
	 * @param event the end-document event.
	 * @return the given event, or, if the parameters ask to send the output,
	 * the result of creating the raw document event set for the generated file.
	 */
	@Override
	protected Event processEndDocument (Event event) {
		// Finish the output
		writer.writeEndDocument();
		writer.close();
		writer = null;

		// Free the referents of this document
		referents.clear();
		referents = null;
		
		if ( !params.getSendOutput() ) {
			return event;
		}
		return super.creatRawDocumentEventSet(rawDocPath, "UTF-8",
			manifest.getSourceLocale(), manifest.getTargetLocale());
	}

	/**
	 * Prepares the file data for a sub-document without starting the file element.
	 * @param event the start-sub-document event.
	 */
	@Override
	protected void processStartSubDocument (Event event) {
		// The file is not started explicitly here: the first unit triggers it,
		// otherwise we may get empty file elements.
		// Only the original is set (case of the DOCX-type documents with sub-documents)
		String original = event.getStartSubDocument().getName();
		StartFileData fileData = new StartFileData(null);
		fileData.setOriginal(original);
		writer.setStartFileData(fileData);
	}
	
	/**
	 * Ends the file element corresponding to the sub-document.
	 * @param event the end-sub-document event (not used).
	 */
	@Override
	protected void processEndSubDocument (Event event) {
		// This call is safe even when writeStartFile() was not called
		writer.writeEndFile();
	}
	
	/**
	 * Writes the start of a group.
	 * @param event the event to process: a start-group event, or another
	 * event for which the start of a group must be written without data.
	 */
	@Override
	protected void processStartGroup (Event event) {
		// BasePackageWriter calls this method when the Event is START_SUBFILTER also.
		// In that case there is no StartGroup resource to convert.
		if ( !event.isStartGroup() ) {
			writer.writeStartGroup(null);
			return;
		}
		// True START_GROUP: convert the resource and write it
		StartGroup group = event.getStartGroup();
		writer.writeStartGroup(toXLIFF2StartGroupData(group));
	}
	
	/**
	 * Writes the end of the current group.
	 * @param event the end-group event (not used).
	 */
	@Override
	protected void processEndGroup (Event event) {
		writer.writeEndGroup();
	}
	
	/**
	 * Processes a text unit: stores it if it is a referent, writes its XLIFF 2
	 * representation and passes it to the TMX output.
	 * @param event the text unit event.
	 */
	@Override
	protected void processTextUnit (Event event) {
		ITextUnit textUnit = event.getTextUnit();
		// Remember the referents so the references to them can be resolved later
		if ( textUnit.isReferent() ) {
			storeReferent(textUnit);
		}
		// Output the unit
		writer.writeUnit(toXLIFF2Unit(textUnit));
		// Output the TMX entries
		writeTMXEntries(event.getTextUnit());
	}
	
	/**
	 * Processes a document part: it is stored if it is a referent, nothing is written.
	 * @param event the document part event.
	 */
	@Override
	protected void processDocumentPart (Event event) {
		DocumentPart part = event.getDocumentPart();
		if ( !part.isReferent() ) {
			return; // Nothing to remember
		}
		storeReferent(part);
	}

	/**
	 * Stores the skeleton string of a referent under the ID of its resource.
	 * The string is prefixed with {@link #TU_PREFIX} when the resource is a text unit.
	 * Nothing is stored if the resource has no skeleton.
	 * @param res the resource to store.
	 */
	private void storeReferent (IResource res) {
		ISkeleton skeleton = res.getSkeleton();
		if ( skeleton == null ) {
			return;
		}
		String id = res.getId();
		String data = skeleton.toString();
		// Text units are flagged so they can be told apart from the other referents
		boolean isTextUnit = ( res instanceof ITextUnit );
		referents.put(id, isTextUnit ? TU_PREFIX+data : data);
	}

	/**
	 * Gets the IDs of the objects referenced from a skeleton string.
	 * <p>Each reference marker found in the text is looked up in the stored referents:
	 * <ul>
	 * <li>a marker for "$self$" is ignored,</li>
	 * <li>the ID of an unknown referent or of a text unit is added to the result,</li>
	 * <li>for any other referent, the references found in its own skeleton are added
	 * (recursively).</li>
	 * </ul>
	 * @param text the initial skeleton string (can be null).
	 * @return a space-separated list of IDs (empty if there is no reference),
	 * or null if the given text is null.
	 */
	private String getReferences (String text) {
		if ( text == null ) return null;
		StringBuilder ids = new StringBuilder();
		StringBuilder data = new StringBuilder(text);
		Object[] res;
		// Consume the reference markers one by one until there is none left
		while ( (res = TextFragment.getRefMarker(data)) != null ) {
			String refId = (String)res[0];
			if ( !refId.equals("$self$") ) {
				String skel = referents.get(refId);
				if (( skel == null ) || skel.startsWith(TU_PREFIX) ) {
					// Unknown referent or text unit: the ID itself is the reference
					ids.append(refId).append(' ');
				}
				else {
					// Other referent: gather the references of its own skeleton
					String refs = getReferences(skel);
					// Skip empty results so the list never gets doubled separators
					if (( refs != null ) && !refs.isEmpty() ) {
						ids.append(refs).append(' ');
					}
				}
			}
			// Remove this marker and check for the next one
			data.delete((Integer)res[1], (Integer)res[2]);
		}
		return ids.toString().trim();
	}
	
	/**
	 * Closes the XLIFF 2 writer if one is still open. Does nothing otherwise.
	 */
	@Override
	public void close () {
		if ( writer == null ) {
			return; // Nothing to close
		}
		writer.close();
		writer = null;
	}

	/**
	 * Gets the name of this writer.
	 * @return the fully qualified name of the runtime class of this object.
	 */
	@Override
	public String getName () {
		final Class<?> type = getClass();
		return type.getName();
	}
	
	/**
	 * Converts the objects handling Notes from XLIFF 1.2 based class to XLIFF 2 based class.
	 * Attributes are mapped according to the following table.
	 * <table border="1">
	 * <tr><th>XLIFF 1.2</th><th>XLIFF 2</th><th>Value Mapping Rules</th></tr>
	 * <tr><td>annotates</td><td>appliesTo</td><td>identical if not "general"; empty if "general"</td></tr>
	 * <tr><td>from</td><td>category</td><td>identical</td></tr>
	 * <tr><td>priority</td><td>priority</td><td>identical</td></tr>
	 * </table>
	 * @param xl12Notes The XLIFF 1.2's way to keep multiple notes. (Input)
	 * @param xl2NotesAware The XLIFF 2's way to keep multiple notes. (Output)
	 */
	private void convertNotesFromXliff12ToXliff2(NoteAnnotation xl12Notes, IWithNotes xl2NotesAware) {
		if (xl12Notes == null) {
			return; // No notes to convert
		}
		for (Note oldNote : xl12Notes) {
			net.sf.okapi.lib.xliff2.core.Note newNote =
				new net.sf.okapi.lib.xliff2.core.Note(oldNote.getNoteText());

			// annotates -> appliesTo (nothing is set for "general" or when there is no value)
			Annotates scope = oldNote.getAnnotates();
			if (scope == Annotates.SOURCE) {
				newNote.setAppliesTo(AppliesTo.SOURCE);
			}
			else if (scope == Annotates.TARGET) {
				newNote.setAppliesTo(AppliesTo.TARGET);
			}

			// from -> category (only when there is a value)
			String origin = oldNote.getFrom();
			if (!(origin == null || origin.isEmpty())) {
				newNote.setCategory(origin);
			}

			// priority -> priority
			Priority level = oldNote.getPriority();
			if (level != null) {
				newNote.setPriority(level.value());
			}

			xl2NotesAware.addNote(newNote);
		}
	}

	/** * Convert XLIFF 1.2 based Okapi StartGroup to XLIFF 2.0 based StartGroupData. *
* Caveat: This is implemented to support Notes. Only minimum information required * for this purpose is copied. * Unlike toXLIFF2Unit, this method only supports annotation saved in XLIFF2NoteAnnotation. * Annotation saved in GenericAnnotations or Property objects is ignored. * @param sg XLIFF 1.2 based resource for the START_GROUP event * @return XLIFF 2.0 based resource */ protected StartGroupData toXLIFF2StartGroupData(StartGroup sg) { // Not sure why it should be protected rather than private. I'm just following toXLIFF2Unit. (Kuro) StartGroupData sgd = new StartGroupData(sg.getId()); sgd.setName(sg.getName()); convertNotesFromXliff12ToXliff2(sg.getAnnotation(NoteAnnotation.class), sgd); return sgd; } protected Unit toXLIFF2Unit (ITextUnit tu) { Unit unit = new Unit(tu.getId()); boolean doEliminateEmptyTargetsWithNonEmptySource = options.getEliminateEmptyTargetsWithNonEmptySource(); TextContainer srcTc = tu.getSource(); TextContainer trgTc = null; if ( tu.hasTarget(manifest.getTargetLocale()) ) { trgTc = tu.getTarget(manifest.getTargetLocale()); if ( trgTc.getSegments().count() != srcTc.getSegments().count() ) { // Use un-segmented entry if we have different number of segments LOGGER.warn("Text unit id='{}' has different number of segments in source and target.\n" +"This entry will be output un-segmented.", tu.getId()); srcTc = tu.getSource().clone(); srcTc.joinAll(); trgTc = tu.getTarget(manifest.getTargetLocale()).clone(); trgTc.joinAll(); } } if ( !Util.isEmpty(tu.getType()) ) { unit.setType("okp:"+tu.getType().replace(':', '-')); } if ( !Util.isEmpty(tu.getName()) ) { unit.setName(tu.getName()); } unit.setTranslate(tu.isTranslatable()); // Add trans-unit level note if needed boolean noteDone = false; GenericAnnotations anns = tu.getAnnotation(GenericAnnotations.class); if ( anns != null ) { GenericAnnotation ga = anns.getFirstAnnotation(GenericAnnotationType.LOCNOTE); if ( ga != null ) { net.sf.okapi.lib.xliff2.core.Note note = new 
net.sf.okapi.lib.xliff2.core.Note(ga.getString(GenericAnnotationType.LOCNOTE_VALUE), net.sf.okapi.lib.xliff2.core.Note.AppliesTo.UNDEFINED); if ( !"alert".equals(ga.getString(GenericAnnotationType.LOCNOTE_TYPE)) ) note.setPriority(2); unit.addNote(note); noteDone = true; } } // Transfer notes stored in NoteAnnotation. This is the new way of representing Notes. convertNotesFromXliff12ToXliff2(tu.getAnnotation(NoteAnnotation.class), unit); if ( anns != null ) { // Storage Size // GenericAnnotation ga = anns.getFirstAnnotation(GenericAnnotationType.STORAGESIZE); // if ( ga != null ) { // unit.getExtAttributes().setAttribute(Names.NS_ITS, "storageSize", // ga.getString(GenericAnnotationType.STORAGESIZE_SIZE)); // String tmp = ga.getString(GenericAnnotationType.STORAGESIZE_ENCODING); // if ( !tmp.equals("UTF-8") ) { // unit.getExtAttributes().setAttribute(Names.NS_ITS, "storageEncoding", tmp); // } // tmp = ga.getString(GenericAnnotationType.STORAGESIZE_LINEBREAK); // if ( !tmp.equals("lf") ) { // unit.getExtAttributes().setAttribute(Names.NS_ITS, "lineBreakType", tmp); // } // } // Domain GenericAnnotation ga = anns.getFirstAnnotation(GenericAnnotationType.DOMAIN); if ( ga != null ) { unit.getITSItems().add(new Domain(ga.getString(GenericAnnotationType.DOMAIN_VALUE))); } // Allowed characters ga = anns.getFirstAnnotation(GenericAnnotationType.ALLOWEDCHARS); if ( ga != null ) { unit.getExtAttributes().setAttribute(Const.NS_ITS, "allowedCharacters", ga.getString(GenericAnnotationType.ALLOWEDCHARS_VALUE)); } // // External Resource reference // ga = anns.getFirstAnnotation(GenericAnnotationType.EXTERNALRES); // if ( ga != null ) { // unit.getExtAttributes().setAttribute(Namespaces.ITSXLF_NS_URI, "externalResourceRef", // ga.getString(GenericAnnotationType.EXTERNALRES_VALUE)); // } } unit.setTranslate(tu.isTranslatable()); // Go through the parts: Use the source to drive the order // But match on segment ids TextPart part; ISegments trgSegs = null; if ( trgTc != null ) { 
trgSegs = trgTc.getSegments(); } int srcSegIndex = -1; for ( int i=0; i codes = tf.getCodes(); int index; Code code; boolean mayOverlapDefault = false; // Most spanning codes may not overlap for ( int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy