eu.project.ttc.utils.AlignerUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of termsuite-core Show documentation
A Java UIMA-based toolbox for multilingual and efficient terminology extraction an multilingual term alignment
There is a newer version: 3.0.10
/*******************************************************************************
 * Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.utils;

import java.util.Collection;
import java.util.Set;

import com.google.common.collect.Sets;

import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.index.CustomTermIndex;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.models.index.TermMeasure;
import eu.project.ttc.resources.BilingualDictionary;

public class AlignerUtils {
	public static final int TRANSLATION_STRATEGY_PRORATA = 1;
	public static final int TRANSLATION_STRATEGY_MOST_FREQUENT = 2;
	public static final int TRANSLATION_STRATEGY_MOST_SPECIFIC = 3;
	private static final int TRANSLATION_STRATEGY_EQUI_REPARTITION = 4;
	
	
	/**
	 *
	 * Translates all {@link ContextVector} components (i.e. its coTerms) into
	 * the target language of this aligner by the mean of one of the available 
	 * strategy :
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_FREQUENT}
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_PRORATA}
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_EQUI_REPARTITION} 
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_SPECIFIC} 
	 *
	 * @see BilingualDictionary
	 * @param sourceVector
	 * 			The source context vector object to be translated into target language
	 * @param dictionary
	 * 			The dico used in the translation process
	 * @param translationStrategy
	 * 			The translation strategy of the sourceVector. 
	 * 			Two possible values: {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_FREQUENT}
	 * 							     {@link AlignerUtils#TRANSLATION_STRATEGY_PRORATA} 
	 * 							     {@link AlignerUtils#TRANSLATION_STRATEGY_EQUI_REPARTITION} 
	 * 							     {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_SPECIFIC} 
	 * @return
	 * 			The translated context vector
	 */
	public static ContextVector translateVector(ContextVector sourceVector, 
			BilingualDictionary dictionary, int translationStrategy, TermIndex targetTermino) {
		ContextVector targetVector = new ContextVector();
		CustomTermIndex swtLemmaIndex = targetTermino.getCustomIndex(TermIndexes.SINGLE_WORD_LEMMA);
		
		for(ContextVector.Entry entry:sourceVector.getEntries()) {
			Set translations = Sets.newHashSet();
			for(String targetLemma:dictionary.getTranslations(entry.getCoTerm().getLemma())) {
				Collection translatedTerms = swtLemmaIndex.getTerms(targetLemma);
				if(!translatedTerms.isEmpty()) 
					translations.add(translatedTerms.iterator().next());
			}
			switch (translationStrategy) {
			case TRANSLATION_STRATEGY_PRORATA:
				fillTargetVectorSProrata(targetVector, entry, translations);
				break;
			case TRANSLATION_STRATEGY_MOST_FREQUENT:
				fillTargetVectorSMost(targetVector, entry, translations, targetTermino.getFrequencyMeasure());
				break;
			case TRANSLATION_STRATEGY_MOST_SPECIFIC:
				fillTargetVectorSMost(targetVector, entry, translations, targetTermino.getWRMeasure());
				break;
			case TRANSLATION_STRATEGY_EQUI_REPARTITION:
				fillTargetVectorSEquiRepartition(targetVector, entry, translations);
				break;
			default:
				throw new IllegalArgumentException("Invalid translation strategy: " + translationStrategy);
			}
		}
		return targetVector;
	}
	

	/**
	 * This method implements the strategy {@link #TRANSLATION_STRATEGY_PRORATA} 
	 * for context vector translation.
	 * 
	 * Explanation of strategy:
	 * 
	 * Example of source term in french : chat 
	 * 
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 * 
	 * Suppose that frequencies in target term index are : 
	 *   - black : 35
	 *   - dark : 15
	 *   - dog : 7
	 *   
	 * The translated vector would be : 
	 * 
	 * because :
	 *   - total frequency in target term index for term "noir" is 35 + 15 = 50,
	 *     and 7 = ( 35 / 50 ) * 10 for "black"
	 *     and 3 = ( 15 / 50 ) * 10 for "dark"
	 *   - total frequency in target term index for term "dog" is 7,
	 *     and 3 = ( 7 / 7 ) * 3
	 *     
	 * 
	 * @param translatedVector
	 * 			the target vector to be fill 
	 * @param sourceTermEntry
	 * 			the source vector's component to translated and add to target vector
	 * @param candidateTranslations
	 * 			the candidate translations of the sourceTermEntry given by the
	 * 			bilingual dictionary.
	 */
	private static void fillTargetVectorSProrata(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set candidateTranslations) {
		/*
		 * Do the cross product of translation frequencies
		 */
		int totalFreqInTargetTermino = 0;
		for(Term tt : candidateTranslations) 
			totalFreqInTargetTermino += tt.getFrequency();
		
		for(Term targetTerm:candidateTranslations) {
			int prorataCooccs = targetTerm.getFrequency() * sourceTermEntry.getNbCooccs() / totalFreqInTargetTermino;
			translatedVector.addEntry(targetTerm, prorataCooccs, sourceTermEntry.getAssocRate());
		}
	}
	
	/**
	 * This method implements the {@value #TRANSLATION_STRATEGY_MOST_FREQUENT} 
	 * strategy for context vector translation.
	 * 
	 * 
	 * Explanation of strategy:
	 * 
	 * Example of source term in french : chat 
	 * 
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 * 
	 * Suppose that frequencies in target term index are : 
	 *   - black : 35
	 *   - dark : 15
	 *   - dog : 7
	 *   
	 * The translated vector would be : 
	 * 
	 * @param translatedVector
	 * 			the target vector to be fill 
	 * @param sourceTermEntry
	 * 			the source vector's component to translated and add to target vector
	 * @param candidateTranslations
	 * 			the candidate translations of the sourceTermEntry given by the
	 * 			bilingual dictionary.
	 * @param termMeasure 
	 * 
	 */
	private static void fillTargetVectorSMost(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set candidateTranslations, TermMeasure termMeasure) {
		fillTargetVectorWithMostProperty(translatedVector, sourceTermEntry,
				candidateTranslations, termMeasure);
	}
	
	
	/**
	 * 
	 * Explanation of strategy:
	 * 
	 * Example of source term in french : chat 
	 * 
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 * 
	 *   
	 * The translated vector would be : 
	 * 
	 * @param translatedVector
	 * @param sourceTermEntry
	 * @param candidateTranslations
	 */
	private static  void fillTargetVectorSEquiRepartition(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set candidateTranslations) {
		/*
		 * Do the cross product of translation frequencies
		 */
		for(Term targetTerm:candidateTranslations) {
			int nbCooccs = sourceTermEntry.getNbCooccs()/candidateTranslations.size();
			translatedVector.addEntry(
					targetTerm, 
					nbCooccs, 
					sourceTermEntry.getAssocRate()/candidateTranslations.size());
		}
	}

	private static void fillTargetVectorWithMostProperty(
			ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry,
			Set candidateTranslations, final TermMeasure measure) {
		Term mostFrequent = null;
		double maxValue = -1d;
		
		for(Term t:candidateTranslations) {
			if(measure.getValue(t)>maxValue) {
				maxValue = t.getFrequency();
				mostFrequent = t;
			}
		}
		
		if(mostFrequent != null) 
			/*
			 * mostFrequent would be null if candidateTranslations is empty
			 */
			translatedVector.addEntry(mostFrequent, sourceTermEntry.getNbCooccs(), sourceTermEntry.getAssocRate());
	}


}