/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package fr.univnantes.termsuite.alignment;
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.MinMaxPriorityQueue;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.inject.Inject;
import fr.univnantes.termsuite.api.TermSuiteException;
import fr.univnantes.termsuite.engines.splitter.CompoundUtils;
import fr.univnantes.termsuite.framework.SourceLanguage;
import fr.univnantes.termsuite.framework.TargetLanguage;
import fr.univnantes.termsuite.framework.service.IndexService;
import fr.univnantes.termsuite.framework.service.RelationService;
import fr.univnantes.termsuite.framework.service.TermService;
import fr.univnantes.termsuite.framework.service.TerminologyService;
import fr.univnantes.termsuite.index.TermIndex;
import fr.univnantes.termsuite.index.TermIndexType;
import fr.univnantes.termsuite.index.providers.AllComponentPairsProvider;
import fr.univnantes.termsuite.metrics.Cosine;
import fr.univnantes.termsuite.metrics.ExplainedValue;
import fr.univnantes.termsuite.metrics.Levenshtein;
import fr.univnantes.termsuite.metrics.SimilarityDistance;
import fr.univnantes.termsuite.metrics.TextExplanation;
import fr.univnantes.termsuite.model.Component;
import fr.univnantes.termsuite.model.CompoundType;
import fr.univnantes.termsuite.model.ContextVector;
import fr.univnantes.termsuite.model.RelationType;
import fr.univnantes.termsuite.model.Term;
import fr.univnantes.termsuite.model.TermProperty;
import fr.univnantes.termsuite.model.Word;
import fr.univnantes.termsuite.resources.BilingualDictionary;
import fr.univnantes.termsuite.utils.Pair;
import fr.univnantes.termsuite.utils.StringUtils;
import fr.univnantes.termsuite.utils.TermUtils;
import fr.univnantes.termsuite.utils.WordUtils;
/**
 *
 * Aligns terms of a source terminology with candidate translations from a
 * target terminology, using dictionary-based, distributional, compositional,
 * and neoclassical alignment methods.
 *
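 * A minimal usage sketch ({@code injector} and {@code sourceTerm} are
 * illustrative names; the injector is assumed to be configured by the
 * TermSuite framework with source and target terminologies):
 * <pre>{@code
 * BilingualAlignmentService aligner = injector.getInstance(BilingualAlignmentService.class);
 * List<TranslationCandidate> candidates = aligner.align(sourceTerm, 10, 2);
 * }</pre>
 *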
* @author Damien Cram
*
*/
public class BilingualAlignmentService {
private static final Logger LOGGER = LoggerFactory.getLogger(BilingualAlignmentService.class);
private static final String MSG_TERM_NOT_NULL = "Source term must not be null";
static final String MSG_REQUIRES_SIZE_2_LEMMAS = "The term %s must have exactly two single-word terms (single-word terms: %s)";
private static final String MSG_SEVERAL_VECTORS_NOT_COMPUTED = "Several terms have no context vectors in target terminology (nb terms with vector: {}, nb terms without vector: {})";
@Inject
private BilingualDictionary dico;
@Inject
@SourceLanguage
private TerminologyService sourceTermino;
@Inject
@TargetLanguage
private TerminologyService targetTermino;
@Inject
@SourceLanguage
private IndexService sourceIndexes;
@Inject
@TargetLanguage
private IndexService targetIndexes;
@Inject(optional=true)
private SimilarityDistance distance = new Cosine();
@Inject(optional=true)
	private Map<Term, Term> manualDico = new HashMap<>();
public BilingualAlignmentService addTranslation(Term sourceTerm, Term targetTerm) {
Preconditions.checkNotNull(sourceTerm);
Preconditions.checkNotNull(targetTerm);
manualDico.put(sourceTerm, targetTerm);
return this;
}
	/**
	 * Adds a translation pair to the manual dictionary for every pair of source
	 * and target terms indexed under the given lemmas.
	 *
	 * @param sourceLemma
	 * 			the lemma of the source terms
	 * @param targetLemma
	 * 			the lemma of the target terms
	 * @return
	 * 			this {@link BilingualAlignmentService} instance
	 */
public BilingualAlignmentService addTranslation(String sourceLemma, String targetLemma) {
if(sourceIndexes.getIndex(TermIndexType.WORD_LEMMAS).getTerms(sourceLemma).isEmpty())
throw new TermSuiteException("No term found in source termino with lemma: " + sourceLemma);
else if(targetIndexes.getIndex(TermIndexType.WORD_LEMMAS).getTerms(targetLemma).isEmpty())
throw new TermSuiteException("No term found in target termino with lemma: " + targetLemma);
else {
for(Term sourceTerm:sourceIndexes.getIndex(TermIndexType.WORD_LEMMAS).getTerms(sourceLemma))
for(Term targetTerm:targetIndexes.getIndex(TermIndexType.WORD_LEMMAS).getTerms(targetLemma))
manualDico.put(sourceTerm, targetTerm);
}
return this;
}
/**
*
	 * Translates the source term with the help of the dictionary
	 * and computes the list of the nbCandidates closest candidate
	 * terms in the target terminology.
*
* sourceTerm's context vector must be computed and normalized,
* as well as all terms' context vectors in the target term index.
*
* @param sourceTerm
* the term to align with target term index
* @param nbCandidates
* the number of {@link TranslationCandidate} to return in the returned list
* @param minCandidateFrequency
* the minimum frequency of a target candidate
* @return
	 * 			A list of {@link TranslationCandidate}s sorted by distance in descending
	 * 			order. Each {@link TranslationCandidate} is a container for a target
	 * 			term index's term and its translation score.
*
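	 * <p>
	 * A minimal usage sketch ({@code aligner} and {@code sourceTerm} are illustrative):
	 * <pre>{@code
	 * // top 10 candidates among target terms occurring at least twice
	 * List<TranslationCandidate> candidates =
	 *     aligner.alignDicoThenDistributional(sourceTerm, 10, 2);
	 * }</pre>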
*/
	public List<TranslationCandidate> alignDicoThenDistributional(TermService sourceTerm, int nbCandidates, int minCandidateFrequency) {
		checkNotNull(sourceTerm);
		List<TranslationCandidate> dicoCandidates = Lists.newArrayList();
/*
* 1- find direct translation of the term in the dictionary
*/
dicoCandidates.addAll(sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, alignDico(sourceTerm, Integer.MAX_VALUE)));
applySpecificityBonus(dicoCandidates);
/*
* 2- align against all terms in the corpus
*/
		List<TranslationCandidate> alignedCandidateQueue = alignDistributional(sourceTerm, nbCandidates,
minCandidateFrequency);
/*
* 3- Merge candidates
*/
		List<TranslationCandidate> mergedCandidates = dicoCandidates;
mergedCandidates.addAll(alignedCandidateQueue);
Collections.sort(mergedCandidates);
/*
* 4- Sort, truncate, and normalize
*/
		List<TranslationCandidate> sortedTruncatedNormalized = sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, mergedCandidates);
		return sortedTruncatedNormalized;
}
public boolean canAlignNeoclassical(TermService sourceTerm) {
return sourceTerm.isCompound()
&& sourceTerm.getWords().get(0).getWord().getCompoundType() == CompoundType.NEOCLASSICAL;
}
	public List<TranslationCandidate> alignNeoclassical(Term sourceTerm, int nbCandidates,
int minCandidateFrequency) {
return alignNeoclassical(sourceTermino.asTermService(sourceTerm), nbCandidates, minCandidateFrequency);
}
	/**
	 *
	 * Aligns a term using TermSuite's neoclassical alignment method.
	 *
	 * This method behaves as follows: it first translates the source term's
	 * neoclassical affix with the dictionary, falling back to graphical equality
	 * of affixes; it then indexes the target termino's neoclassical compounds
	 * sharing that affix by their morphological extensions, aligns the source
	 * term's own extension recursively, and keeps the target compounds whose
	 * extension matches a translation of the source extension. If no candidate
	 * is found, it falls back to graphical alignment against the target terms
	 * sharing the affix.
	 *
* @param sourceTerm
* the source term to align
* @param nbCandidates
* the maximum number of {@link TranslationCandidate} returned
* @param minCandidateFrequency
* the minimum frequency of returned translation candidates
* @return
* the sorted list {@link TranslationCandidate} produced by this method
* or an empty list if the term could not be aligned using the neoclassical method.
*
	 * @see #canAlignNeoclassical(TermService)
* @see CompoundType#NEOCLASSICAL
*
*/
	public List<TranslationCandidate> alignNeoclassical(TermService sourceTerm, int nbCandidates,
int minCandidateFrequency) {
if(!canAlignNeoclassical(sourceTerm))
return Lists.newArrayList();
Word sourceWord = sourceTerm.getWords().get(0).getWord();
Component sourceNeoclassicalAffix = sourceWord.getNeoclassicalAffix();
String sourceNeoclassicalAffixString = WordUtils.getComponentSubstring(sourceWord, sourceNeoclassicalAffix);
/*
* 1. try to translate the neoclassical affix
*
* E.g. aéro (fr) -> aero (en)
*/
		Set<String> targetNeoclassicalAffixes = Sets.newHashSet();
// 1a. find translation in dico
targetNeoclassicalAffixes.addAll(dico.getTranslations(sourceNeoclassicalAffixString));
		// some dicos also append the hyphen to affixes
targetNeoclassicalAffixes.addAll(dico.getTranslations(sourceNeoclassicalAffixString+"-"));
// clean hyphens returned by dicos
targetNeoclassicalAffixes = targetNeoclassicalAffixes.stream()
.map(affix-> affix.replaceAll("^-", "").replaceAll("-$", ""))
.collect(Collectors.toSet());
/*
* 2. Index target candidates by morphological extensions when the extension
* is a valid swt in the target termino.
*
* E.g. électricité -> hydroélectricité
*/
		Map<Term, Term> targetCandidatesBySWTExtension = Maps.newHashMap();
		Set<TermService> targetCandidatesHavingSameAffix = Sets.newHashSet();
for(TermService targetCandidate:targetTermino.getTerms()) {
Word targetCompound = targetCandidate.getWords().get(0).getWord();
if(targetCandidate.isCompound() && targetCompound.getCompoundType() == CompoundType.NEOCLASSICAL) {
String targetNeoclassicalAffixString = WordUtils.getComponentSubstring(targetCompound, targetCompound.getNeoclassicalAffix());
boolean isValidTargetCandidate = false;
// Case1: we have translations from dico for neoclassical affix
if(!targetNeoclassicalAffixes.isEmpty())
isValidTargetCandidate = targetNeoclassicalAffixes.contains(targetNeoclassicalAffixString);
				// Case2: we don't, so we have to test validity on pure graphical equality
else
isValidTargetCandidate = StringUtils
.replaceAccents(targetNeoclassicalAffixString).toLowerCase()
.equals(StringUtils.replaceAccents(sourceNeoclassicalAffixString).toLowerCase());
if (isValidTargetCandidate) {
targetCandidatesHavingSameAffix.add(targetCandidate);
					Collection<Term> targetExtensions = getMorphologicalExtensionsAsTerms(
targetIndexes.getIndex(TermIndexType.LEMMA_LOWER_CASE),
targetCandidate,
targetCompound.getNeoclassicalAffix());
					for (Term morphologicalExtension : targetExtensions)
						targetCandidatesBySWTExtension.put(morphologicalExtension, targetCandidate.getTerm());
}
}
}
/*
* 3. try recursive alignment on neoclassical extensions
*/
		Collection<TermService> possibleSourceExtensions = getMorphologicalExtensionsAsTerms(
sourceIndexes.getIndex(TermIndexType.LEMMA_LOWER_CASE),
sourceTerm,
sourceNeoclassicalAffix)
.stream()
.map(t->sourceTermino.asTermService(t))
.collect(toSet());
		List<TranslationCandidate> candidates = Lists.newArrayList();
for(TermService sourceExtension:possibleSourceExtensions) {
// recursive alignment on extension
			List<TranslationCandidate> recursiveCandidates = alignSize2(sourceExtension, nbCandidates, minCandidateFrequency);
for(TranslationCandidate extensionTranslationCandidate:recursiveCandidates) {
if(targetCandidatesBySWTExtension.containsKey(extensionTranslationCandidate.getTerm()))
candidates.add(new TranslationCandidate(
AlignmentMethod.NEOCLASSICAL,
targetCandidatesBySWTExtension.get(extensionTranslationCandidate.getTerm()),
extensionTranslationCandidate.getScore(),
sourceTerm,
extensionTranslationCandidate));
}
}
// graphical alignment on extension if no candidate
if(candidates.isEmpty())
candidates.addAll(alignGraphically(AlignmentMethod.NEOCLASSICAL, sourceTerm, nbCandidates, targetCandidatesHavingSameAffix));
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, candidates);
}
	/**
	 * Gives the terms of the index that are morphological extensions of a compound,
	 * relative to one of its components.
	 *
	 * E.g. given the compound [hydro|électricité] and the component [hydro], the
	 * method should return the term [électricité].
	 *
	 * @param lemmaLowerCaseIndex
	 * 			the lemma-lower-case term index in which extensions are searched
	 * @param compound
	 * 			the compound term
	 * @param component
	 * 			the component of the compound to strip
	 * @return
	 * 			the terms matching the possible extension lemmas
	 */
	public Collection<Term> getMorphologicalExtensionsAsTerms(TermIndex lemmaLowerCaseIndex, TermService compound, Component component) {
Preconditions.checkArgument(compound.isSingleWord());
Preconditions.checkArgument(compound.isCompound());
Preconditions.checkArgument(compound.getWords().get(0).getWord().getComponents().contains(component));
Word compoundWord = compound.getWords().get(0).getWord();
		LinkedList<Component> extensionComponents = Lists.newLinkedList(compoundWord.getComponents());
extensionComponents.remove(component);
if(!(component.getBegin() == 0 || component.getEnd() == compound.getLemma().length()))
return Lists.newArrayList();
		Set<String> possibleExtensionLemmas = Sets.newHashSet();
possibleExtensionLemmas.add(compound.getLemma().substring(
extensionComponents.getFirst().getBegin(),
extensionComponents.getLast().getEnd()));
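		// when the extension spans several components, also try the variant where
		// the last component's surface substring is replaced by its lemma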
		if(extensionComponents.size() > 1) {
			LinkedList<Component> allButLast = Lists.newLinkedList(extensionComponents);
Component last = allButLast.removeLast();
String lemma = compound.getLemma().substring(allButLast.getFirst().getBegin(), last.getBegin())
+ last.getLemma();
possibleExtensionLemmas.add(lemma);
}
		List<Term> extensionTerms = Lists.newArrayList();
for(String s:possibleExtensionLemmas)
extensionTerms.addAll(lemmaLowerCaseIndex.getTerms(s.toLowerCase()));
return extensionTerms;
}
private static final Levenshtein LEVENSHTEIN = new Levenshtein();
	public List<TranslationCandidate> alignGraphically(AlignmentMethod method, TermService sourceTerm, int nbCandidates, Collection<TermService> targetTerms) {
Preconditions.checkArgument(sourceTerm.isSingleWord());
for(TermService targetTerm:targetTerms)
Preconditions.checkArgument(targetTerm.isSingleWord());
Word sourceWord = sourceTerm.getWords().get(0).getWord();
return targetTerms.stream().map(targetTerm ->
{
double dist;
Word targetWord = targetTerm.getWords().get(0).getWord();
if(sourceWord.getStem() != null
&& targetWord.getStem() != null)
dist = LEVENSHTEIN.computeNormalized(
TermUtils.stemmedInsensitiveGroupingKey(sourceTerm.getWords().get(0)),
TermUtils.stemmedInsensitiveGroupingKey(targetTerm.getWords().get(0)));
else
dist = LEVENSHTEIN.computeNormalized(
TermUtils.lemmatizedInsensitiveGroupingKey(sourceTerm.getWords().get(0)),
TermUtils.lemmatizedInsensitiveGroupingKey(targetTerm.getWords().get(0)));
return new TranslationCandidate(
method,
targetTerm.getTerm(),
dist,
sourceTerm.getTerm(),
new TextExplanation(String.format("Graphical distance(Levenshtein) is %.3f", dist)));
}
).collect(Collectors.toList());
}
	public List<TranslationCandidate> alignDistributional(TermService sourceTerm, int nbCandidates,
int minCandidateFrequency) {
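		// bounded min-max priority queue: retains at most nbCandidates candidates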
		Queue<TranslationCandidate> alignedCandidateQueue = MinMaxPriorityQueue.maximumSize(nbCandidates).create();
ContextVector sourceVector = sourceTerm.getContext();
if(sourceVector == null)
return new ArrayList<>();
ContextVector translatedSourceVector = translateVector(
sourceVector,
dico,
TRANSLATION_STRATEGY_MOST_SPECIFIC,
targetTermino);
ExplainedValue v;
int nbVectorsNotComputed = 0;
int nbVectorsComputed = 0;
		for(TermService targetTerm:targetTermino.terms().filter(TermService::isSingleWord).collect(Collectors.toList())) {
			if(targetTerm.getFrequency() < minCandidateFrequency)
				continue;
			if(targetTerm.getContext() != null) {
				nbVectorsComputed++;
				v = distance.getExplainedValue(translatedSourceVector, targetTerm.getContext());
				TranslationCandidate candidate = new TranslationCandidate(
						AlignmentMethod.DISTRIBUTIONAL,
						targetTerm.getTerm(),
						v.getValue(),
						sourceTerm.getTerm(),
						v.getExplanation());
				alignedCandidateQueue.add(candidate);
			} else
				nbVectorsNotComputed++;
		}
if(nbVectorsNotComputed > 0) {
LOGGER.warn(MSG_SEVERAL_VECTORS_NOT_COMPUTED, nbVectorsComputed, nbVectorsNotComputed);
}
		// sort alignedCandidates
		List<TranslationCandidate> alignedCandidates = Lists.newArrayListWithCapacity(alignedCandidateQueue.size());
		alignedCandidates.addAll(alignedCandidateQueue);
		normalizeCandidateScores(alignedCandidates);
		return alignedCandidates;
}
	public List<TranslationCandidate> align(Term sourceTerm, int nbCandidates, int minCandidateFrequency) {
return alignSwtTermList(
sourceTermino.asTermService(sourceTerm).getSwts().collect(toList()),
nbCandidates,
minCandidateFrequency,
true);
}
	private List<TranslationCandidate> alignSwtTermList(List<TermService> terms, int nbCandidates, int minCandidateFrequency, boolean allowDistributionalAlignment) {
Preconditions.checkArgument(!terms.isEmpty());
if(terms.size() == 1) {
return alignSize2(terms.get(0), nbCandidates, minCandidateFrequency, allowDistributionalAlignment);
} else if(terms.size() == 2) {
String indexingKey = TermUtils.getLemmaLemmaKey(terms.get(0).getTerm(), terms.get(1).getTerm());
			Optional<Term> recursiveTerm = sourceIndexes.getIndex(TermIndexType.ALLCOMP_PAIRS)
.getTerms(indexingKey)
.stream()
.filter(t -> t.getSwtSize() == 2)
.max(TermProperty.FREQUENCY.getComparator(false));
if(recursiveTerm.isPresent())
return alignSize2(
sourceTermino.asTermService(recursiveTerm.get()),
nbCandidates,
minCandidateFrequency,
allowDistributionalAlignment);
else
return Lists.newArrayList();
} else {
			Collection<TranslationCandidate> combinedCandidates = Lists.newArrayList();
/*
* Cut the swt list in two lists
*/
for(int i=1; i<=terms.size()-1;i++) {
// cut at index i
				List<TermService> swtTermList1 = terms.subList(0, i);
				List<TermService> swtTermList2 = terms.subList(i, terms.size());
				List<TranslationCandidate> candidates1 = alignSwtTermList(swtTermList1, nbCandidates, minCandidateFrequency, allowDistributionalAlignment);
if(!candidates1.isEmpty()) {
/*
					 * do not allow distributional alignment again if it has already been used for candidates1.
*/
boolean candidates1Distributional = candidates1.get(0).getMethod() == AlignmentMethod.DISTRIBUTIONAL
|| candidates1.get(0).getMethod() == AlignmentMethod.SEMI_DISTRIBUTIONAL;
					List<TranslationCandidate> candidates2 = alignSwtTermList(
swtTermList2,
nbCandidates,
minCandidateFrequency,
allowDistributionalAlignment && !candidates1Distributional
);
combinedCandidates.addAll(combineMWTCandidates(candidates1, candidates2, terms));
}
}
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, combinedCandidates);
}
}
	public List<TranslationCandidate> alignSize2(Term sourceTerm, int nbCandidates, int minCandidateFrequency) {
return alignSize2(sourceTermino.asTermService(sourceTerm), nbCandidates, minCandidateFrequency);
}
	/**
	 * Alias for {@link #alignSize2(TermService, int, int, boolean)} with
	 * allowDistributionalAlignment set to true.
	 *
* @param sourceTerm
* @param nbCandidates
* @param minCandidateFrequency
* @return
*/
	public List<TranslationCandidate> alignSize2(TermService sourceTerm, int nbCandidates, int minCandidateFrequency) {
return alignSize2(sourceTerm, nbCandidates, minCandidateFrequency, true);
}
private static final String ERR_MSG_BAD_SOURCE_LEMMA_SET_SIZE = "Unexpected size for a source lemma set: %s. Expected size: 2";
	/**
	 * Aligns a source term of swt size 1 or 2, using dictionary and compositional
	 * methods, plus distributional methods when allowDistributionalAlignment is true.
	 *
* @param sourceTerm
* @param nbCandidates
* @param minCandidateFrequency
* @param allowDistributionalAlignment
* @return
*/
	public List<TranslationCandidate> alignSize2(TermService sourceTerm, int nbCandidates, int minCandidateFrequency, boolean allowDistributionalAlignment) {
		Preconditions.checkNotNull(sourceTerm);
		List<TranslationCandidate> mergedCandidates = Lists.newArrayList();
		List<List<TermService>> sourceLemmaSets = getSourceSingleLemmaTerms(sourceTerm);
		for(List<TermService> sourceLemmaSet:sourceLemmaSets) {
Preconditions.checkState(sourceLemmaSet.size() == 1 || sourceLemmaSet.size() == 2,
ERR_MSG_BAD_SOURCE_LEMMA_SET_SIZE, sourceLemmaSet);
if(sourceLemmaSet.size() == 1) {
if(allowDistributionalAlignment)
mergedCandidates.addAll(alignDicoThenDistributional(sourceLemmaSet.get(0), 3*nbCandidates, minCandidateFrequency));
else
mergedCandidates.addAll(alignDico(sourceLemmaSet.get(0), 3*nbCandidates));
} else if(sourceLemmaSet.size() == 2) {
				List<TranslationCandidate> compositional = Lists.newArrayList();
try {
compositional.addAll(alignCompositionalSize2(sourceLemmaSet.get(0), sourceLemmaSet.get(1), nbCandidates, minCandidateFrequency, sourceTerm));
} catch(RequiresSize2Exception e) {
// Do nothing
}
mergedCandidates.addAll(compositional);
if(mergedCandidates.isEmpty() && allowDistributionalAlignment) {
					List<TranslationCandidate> semiDist = Lists.newArrayList();
try {
semiDist = alignSemiDistributionalSize2Syntagmatic(
sourceLemmaSet.get(0),
sourceLemmaSet.get(1),
nbCandidates,
minCandidateFrequency,
sourceTerm);
} catch(RequiresSize2Exception e) {
// Do nothing
}
mergedCandidates.addAll(semiDist);
}
}
}
removeDuplicatesOnTerm(mergedCandidates);
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, mergedCandidates);
}
	private List<TranslationCandidate> sortTruncateNormalizeAndMerge(TerminologyService termino, int nbCandidates, Collection<TranslationCandidate> candidatesCandidates) {
		List<TranslationCandidate> list = Lists.newArrayList();
/*
* 1. Merge
*/
		Multimap<Term, TranslationCandidate> multimap = HashMultimap.create();
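		// group candidates by target term; for a term reached by several methods, keep only the best candidate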
candidatesCandidates.stream().forEach(tc -> multimap.put(tc.getTerm(), tc));
multimap.keySet().stream().forEach(uniqueTerm -> {
if(multimap.get(uniqueTerm).size() >= 2) {
				List<TranslationCandidate> termCandidates = Lists.newArrayList(multimap.get(uniqueTerm));
Collections.sort(termCandidates);
list.add(termCandidates.get(0));
} else {
list.add(multimap.get(uniqueTerm).iterator().next());
}
});
Collections.sort(list);
// set rank
for(int i = 0; i < list.size(); i++)
list.get(i).setRank(i+1);
		List<TranslationCandidate> finalCandidates = list.subList(0, Ints.min(nbCandidates, list.size()));
normalizeCandidateScores(finalCandidates);
return finalCandidates;
}
	/*
	 * Apply the specificity bonus to candidate scores
	 */
	private void applySpecificityBonus(List<TranslationCandidate> list) {
		for (TranslationCandidate c : list) {
			double wr = c.getTerm().getSpecificity();
			c.setScore(c.getScore() * getSpecificityBonusFactor(wr));
		}
	}
private double getSpecificityBonusFactor(double specificity) {
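		// currently the identity: candidate scores are scaled linearly by specificity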
return specificity;
}
	public List<TranslationCandidate> alignDico(TermService sourceTerm, int nbCandidates) {
		List<TranslationCandidate> dicoCandidates = Lists.newArrayList();
		if(manualDico.containsKey(sourceTerm.getTerm())) {
			return Lists.newArrayList(
					new TranslationCandidate(
						AlignmentMethod.DICTIONARY,
						manualDico.get(sourceTerm.getTerm()),
						1,
						sourceTerm)
				);
} else {
if(sourceTerm.getContext() != null) {
ContextVector translatedSourceVector = translateVector(
sourceTerm.getContext(),
dico,
TRANSLATION_STRATEGY_MOST_SPECIFIC,
targetTermino);
for(String candidateLemma:dico.getTranslations(sourceTerm.getLemma())) {
				List<Term> terms = targetIndexes.getIndex(TermIndexType.LEMMA_LOWER_CASE).getTerms(candidateLemma);
for (Term candidateTerm : terms) {
if (candidateTerm.getContext() != null) {
TranslationCandidate candidate = new TranslationCandidate(
AlignmentMethod.DICTIONARY,
candidateTerm,
distance.getValue(translatedSourceVector, candidateTerm.getContext()),
sourceTerm);
dicoCandidates.add(candidate);
}
}
}
}
return dicoCandidates;
}
}
public boolean canAlignCompositional(TermService sourceTerm) {
return getSourceSingleLemmaTerms(sourceTerm)
.stream()
.anyMatch(slTerms -> slTerms.size() == 2);
}
	public List<TranslationCandidate> alignCompositional(TermService sourceTerm, int nbCandidates, int minCandidateFrequency) {
		Preconditions.checkArgument(canAlignCompositional(sourceTerm), "Cannot align <%s> with compositional method", sourceTerm);
		List<List<TermService>> singleLemmaTermSets = getSourceSingleLemmaTerms(sourceTerm);
		List<TranslationCandidate> candidates = Lists.newArrayList();
		for(List<TermService> singleLemmaTerms:singleLemmaTermSets) {
if(singleLemmaTerms.size() == 2) {
candidates.addAll(alignCompositionalSize2(
singleLemmaTerms.get(0),
singleLemmaTerms.get(1),
nbCandidates,
minCandidateFrequency,
sourceTerm));
}
}
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, candidates);
}
public boolean canAlignSemiDistributional(TermService sourceTerm) {
return getSourceSingleLemmaTerms(sourceTerm)
.stream()
.anyMatch(slTerms -> slTerms.size() == 2);
}
	public List<TranslationCandidate> alignSemiDistributional(TermService sourceTerm, int nbCandidates, int minCandidateFrequency) {
		Preconditions.checkArgument(canAlignSemiDistributional(sourceTerm), "Cannot align <%s> with semi-distributional method", sourceTerm);
		List<List<TermService>> singleLemmaTermSets = getSourceSingleLemmaTerms(sourceTerm);
		List<TranslationCandidate> candidates = Lists.newArrayList();
		for(List<TermService> singleLemmaTerms:singleLemmaTermSets) {
if(singleLemmaTerms.size() == 2) {
candidates.addAll(alignSemiDistributionalSize2Syntagmatic(
singleLemmaTerms.get(0),
singleLemmaTerms.get(1),
nbCandidates,
minCandidateFrequency,
sourceTerm));
}
}
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, candidates);
}
	public List<TranslationCandidate> alignCompositionalSize2(TermService lemmaTerm1, TermService lemmaTerm2, int nbCandidates, int minCandidateFrequency, TermService sourceTerm) {
		List<TranslationCandidate> candidates = Lists.newArrayList();
		List<TranslationCandidate> dicoCandidates1 = alignDico(lemmaTerm1, Integer.MAX_VALUE);
		List<TranslationCandidate> dicoCandidates2 = alignDico(lemmaTerm2, Integer.MAX_VALUE);
candidates.addAll(combineSWTCandidates(dicoCandidates1, dicoCandidates2, sourceTerm));
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, candidates);
}
private AllComponentPairsProvider allComponentPairsProvider = new AllComponentPairsProvider();
	/**
	 * Joins two lists of swt candidates and uses the specificities (wrLog)
	 * of the combined terms as the candidate scores.
	 *
	 * FIXME Bad way of scoring candidates. They should be scored by similarity of context vectors with the source context vector
	 *
	 * @param candidates1
	 * @param candidates2
	 * @return
	 */
	private Collection<TranslationCandidate> combineSWTCandidates(Collection<TranslationCandidate> candidates1,
			Collection<TranslationCandidate> candidates2, Object sourceTerm) {
		Collection<TranslationCandidate> combination = Sets.newHashSet();
for(TranslationCandidate candidate1:candidates1) {
for(TranslationCandidate candidate2:candidates2) {
/*
* 1- create candidate combine terms
*/
String key1 = candidate1.getTerm().getLemma() + "+" + candidate2.getTerm().getLemma();
				List<Term> candidateCombinedTerms = Lists.newArrayList(targetIndexes.getIndex(TermIndexType.ALLCOMP_PAIRS).getTerms(key1));
String key2 = candidate2.getTerm().getLemma() + "+" + candidate1.getTerm().getLemma();
candidateCombinedTerms.addAll(targetIndexes.getIndex(TermIndexType.ALLCOMP_PAIRS).getTerms(key2));
if(candidateCombinedTerms.isEmpty())
continue;
/*
* 2- Avoids retrieving too long terms by keeping the ones that have
* the lowest number of lemma+lemma keys.
*/
				final Map<Term, Collection<String>> termLemmaLemmaKeys = Maps.newHashMap();
for(Term t:candidateCombinedTerms)
termLemmaLemmaKeys.put(t, allComponentPairsProvider.getClasses(t));
				Collections.sort(candidateCombinedTerms, new Comparator<Term>() {
@Override
public int compare(Term o1, Term o2) {
return Integer.compare(termLemmaLemmaKeys.get(o1).size(), termLemmaLemmaKeys.get(o2).size());
}
});
				List<Term> filteredTerms = Lists.newArrayList();
int minimumNbClasses = termLemmaLemmaKeys.get(candidateCombinedTerms.get(0)).size();
for(Term t:candidateCombinedTerms) {
if(termLemmaLemmaKeys.get(t).size() == minimumNbClasses)
filteredTerms.add(t);
else
break;
}
/*
* 3- Create candidates from filtered terms
*/
for(Term t:filteredTerms) {
TranslationCandidate combinedCandidate = new TranslationCandidate(
getCombinedMethod(candidate1, candidate2),
t,
t.getSpecificity(), // TODO Not by specificity, by distribution !!
sourceTerm,
candidate1, candidate2
);
					combinedCandidate.setExplanation(new TextExplanation(String.format("Specificity: %.1f", t.getSpecificity())));
combination.add(combinedCandidate);
}
}
}
return combination;
}
	private Collection<TranslationCandidate> combineMWTCandidates(Collection<TranslationCandidate> candidates1,
			Collection<TranslationCandidate> candidates2, Object sourceTerm) {
		ensureHasExtensionRelationsComputed(targetTermino);
		Collection<TranslationCandidate> combinations = Sets.newHashSet();
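		// a combined candidate is a target term that extends both candidates' translations;
		// among the common extensions, only the shortest ones (fewest words) are kept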
for(TranslationCandidate candidate1:candidates1) {
			Collection<TermService> extensions1 = targetTermino.extensions(candidate1.getTerm())
.map(RelationService::getTo)
.collect(toSet());
for(TranslationCandidate candidate2:candidates2) {
				Set<TermService> commonExtensions = targetTermino.extensions(candidate2.getTerm())
.map(RelationService::getTo)
.filter(ext-> extensions1.contains(ext))
.collect(Collectors.toSet());
				Optional<Integer> minSize = commonExtensions.stream().map(t->t.getWords().size()).sorted().findFirst();
if(minSize.isPresent()) {
commonExtensions.stream().filter(t->t.getWords().size() == minSize.get()).forEach(targetTerm-> {
combinations.add(new TranslationCandidate(
AlignmentMethod.COMPOSITIONAL,
targetTerm.getTerm(),
candidate1.getScore()*candidate2.getScore(),
sourceTerm,
candidate1, candidate2
));
});
}
}
}
return combinations;
}
private boolean ensuredExtensionsAreComputed = false;
	private void ensureHasExtensionRelationsComputed(TerminologyService termino) {
if(!ensuredExtensionsAreComputed) {
if(!termino.extensions().findAny().isPresent())
throw new IllegalStateException(String.format("No %s relation found in termino %s", RelationType.HAS_EXTENSION, termino));
ensuredExtensionsAreComputed = true;
}
}
private AlignmentMethod getCombinedMethod(TranslationCandidate candidate1, TranslationCandidate candidate2) {
if(candidate1.getMethod() == AlignmentMethod.DISTRIBUTIONAL || candidate1.getMethod() == AlignmentMethod.SEMI_DISTRIBUTIONAL)
return AlignmentMethod.SEMI_DISTRIBUTIONAL;
else if(candidate2.getMethod() == AlignmentMethod.DISTRIBUTIONAL || candidate2.getMethod() == AlignmentMethod.SEMI_DISTRIBUTIONAL)
return AlignmentMethod.SEMI_DISTRIBUTIONAL;
else
return AlignmentMethod.COMPOSITIONAL;
}
private void checkNotNull(TermService sourceTerm) {
Preconditions.checkNotNull(sourceTerm, MSG_TERM_NOT_NULL);
}
	public List<TranslationCandidate> alignSemiDistributionalSize2Syntagmatic(TermService lemmaTerm1, TermService lemmaTerm2, int nbCandidates, int minCandidateFrequency, TermService sourceTerm) {
		List<TranslationCandidate> candidates = Lists.newArrayList();
		Collection<? extends TranslationCandidate> t1 = semiDistributional(lemmaTerm1, lemmaTerm2, sourceTerm);
		candidates.addAll(t1);
		Collection<? extends TranslationCandidate> t2 = semiDistributional(lemmaTerm2, lemmaTerm1, sourceTerm);
		candidates.addAll(t2);
removeDuplicatesOnTerm(candidates);
return sortTruncateNormalizeAndMerge(targetTermino, nbCandidates, candidates);
}
	private void removeDuplicatesOnTerm(List<TranslationCandidate> candidates) {
		Set<Term> set = Sets.newHashSet();
		Iterator<TranslationCandidate> it = candidates.iterator();
while(it.hasNext())
if(!set.add(it.next().getTerm()))
it.remove();
}
	private Collection<? extends TranslationCandidate> semiDistributional(TermService dicoTerm, TermService vectorTerm, TermService sourceTerm) {
		List<TranslationCandidate> candidates = Lists.newArrayList();
		List<TranslationCandidate> dicoCandidates = alignDico(dicoTerm, Integer.MAX_VALUE);
if(dicoCandidates.isEmpty())
// Optimisation: no need to align since there is no possible combination
return candidates;
else {
			List<TranslationCandidate> vectorCandidates = alignDicoThenDistributional(vectorTerm, Integer.MAX_VALUE, 1);
return combineSWTCandidates(dicoCandidates, vectorCandidates, sourceTerm);
}
}
	private void normalizeCandidateScores(List<TranslationCandidate> candidates) {
double sum = 0;
for(TranslationCandidate cand:candidates)
sum+= cand.getScore();
if(sum > 0d)
for(TranslationCandidate cand:candidates)
cand.setScore(cand.getScore()/sum);
}
public BilingualDictionary getDico() {
return this.dico;
}
public static final int TRANSLATION_STRATEGY_PRORATA = 1;
public static final int TRANSLATION_STRATEGY_MOST_FREQUENT = 2;
public static final int TRANSLATION_STRATEGY_MOST_SPECIFIC = 3;
private static final int TRANSLATION_STRATEGY_EQUI_REPARTITION = 4;
	/**
	 *
	 * Translates all {@link ContextVector} components (i.e. its coTerms) into
	 * the target language of this aligner by means of one of the available
	 * strategies:
	 * - {@link #TRANSLATION_STRATEGY_MOST_FREQUENT}
	 * - {@link #TRANSLATION_STRATEGY_PRORATA}
	 * - {@link #TRANSLATION_STRATEGY_EQUI_REPARTITION}
	 * - {@link #TRANSLATION_STRATEGY_MOST_SPECIFIC}
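	 *
	 * A minimal usage sketch ({@code aligner} and {@code targetTermino} are
	 * illustrative names):
	 * <pre>{@code
	 * ContextVector targetVector = aligner.translateVector(
	 *     sourceTerm.getContext(), aligner.getDico(),
	 *     BilingualAlignmentService.TRANSLATION_STRATEGY_MOST_SPECIFIC,
	 *     targetTermino);
	 * }</pre>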
*
* @see BilingualDictionary
* @param sourceVector
* The source context vector object to be translated into target language
* @param dictionary
* The dico used in the translation process
	 * @param translationStrategy
	 * 			The translation strategy for the sourceVector.
	 * 			Possible values: {@link #TRANSLATION_STRATEGY_MOST_FREQUENT},
	 * 			{@link #TRANSLATION_STRATEGY_PRORATA},
	 * 			{@link #TRANSLATION_STRATEGY_EQUI_REPARTITION},
	 * 			{@link #TRANSLATION_STRATEGY_MOST_SPECIFIC}
* @return
* The translated context vector
*/
public ContextVector translateVector(ContextVector sourceVector,
BilingualDictionary dictionary, int translationStrategy, TerminologyService targetTermino) {
ContextVector targetVector = new ContextVector();
for(ContextVector.Entry entry:sourceVector.getEntries()) {
			Set<Term> translations = Sets.newHashSet();
for(String targetLemma:dictionary.getTranslations(entry.getCoTerm().getLemma())) {
				Collection<Term> translatedTerms = targetIndexes.getIndex(TermIndexType.SWT_LEMMAS_SWT_TERMS_ONLY).getTerms(targetLemma);
if(!translatedTerms.isEmpty())
translations.add(translatedTerms.iterator().next());
}
switch (translationStrategy) {
case TRANSLATION_STRATEGY_PRORATA:
fillTargetVectorSProrata(targetVector, entry, translations);
break;
case TRANSLATION_STRATEGY_MOST_FREQUENT:
fillTargetVectorSMost(targetVector, entry, translations, TermProperty.FREQUENCY);
break;
case TRANSLATION_STRATEGY_MOST_SPECIFIC:
fillTargetVectorSMost(targetVector, entry, translations, TermProperty.SPECIFICITY);
break;
case TRANSLATION_STRATEGY_EQUI_REPARTITION:
fillTargetVectorSEquiRepartition(targetVector, entry, translations);
break;
default:
throw new IllegalArgumentException("Invalid translation strategy: " + translationStrategy);
}
}
return targetVector;
}
/**
* This method implements the strategy {@link #TRANSLATION_STRATEGY_PRORATA}
* for context vector translation.
*
* Explanation of strategy:
*
	 * Example of source term in French: chat, with source context
	 * vector (noir: 10, chien: 3)
	 *
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 *
	 * Suppose that frequencies in the target term index are:
	 * - black: 35
	 * - dark: 15
	 * - dog: 7
	 *
	 * The translated vector would be (black: 7, dark: 3, dog: 3)
	 *
	 * because:
	 * - total frequency in the target term index for the translations of "noir" is 35 + 15 = 50,
	 *   and 7 = ( 35 / 50 ) * 10 for "black"
	 *   and 3 = ( 15 / 50 ) * 10 for "dark"
	 * - total frequency in the target term index for "chien"'s translation "dog" is 7,
	 *   and 3 = ( 7 / 7 ) * 3
*
*
	 * @param translatedVector
	 * 			the target vector to be filled
	 * @param sourceTermEntry
	 * 			the source vector's component to be translated and added to the target vector
* @param candidateTranslations
* the candidate translations of the sourceTermEntry given by the
* bilingual dictionary.
*/
	private static void fillTargetVectorSProrata(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set<Term> candidateTranslations) {
		/*
		 * Distribute the source entry's cooccurrences prorata of the candidate translation frequencies
		 */
int totalFreqInTargetTermino = 0;
for(Term tt : candidateTranslations)
totalFreqInTargetTermino += tt.getFrequency();
for(Term targetTerm:candidateTranslations) {
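			// integer prorata, e.g. 7 = 35 * 10 / 50 cooccurrences for "black" in the javadoc example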
int prorataCooccs = targetTerm.getFrequency() * sourceTermEntry.getNbCooccs() / totalFreqInTargetTermino;
translatedVector.addEntry(targetTerm, prorataCooccs, sourceTermEntry.getAssocRate());
}
}
/**
	 * This method implements the {@link #TRANSLATION_STRATEGY_MOST_FREQUENT}
	 * strategy for context vector translation.
	 *
	 *
	 * Explanation of strategy:
	 *
	 * Example of source term in French: chat, with source context
	 * vector (noir: 10, chien: 3)
	 *
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 *
	 * Suppose that frequencies in the target term index are:
	 * - black: 35
	 * - dark: 15
	 * - dog: 7
	 *
	 * The translated vector would be (black: 10, dog: 3), since the most
	 * frequent candidate translation receives all of the source entry's cooccurrences.
*
	 * @param translatedVector
	 * 			the target vector to be filled
	 * @param sourceTermEntry
	 * 			the source vector's component to be translated and added to the target vector
	 * @param candidateTranslations
	 * 			the candidate translations of the sourceTermEntry given by the
	 * 			bilingual dictionary.
	 * @param termProperty
	 * 			the numeric term property used to select the best candidate
	 */
	private static void fillTargetVectorSMost(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set<Term> candidateTranslations, TermProperty termProperty) {
fillTargetVectorWithMostProperty(translatedVector, sourceTermEntry,
candidateTranslations, termProperty);
}
	/**
	 * This method implements the {@link #TRANSLATION_STRATEGY_EQUI_REPARTITION}
	 * strategy for context vector translation.
	 *
	 * Explanation of strategy:
	 *
	 * Example of source term in French: chat, with source context
	 * vector (noir: 10, chien: 3)
	 *
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 *
	 * The translated vector would be (black: 5, dark: 5, dog: 3), since each
	 * source entry's cooccurrences are split evenly between its candidate translations.
	 *
* @param translatedVector
* @param sourceTermEntry
* @param candidateTranslations
*/
	private static void fillTargetVectorSEquiRepartition(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set<Term> candidateTranslations) {
		/*
		 * Split each source entry evenly between its candidate translations
		 */
for(Term targetTerm:candidateTranslations) {
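			// integer division: each candidate translation gets an equal share of the cooccurrence count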
int nbCooccs = sourceTermEntry.getNbCooccs()/candidateTranslations.size();
translatedVector.addEntry(
targetTerm,
nbCooccs,
sourceTermEntry.getAssocRate()/candidateTranslations.size());
}
}
	private static void fillTargetVectorWithMostProperty(
			ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry,
			Set<Term> candidateTranslations, final TermProperty termProperty) {
		Preconditions.checkArgument(termProperty.isNumeric());
		/*
		 * Pick the candidate with the highest value for termProperty
		 * (absent when candidateTranslations is empty).
		 */
		Optional<Term> best = candidateTranslations.stream()
				.max(termProperty.getComparator(false));
		if(best.isPresent())
			translatedVector.addEntry(best.get(), sourceTermEntry.getNbCooccs(), sourceTermEntry.getAssocRate());
}
	/**
	 *
	 * Gives the list of all possible single-lemma-term decompositions of a complex term.
	 *
	 * @param term
	 * 			the term to decompose
	 * @return
	 * 			the list of decompositions, each one being a list of one or two swt terms
	 */
	public List<List<TermService>> getSourceSingleLemmaTerms(TermService term) {
		List<TermService> swtTerms = term.getSwts().collect(toList());
		List<List<TermService>> lemmaSets = Lists.newArrayList();
if(swtTerms.size() == 1) {
if(term.getWords().size() > 1) {
LOGGER.warn("Could not apply single lemma term decomposition for term {}. Expected at least two inner swt terms, but got {}", term, swtTerms);
return Lists.newArrayList();
}
// sourceTerm is swtTerms.get(0);
if(term.isCompound()) {
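			// a compound swt decomposes into the term itself plus every contiguous two-way split of its components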
lemmaSets.add(Lists.newArrayList(term));
			for(Pair<Component> pair:CompoundUtils.innerContiguousComponentPairs(term.getWords().get(0).getWord())) {
				for(Term swt1:getSwtSetFromComponent(sourceIndexes.getIndex(TermIndexType.LEMMA_LOWER_CASE), pair.getElement1())) {
					for(Term swt2:getSwtSetFromComponent(sourceIndexes.getIndex(TermIndexType.LEMMA_LOWER_CASE), pair.getElement2())) {
						Pair<Term> pair2 = new Pair<>(swt1, swt2);
lemmaSets.add(pair2.toList().stream()
.map(t->sourceTermino.asTermService(t))
.collect(toList()));
}
}
}
} else {
lemmaSets.add(Lists.newArrayList(term));
}
} else {
if(swtTerms.size() == 2) {
lemmaSets.add(swtTerms);
} else
throw new RequiresSize2Exception(term.getTerm(), swtTerms.stream().map(TermService::getTerm).collect(toList()));
}
return lemmaSets;
}
	public static Set<Term> getSwtSetFromComponent(TermIndex lemmaLowerCaseIndex, Component c) {
		Set<Term> terms = new HashSet<>();
terms.addAll(lemmaLowerCaseIndex.getTerms(c.getLemma()));
terms.addAll(lemmaLowerCaseIndex.getTerms(c.getSubstring()));
return terms;
}
}