eu.project.ttc.utils.TermUtils Maven / Gradle / Ivy

Go to download
/*******************************************************************************
 * Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.utils;

import java.io.PrintStream;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import com.google.common.base.Optional;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.engines.desc.TermSuiteResourceException;
import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.LemmaStemHolder;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.TermWord;
import eu.project.ttc.models.index.CustomTermIndex;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.models.index.TermMeasure;
import eu.project.ttc.models.index.TermValueProviders;
import eu.project.ttc.resources.GeneralLanguageResource;
import eu.project.ttc.tools.TermSuiteResourceHelper;

public class TermUtils {

	// String.format template: first placeholder is the term that is not an extension,
	// second placeholder is the term it was expected to extend.
	private static final String MSG_NOT_AN_EXTENSION = "Term '%s' is no extension of term '%s'";
	// String.format template: first placeholder is the contained term,
	// second placeholder is the term that contains it.
	private static final String MSG_NOT_AN_AFFIX = "Term '%s' is contained into term '%s', but not an affix.";


	/**
	 * Orders terms by descending frequency, i.e. the most frequent term
	 * comes first. Two terms with the same frequency compare as equal.
	 */
	public static Comparator<Term> frequencyComparator = new Comparator<Term>() {
		@Override
		public int compare(Term o1, Term o2) {
			// Operands are swapped on purpose to obtain a descending order.
			return ComparisonChain.start()
					.compare(o2.getFrequency(), o1.getFrequency())
					.result();
		}
	};
	
	/**
	 * Builds a {@link TermFormGetter} for a term index.
	 * 
	 * @param termIndex
	 * 			the term index handed to the {@link TermFormGetter} constructor
	 * @param downcaseForms
	 * 			the flag handed to the {@link TermFormGetter} constructor
	 * @return
	 * 			a newly created {@link TermFormGetter}
	 */
	public static TermFormGetter formGetter(TermIndex termIndex, boolean downcaseForms) {
		TermFormGetter getter = new TermFormGetter(termIndex, downcaseForms);
		return getter;
	}
	
	/**
	 * Prints the terms of an index to a stream without filtering any of them.
	 * Delegates to {@link #showIndex(TermIndex, PrintStream, Optional)} with
	 * an absent watch expression.
	 * 
	 * @param index
	 * 			the term index to print
	 * @param stream
	 * 			the stream to print to
	 */
	public static void showIndex(TermIndex index, PrintStream stream) {
		Optional<Pattern> watchExpression = Optional.absent();
		showIndex(index, stream, watchExpression);
	}
		
	/**
	 * Prints the terms of an index to a stream, each one followed by the
	 * grouping keys of its variations (one indented line per variation).
	 * 
	 * @param index
	 * 			the term index to print
	 * @param stream
	 * 			the stream to print to
	 * @param watchExpression
	 * 			when present, only the terms whose grouping key contains a
	 * 			match of this pattern are printed; when absent, every term
	 * 			is printed.
	 */
	public static void showIndex(TermIndex index, PrintStream stream, Optional<Pattern> watchExpression) {
		for (Term term : index.getTerms()) {
			boolean watched = !watchExpression.isPresent()
					|| watchExpression.get().matcher(term.getGroupingKey()).find();
			if (!watched) {
				continue;
			}
			stream.println(term);
			for (TermVariation variation : term.getVariations()) {
				stream.format("\tsyntactic: %s\n" , variation.getVariant().getGroupingKey());
			}
		}
	}

	/**
	 * Prints the first <code>n</code> terms of an index, once sorted with the
	 * comparator supplied by a measure.
	 * 
	 * @param index
	 * 			the term index whose terms are printed
	 * @param measure
	 * 			the measure providing the sort order, through
	 * 			<code>measure.getTermComparator(true)</code>
	 * 			(NOTE(review): the flag presumably requests a descending
	 * 			order - confirm against TermMeasure)
	 * @param out
	 * 			the stream to print to
	 * @param n
	 * 			the maximum number of terms to print; nothing is printed
	 * 			when it is zero or negative
	 */
	public static void showTopNTermsBy(TermIndex index, TermMeasure measure, PrintStream out, int n) {
		List<Term> terms = Lists.newArrayList(index.getTerms());
		Collections.sort(terms, measure.getTermComparator(true));
		// Clamp to [0, size] so that exactly min(n, size) terms are printed.
		int limit = Math.max(0, Math.min(n, terms.size()));
		for (Term t : terms.subList(0, limit)) {
			out.println(t);
		}
	}

	/**
	 * Prints the compound terms of an index, ordered with
	 * {@link #frequencyComparator}.
	 * 
	 * @param index
	 * 			the term index to scan
	 * @param out
	 * 			the stream to print to
	 * @param threshhold
	 * 			the minimum frequency (inclusive) a compound must have
	 * 			to be printed
	 */
	public static void showCompounds(TermIndex index, PrintStream out, int threshhold) {
		List<Term> compounds = Lists.newArrayList();
		for (Term term : index.getTerms()) {
			if (term.isCompound() && term.getFrequency() >= threshhold) {
				compounds.add(term);
			}
		}
		Collections.sort(compounds, frequencyComparator);
		for (Term compound : compounds) {
			out.println(compound);
		}
	}
	
	/**
	 * 
	 * Finds in an input term all single-word terms it is made of. 
	 * If the input term has compounds, this method will iterate 
	 * over each compound and try to find a matching swt for each compound.
	 * 
	 * This method creates an index on TermIndex based on key
	 * {@link TermIndexes#SINGLE_WORD_LEMMA}.
	 * 
	 * @param termIndex
	 * 			The {@link TermIndex} in which single word terms must be found.
	 * @param term
	 * 			The input term.
	 * @param compoundLevel
	 * 			The compoundLevel param passed to {@link Term#asComponentIterator(boolean)}.
	 * @return
	 * 			The list of single word terms, in iteration order. A component
	 * 			whose lemma has no entry in the index contributes nothing.
	 * 
	 * @see Term#asComponentIterator(boolean)
	 */
	public static List<Term> getSingleWordTerms(TermIndex termIndex, Term term, boolean compoundLevel) {
		CustomTermIndex index = termIndex.createCustomIndex(
				TermIndexes.SINGLE_WORD_LEMMA,
				TermValueProviders.get(TermIndexes.SINGLE_WORD_LEMMA));
		List<Term> terms = Lists.newArrayList();
		Iterator<?> it = term.asComponentIterator(compoundLevel);
		while (it.hasNext()) {
			LemmaStemHolder lemmaStemHolder = (LemmaStemHolder) it.next();
			List<Term> swtTerms = index.getTerms(lemmaStemHolder.getLemma());
			if (!swtTerms.isEmpty()) {
				// Only the first term listed by the index for this lemma is kept.
				terms.add(swtTerms.get(0));
			}
		}
		return terms;
	}


	/*
	 * NOTE(review): this span is corrupted in this copy of the file and does
	 * not compile. The loop header of collapseText is cut right after
	 * "for(int i=1;i" and is fused with the tail of a different method: the
	 * statements that follow read `contextVector` and `topN`, which no visible
	 * signature declares. The rest of collapseText and the header of that
	 * second method must be restored from the upstream source.
	 *
	 * What is still visible of collapseText: it returns "" for an empty
	 * string, otherwise starts a StringBuilder with the first character.
	 */
	public static String collapseText(String coveredText) {
		char[] charArray = coveredText.toCharArray();
		if(charArray.length == 0)
			return "";
		char last = charArray[0];
		StringBuilder builder = new StringBuilder();
		builder.append(last);
		// NOTE(review): truncated line - everything between the loop condition
		// and the declaration of `entries` is missing.
		for(int i=1;i entries = Sets.newTreeSet(contextVector.getEntries());
		// From here on: prints at most topN entries of the sorted set to
		// System.out, one "lemma: number of co-occurrences" line per entry.
		int i = 0;
		for(ContextVector.Entry e:entries) {
			i++;
			if(i>topN)
				break;
			System.out.format("\t%-12s: %d\n", e.getCoTerm().getLemma(), e.getNbCooccs());
		}
	}

	/**
	 * Returns the strictness of t1 based on t2, i.e. the ratio of the
	 * occurrences of t1 that do not overlap with an occurrence of t2. 
	 * 
	 * @param t1
	 * 			the term to analyze
	 * @param t2
	 * 			the base term
	 * @return
	 * 			fstrict(t1) / f(t1). The division is a floating-point one:
	 * 			if the frequency of t1 is zero the result is
	 * 			<code>NaN</code> or infinite rather than an exception.
	 */
	public static double getStrictness(Term t1, Term t2) {
		// Work on a copy so that the occurrences held by t1 are left untouched.
		Collection<TermOccurrence> occ1 = Lists.newArrayList(t1.getOccurrences());
		// NOTE(review): removeOverlaps is expected to strip from occ1 the
		// occurrences overlapping those of t2 - confirm in TermOccurrenceUtils.
		TermOccurrenceUtils.removeOverlaps(t2.getOccurrences(), occ1);
		double t1Strict = occ1.size();
		double t1F = t1.getFrequency();
		return t1Strict / t1F;
	}
	
	
	/**
	 * 
	 * Finds in a {@link TermIndex} the biggest extension affix term of a term depending 
	 * on a base term.
	 * 
	 * For example, the term "offshore wind turbine" is an extension of 
	 * "wind turbine". The extension affix is the term "offshore".
	 * 
	 * @param termIndex
	 * 			The term index that both terms belong to.
	 * @param base
	 * 			The base term
	 * @param extension
	 * 			The extension term
	 * @return
	 * 		the extension affix found in termIndex, null if none
	 * 		has been found.
	 * @throws IllegalStateException if extension is not an 
	 * 			extension of the term base, or if base is contained in
	 * 			extension without being at its very beginning or very end.
	 */
	public static Term getExtensionAffix(TermIndex termIndex, Term base, Term extension) {
		int index = TermUtils.getPosition(base, extension);
		if (index == -1) {
			throw new IllegalStateException(String.format(MSG_NOT_AN_EXTENSION, 
					extension,
					base)
				);
		}

		int baseSize = base.getWords().size();
		int extensionSize = extension.getWords().size();

		if (index == 0) {
			// base opens the extension: the affix is made of the words that follow it
			return findBiggestSuffix(
					termIndex, 
					extension.getWords().subList(baseSize, extensionSize)
				);
		}
		if (index + baseSize == extensionSize) {
			// base closes the extension: the affix is made of the words that precede it
			return findBiggestPrefix(
					termIndex, 
					extension.getWords().subList(0, index)
				);
		}
		// base sits strictly inside the extension. The message reads
		// "<contained> is contained into <container>", hence base comes first.
		throw new IllegalStateException(String.format(MSG_NOT_AN_AFFIX, 
				base,
				extension)
			);
	}

	/**
	 * Finds in a {@link TermIndex} the biggest prefix of a sequence of
	 * {@link TermWord}s that exists as a term.
	 * 
	 * Candidate prefixes are tried from the longest (the whole sequence)
	 * down to the first word alone; the first one whose grouping key is
	 * known to the index wins.
	 * 
	 * @param termIndex
	 * 			the term index
	 * @param words
	 * 			the initial sequence of {@link TermWord}s
	 * @return
	 * 			A {@link Term} found in termIndex that makes the
	 * 			biggest possible prefix sequence for words, or
	 * 			<code>null</code> if no prefix is a term of the index
	 * 			(always the case for an empty sequence).
	 */
	public static Term findBiggestPrefix(TermIndex termIndex, List<TermWord> words) {
		for (int end = words.size(); end > 0; end--) {
			String gKey = TermSuiteUtils.getGroupingKey(words.subList(0, end));
			Term t = termIndex.getTermByGroupingKey(gKey);
			if (t != null) {
				return t;
			}
		}
		return null;
	}
	

	/**
	 * Finds in a {@link TermIndex} the biggest suffix of a sequence of
	 * {@link TermWord}s that exists as a term.
	 * 
	 * Candidate suffixes are tried from the longest (the whole sequence)
	 * down to the last word alone; the first one whose grouping key is
	 * known to the index wins.
	 * 
	 * @param termIndex
	 * 			the term index
	 * @param words
	 * 			the initial sequence of {@link TermWord}s
	 * @return
	 * 			A {@link Term} found in termIndex that makes the
	 * 			biggest possible suffix sequence for words, or
	 * 			<code>null</code> if no suffix is a term of the index
	 * 			(always the case for an empty sequence).
	 */
	public static Term findBiggestSuffix(TermIndex termIndex, List<TermWord> words) {
		int size = words.size();
		for (int start = 0; start < size; start++) {
			String gKey = TermSuiteUtils.getGroupingKey(words.subList(start, size));
			Term t = termIndex.getTermByGroupingKey(gKey);
			if (t != null) {
				return t;
			}
		}
		return null;
	}
	
	/**
	 * Tells whether a term appears inside another term.
	 * 
	 * @param term
	 * 			the term to look for
	 * @param inTerm
	 * 			the term to search in
	 * @return
	 * 			<code>true</code> unless {@link #getPosition(Term, Term)}
	 * 			answers -1 for these two terms.
	 */
	public static boolean isIncludedIn(Term term, Term inTerm) {
		int position = getPosition(term, inTerm);
		return position != -1;
	}

	/**
	 * Tells whether a term opens another term.
	 * 
	 * @param term
	 * 			the candidate prefix
	 * @param ofTerm
	 * 			the term to search in
	 * @return
	 * 			<code>true</code> if {@link #getPosition(Term, Term)}
	 * 			answers 0 for these two terms.
	 */
	public static boolean isPrefixOf(Term term, Term ofTerm) {
		int position = getPosition(term, ofTerm);
		return position == 0;
	}

	/**
	 * Tells whether a term closes another term.
	 * 
	 * @param term
	 * 			the candidate suffix
	 * @param ofTerm
	 * 			the term to search in
	 * @return
	 * 			<code>true</code> if term is found in ofTerm by
	 * 			{@link #getPosition(Term, Term)} and ends on the last
	 * 			word of ofTerm; <code>false</code> otherwise, including
	 * 			when term is not contained in ofTerm at all.
	 */
	public static boolean isSuffixOf(Term term, Term ofTerm) {
		int position = getPosition(term, ofTerm);
		if (position == -1) {
			// Not included at all. Without this guard the arithmetic below would
			// wrongly hold whenever term has exactly one more word than ofTerm.
			return false;
		}
		return position + term.getWords().size() == ofTerm.getWords().size();
	}

	
	/**
	 * Finds the index of appearance of a term's sub-term.
	 * 
	 * 
	 * @param subTerm
	 * 			the inner term, must be included in term
	 * @param term
	 * 			the container term.
	 * @return
	 * 			the starting index of subTerm in term. -1 otherwise.
	 */
	public static int getPosition(Term subTerm, Term term) {
		int startingIndex = -1;
		int j = 0;
		for(int i=0; i