fr.univnantes.termsuite.utils.TermUtils Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package fr.univnantes.termsuite.utils;

import static java.util.stream.Collectors.joining;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import fr.univnantes.termsuite.engines.splitter.CompoundUtils;
import fr.univnantes.termsuite.model.Component;
import fr.univnantes.termsuite.model.Term;
import fr.univnantes.termsuite.model.TermWord;
import fr.univnantes.termsuite.model.Word;

public class TermUtils {


	/**
	 * Orders terms by decreasing frequency, i.e. most frequent first.
	 */
	public static Comparator<Term> frequencyComparator = new Comparator<Term>() {
		@Override
		public int compare(Term o1, Term o2) {
			// Operands are deliberately swapped (o2 before o1) so that the
			// resulting order is descending on frequency.
			return ComparisonChain.start()
					.compare(o2.getFrequency(), o1.getFrequency())
					.result();
		}
	};

	private static final String STEMMED_INSENSITIVE_GKEY_FORMAT = "%s: %s";
	/**
	 * Builds a grouping key from the syntactic label of a term word and the
	 * stem of its word, lower-cased and passed through
	 * {@link StringUtils#replaceAccents(String)}.
	 * <p>
	 * e.g. a: Hydroélectrique -> a: hydroelectric
	 * 
	 * @param termWord
	 * 			the term word to build the key for
	 * @return
	 * 			the "label: stem" grouping key
	 */
	public static String stemmedInsensitiveGroupingKey(TermWord termWord) {
		String rawKey = String.format(
				STEMMED_INSENSITIVE_GKEY_FORMAT,
				termWord.getSyntacticLabel(),
				termWord.getWord().getStem());
		// Lower-case with Locale.ROOT so that the key does not depend on the
		// JVM default locale (e.g. "I" becomes a dotless i under a Turkish locale).
		return StringUtils.replaceAccents(rawKey.toLowerCase(Locale.ROOT));
	}
	
	/**
	 * Builds a grouping key from the syntactic label of a term word and the
	 * lemma of its word, lower-cased and passed through
	 * {@link StringUtils#replaceAccents(String)}.
	 * <p>
	 * e.g. a: Hydroélectrique -> a: hydroelectrique
	 * 
	 * @param termWord
	 * 			the term word to build the key for
	 * @return
	 * 			the "label: lemma" grouping key
	 */
	public static String lemmatizedInsensitiveGroupingKey(TermWord termWord) {
		String rawKey = String.format(
				STEMMED_INSENSITIVE_GKEY_FORMAT,
				termWord.getSyntacticLabel(),
				termWord.getWord().getLemma());
		// Lower-case with Locale.ROOT so that the key does not depend on the
		// JVM default locale (e.g. "I" becomes a dotless i under a Turkish locale).
		return StringUtils.replaceAccents(rawKey.toLowerCase(Locale.ROOT));
	}



	public static String collapseText(String coveredText) {
		char[] charArray = coveredText.toCharArray();
		if(charArray.length == 0)
			return "";
		char last = charArray[0];
		StringBuilder builder = new StringBuilder();
		builder.append(last);
		for(int i=1;iterm
	 * @param term
	 * 			the container term.
	 * @return
	 * 			the starting index of subTerm in term. -1 otherwise.
	 */
	public static int getPosition(Term subTerm, Term term) {
		int startingIndex = -1;
		int j = 0;
		for(int i=0; i> toComponentSets(Iterable words) {
		List> sets = Lists.newArrayList();
		for(Word w:words) {
			if(w.isCompound())
				sets.add(Sets.newHashSet(CompoundUtils.allSizeComponents(w)));
			else {
				sets.add(Sets.newHashSet(new Component(0, w.getLemma().length(), w.getLemma())));
			}
		}
		return sets;
	}
	
	
	/**
	 * Return the term pair indexing key that is compliant with {@link TermValueProviders#ALLCOMP_PAIRS}.
	 * <p>
	 * The key is made of the lemma of the first word of each term, sorted in
	 * natural string order and joined with a <code>+</code> sign, so the result
	 * does not depend on the order of the two arguments.
	 * 
	 * @see TermValueProviders#ALLCOMP_PAIRS
	 * @param term1
	 * 				First term of the pair
	 * @param term2
	 * 				Second term of the pair
	 * @return
	 * 			The indexing key for the given pair
	 */
	public static String getLemmaLemmaKey(Term term1, Term term2) {
		// Typed list: a raw List would make Collections.sort an unchecked call.
		List<String> lemmas = new ArrayList<>(2);
		lemmas.add(term1.getWords().get(0).getWord().getLemma());
		lemmas.add(term2.getWords().get(0).getWord().getLemma());
		// Sorting makes the key symmetric in (term1, term2).
		Collections.sort(lemmas);
		return String.format("%s+%s", lemmas.get(0), lemmas.get(1));
	}

	/**
	 * Tells whether a term is made of exactly one word and that word is a compound.
	 * 
	 * @param actual
	 * 			the term to test
	 * @return
	 * 			<code>true</code> if the term has a single word and that word
	 * 			reports itself as compound, <code>false</code> otherwise
	 */
	public static boolean isCompound(Term actual) {
		// Multi-word (or empty) terms are never considered compounds here.
		if(actual.getWords().size() != 1)
			return false;
		return actual.getWords().get(0).getWord().isCompound();
	}

	/**
	 * Builds the lemma of a term by joining the lemmas of its words, in word
	 * order, with {@link TermSuiteConstants#WHITESPACE_STRING} as separator.
	 * 
	 * @param t
	 * 			the term
	 * @return
	 * 			the joined word lemmas of the term
	 */
	public static String getTermLemma(Term t) {
		return t.getWords()
			.stream()
			.map(termWord -> termWord.getWord().getLemma())
			.collect(joining(TermSuiteConstants.WHITESPACE_STRING));
	}

}