// fr.univnantes.termsuite.utils.TermUtils (artifact page header: Maven / Gradle / Ivy)
/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package fr.univnantes.termsuite.utils;

import static java.util.stream.Collectors.joining;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;

import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import fr.univnantes.termsuite.engines.splitter.CompoundUtils;
import fr.univnantes.termsuite.model.Component;
import fr.univnantes.termsuite.model.Term;
import fr.univnantes.termsuite.model.TermWord;
import fr.univnantes.termsuite.model.Word;

/**
 * Static helpers for working with {@link Term} objects: ordering by
 * frequency, building grouping keys from term words, locating a sub-term,
 * and deriving component sets and lemma strings.
 *
 * NOTE(review): this file was restored from a copy in which every span
 * between a '<' and the following '>' had been stripped. Generic type
 * arguments were re-derived from the surrounding code. The loop bodies of
 * {@link #collapseText(String)} and {@link #getPosition(Term, Term)} were
 * lost entirely and have been rewritten from the surviving method prefix and
 * Javadoc; any member that sat inside a stripped span is missing. Confirm
 * against version control before relying on either method.
 */
public class TermUtils {

    /**
     * Most frequent first
     */
    public static Comparator<Term> frequencyComparator = new Comparator<Term>() {
        @Override
        public int compare(Term o1, Term o2) {
            // Arguments are swapped on purpose: higher frequency sorts first.
            return ComparisonChain.start()
                    .compare(o2.getFrequency(), o1.getFrequency())
                    .result();
        }
    };

    /** Layout shared by both grouping keys: syntactic label, colon, word form. */
    private static final String STEMMED_INSENSITIVE_GKEY_FORMAT = "%s: %s";

    /**
     * Builds a lower-cased, accent-free grouping key from the syntactic label
     * and the stem of the given term word.
     *
     * e.g. a: Hydroélectrique -> a: hydroelectric
     *
     * @param termWord
     *          the term word to build the key from
     * @return
     *          the key, formatted as "label: stem"
     */
    public static String stemmedInsensitiveGroupingKey(TermWord termWord) {
        return StringUtils.replaceAccents(String.format(
                STEMMED_INSENSITIVE_GKEY_FORMAT,
                termWord.getSyntacticLabel(),
                termWord.getWord().getStem()).toLowerCase());
    }

    /**
     * Builds a lower-cased, accent-free grouping key from the syntactic label
     * and the lemma of the given term word.
     *
     * e.g. a: Hydroélectrique -> a: hydroelectrique
     *
     * @param termWord
     *          the term word to build the key from
     * @return
     *          the key, formatted as "label: lemma"
     */
    public static String lemmatizedInsensitiveGroupingKey(TermWord termWord) {
        return StringUtils.replaceAccents(String.format(
                STEMMED_INSENSITIVE_GKEY_FORMAT,
                termWord.getSyntacticLabel(),
                termWord.getWord().getLemma()).toLowerCase());
    }

    /**
     * Collapses the given text: as reconstructed, every run of whitespace
     * characters becomes a single space and the result is trimmed.
     *
     * NOTE(review): only the code up to the start of the {@code for} loop
     * survived. The loop body and the return statement below are a
     * reconstruction inferred from the method name and the surviving
     * {@code last}/{@code builder} setup; confirm against version control.
     *
     * @param coveredText
     *          the text to collapse
     * @return
     *          the collapsed text, or the empty string if the input is empty
     */
    public static String collapseText(String coveredText) {
        char[] charArray = coveredText.toCharArray();
        if (charArray.length == 0) {
            return "";
        }
        char last = charArray[0];
        StringBuilder builder = new StringBuilder();
        builder.append(last);
        for (int i = 1; i < charArray.length; i++) {
            char c = charArray[i];
            if (Character.isWhitespace(c)) {
                if (Character.isWhitespace(last)) {
                    // Already inside a whitespace run: emit nothing more.
                    continue;
                }
                builder.append(' ');
            } else {
                builder.append(c);
            }
            last = c;
        }
        return builder.toString().trim();
    }

    /**
     * Finds the index of appearance of a term's sub-term.
     *
     * NOTE(review): only the two local declarations and the start of the
     * {@code for} loop survived. The matching logic below is a reconstruction
     * written to satisfy the documented contract; confirm against version
     * control.
     *
     * @param subTerm
     *          the inner term, must be included in <code>term</code>
     * @param term
     *          the container term.
     * @return
     *          the starting index of <code>subTerm</code> in <code>term</code>. -1 otherwise.
     */
    public static int getPosition(Term subTerm, Term term) {
        int startingIndex = -1;
        int j = 0;
        for (int i = 0; i < term.getWords().size(); i++) {
            if (j < subTerm.getWords().size()
                    && term.getWords().get(i).equals(subTerm.getWords().get(j))) {
                if (j == 0) {
                    startingIndex = i;
                }
                j++;
                if (j == subTerm.getWords().size()) {
                    return startingIndex;
                }
            } else if (j > 0) {
                // Partial match failed: resume one word after the failed
                // candidate start (the loop increment adds the +1).
                i = startingIndex;
                startingIndex = -1;
                j = 0;
            }
        }
        return -1;
    }

    /**
     * Builds one component set per word. A compound word contributes the
     * result of {@link CompoundUtils#allSizeComponents}; any other word
     * contributes a single component spanning its whole lemma.
     *
     * NOTE(review): the modifiers and return type of this method were
     * stripped. {@code public static List<Set<Component>>} is inferred from
     * the body and from the other members of this class.
     *
     * @param words
     *          the words to convert
     * @return
     *          the component sets, in iteration order of <code>words</code>
     */
    public static List<Set<Component>> toComponentSets(Iterable<Word> words) {
        List<Set<Component>> sets = Lists.newArrayList();
        for (Word w : words) {
            if (w.isCompound()) {
                sets.add(Sets.newHashSet(CompoundUtils.allSizeComponents(w)));
            } else {
                sets.add(Sets.newHashSet(new Component(0, w.getLemma().length(), w.getLemma())));
            }
        }
        return sets;
    }

    /**
     * Return the term pair indexing key that is compliant with {@link TermValueProviders#ALLCOMP_PAIRS}.
     *
     * The key joins the lemma of the first word of each term with a '+',
     * after sorting the two lemmas, so the argument order does not matter.
     *
     * @see TermValueProviders#ALLCOMP_PAIRS
     * @param term1
     *          First term of the pair
     * @param term2
     *          Second term of the pair
     * @return
     *          The indexing key for the given pair
     */
    public static String getLemmaLemmaKey(Term term1, Term term2) {
        List<String> lemmas = new ArrayList<>(2);
        lemmas.add(term1.getWords().get(0).getWord().getLemma());
        lemmas.add(term2.getWords().get(0).getWord().getLemma());
        Collections.sort(lemmas);
        return String.format("%s+%s", lemmas.get(0), lemmas.get(1));
    }

    /**
     * Tells whether the given term is a single-word term whose word is a
     * compound.
     *
     * @param actual
     *          the term to test
     * @return
     *          <code>true</code> if the term has exactly one word and that
     *          word is a compound
     */
    public static boolean isCompound(Term actual) {
        return actual.getWords().size() == 1
                && actual.getWords().get(0).getWord().isCompound();
    }

    /**
     * Joins the lemmas of all words of the given term, in order, using
     * {@link TermSuiteConstants#WHITESPACE_STRING} as separator.
     *
     * @param t
     *          the term
     * @return
     *          the joined lemmas
     */
    public static String getTermLemma(Term t) {
        return t.getWords().stream()
                .map(TermWord::getWord)
                .map(Word::getLemma)
                .collect(joining(TermSuiteConstants.WHITESPACE_STRING));
    }
}