All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.project.ttc.models.index.TermValueProviders Maven / Gradle / Ivy

Go to download

A Java UIMA-based toolbox for multilingual and efficient terminology extraction an multilingual term alignment

There is a newer version: 3.0.10
Show newest version
/*******************************************************************************
 * Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.models.index;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import eu.project.ttc.models.Component;
import eu.project.ttc.models.LemmaStemHolder;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermWord;
import eu.project.ttc.utils.TermSuiteConstants;

public class TermValueProviders {
	private static final Logger LOGGER = LoggerFactory.getLogger(TermValueProviders.class);
	
	public static final TermValueProvider TERM_SINGLE_WORD_LEMMA_PROVIDER = new AbstractTermValueProvider(TermIndexes.SINGLE_WORD_LEMMA) {
		@Override
		public Collection getClasses(Term term) {
			if(term.isSingleWord())
				return Lists.newArrayList(term.getWords().get(0).getWord().getLemma());
			return null;
		}
	};

	public static final TermValueProvider TERM_LEMMA_LOWER_CASE_PROVIDER = new AbstractTermValueProvider(TermIndexes.LEMMA_LOWER_CASE) {
		public java.util.Collection getClasses(Term term) {
			return ImmutableList.of(term.getLemma().toLowerCase());
		};
	};

	public static final TermValueProvider TERM_NOCLASS_PROVIDER = new AbstractTermValueProvider(TermIndexes.TERM_NOCLASS) {
		private String value = "noclass";
		public java.util.Collection getClasses(Term term) {
			return ImmutableList.of(value);
		};
	};

	public static final TermValueProvider WORD_LEMMA_PROVIDER = new AbstractTermValueProvider(TermIndexes.WORD_LEMMA) {
		@Override
		public Collection getClasses(Term term) {
			List lemmas = Lists.newArrayListWithCapacity(term.getWords().size());
			Iterator it = term.asComponentIterator();
			LemmaStemHolder c;
			while(it.hasNext()) {
				c = it.next();
				if(c.getLemma() == null) {
					LOGGER.warn("Lemma is null for " + c);
				} else {
					if(c.isLemmaSet())
						lemmas.add(c.getLemma());
					else
						lemmas.add(NO_LEMMA_SET);
				}
				
			}
			return lemmas;
		}
	};

	
	public static final TermValueProvider WORD_LEMMA_STEM_PROVIDER = new AbstractTermValueProvider(TermIndexes.WORD_COUPLE_LEMMA_STEM) {

		@Override
		public Collection getClasses(Term term) {
			List lemmas = Lists.newArrayListWithCapacity(term.getWords().size());
			
			Map stems = new HashMap();
			for(TermWord w:term.getWords()) {
				if (w.getWord().getLemma() == null || w.getWord().getLemma().isEmpty()) {
					LOGGER.warn("lemma is null or empty: " + w);
					continue;
				} else if(TermSuiteConstants.TERM_MATCHER_LABELS.contains(w.getSyntacticLabel())) {
						lemmas.add(w.getWord().getNormalizedLemma());
					if(w.getWord().getStem() == null || w.getWord().getStem().isEmpty()) {
						LOGGER.warn("stem is null or empty: " + w);
					} else 
						stems.put(w.getWord().getNormalizedLemma(), w.getWord().getNormalizedStem());
				}
			}	
			Collections.sort(lemmas);
			List keys = Lists.newArrayListWithCapacity(lemmas.size());
			
			for (int i = 0 ; i < lemmas.size(); i++) {
				for (int j = i + 1; j < lemmas.size(); j++) {
					StringBuilder sb = new StringBuilder();
					sb.append(lemmas.get(i));
					sb.append(TermSuiteConstants.PLUS);
					sb.append(stems.get(lemmas.get(j)));
					keys.add(sb.toString());
				}
			}
			

			return keys;
		}
	};
	protected static final String NO_LEMMA_SET = "__no_lemma_set__";
		
	public static final TermValueProvider WORD_LEMMA_LEMMA_PROVIDER = new AbstractTermValueProvider(TermIndexes.WORD_COUPLE_LEMMA_LEMMA) {

		@Override
		public Collection getClasses(Term term) {
			List lemmas = Lists.newArrayListWithCapacity(term.getWords().size());
			for(TermWord w:term.getWords()) {
				if (w.getWord().getLemma() == null || w.getWord().getLemma().isEmpty()) {
					LOGGER.warn("lemma is null or empty: " + w);
					continue;
				} else if(TermSuiteConstants.TERM_MATCHER_LABELS.contains(w.getSyntacticLabel())) {
					lemmas.add(w.getWord().getLemma());
					if(w.getWord().isCompound()) {
						for(Component c:w.getWord().getComponents()) {
							if(c.isLemmaSet())
								lemmas.add(c.getLemma());
							else 
								lemmas.add(NO_LEMMA_SET);
								
						}
					}
				}
			}	
			Collections.sort(lemmas);
			List keys = Lists.newArrayListWithCapacity((lemmas.size()*(lemmas.size()-1))/2);

			for (int i = 0 ; i < lemmas.size(); i++) {
				for (int j = i + 1; j < lemmas.size(); j++) {
					StringBuilder sb = new StringBuilder();
					sb.append(lemmas.get(i));
					sb.append(TermSuiteConstants.PLUS);
					sb.append(lemmas.get(j));
					keys.add(sb.toString());
				}
			}
			
			return keys;
			
		}
	};
	
	public static final TermSingleValueProvider WORD_LEMMA_LOWER_CASE = new TermSingleValueProvider(TermIndexes.WORD_LEMMA_LOWER_CASE) {
		@Override
		public String getClass(Term term) {
			return Character.toString(Character.toLowerCase(term.getGroupingKey().charAt(0)));
		}
	};
	
	private static final Map valueProviders = Maps.newHashMap();

	static {
		valueProviders.put(TermIndexes.SINGLE_WORD_LEMMA, TERM_SINGLE_WORD_LEMMA_PROVIDER);
		valueProviders.put(TermIndexes.TERM_NOCLASS, TERM_NOCLASS_PROVIDER);
		valueProviders.put(TermIndexes.LEMMA_LOWER_CASE, TERM_LEMMA_LOWER_CASE_PROVIDER);
		valueProviders.put(TermIndexes.WORD_LEMMA_LOWER_CASE, WORD_LEMMA_LOWER_CASE);
		valueProviders.put(TermIndexes.WORD_LEMMA, WORD_LEMMA_PROVIDER);
		valueProviders.put(TermIndexes.WORD_COUPLE_LEMMA_STEM, WORD_LEMMA_STEM_PROVIDER);
		valueProviders.put(TermIndexes.WORD_COUPLE_LEMMA_LEMMA, WORD_LEMMA_LEMMA_PROVIDER);
	}

	public static TermValueProvider get(String key) {
		return valueProviders.get(key);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy