All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fr.univnantes.termsuite.engines.cleaner.TerminologyCleaner Maven / Gradle / Ivy

Go to download

A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment

The newest version!
package fr.univnantes.termsuite.engines.cleaner;

import static java.util.stream.Collectors.toSet;

import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;

import javax.inject.Inject;

import org.slf4j.Logger;

import fr.univnantes.termsuite.engines.SimpleEngine;
import fr.univnantes.termsuite.engines.cleaner.TerminoFilterOptions.FilterType;
import fr.univnantes.termsuite.framework.InjectLogger;
import fr.univnantes.termsuite.framework.Parameter;
import fr.univnantes.termsuite.framework.service.RelationService;
import fr.univnantes.termsuite.framework.service.TermService;
import fr.univnantes.termsuite.framework.service.TerminologyService;
import fr.univnantes.termsuite.model.RelationProperty;
import fr.univnantes.termsuite.model.Term;
import fr.univnantes.termsuite.model.TermProperty;
import fr.univnantes.termsuite.utils.TermHistory;

/**
 * Engine that prunes a terminology according to {@link TerminoFilterOptions}.
 *
 * <p>Execution runs in three steps: every term is flagged with
 * {@link TermProperty#FILTERED}, the flag is cleared on the terms selected by
 * the configured filter, then variations and still-flagged terms are removed
 * from the terminology.
 */
public class TerminologyCleaner extends SimpleEngine {
	@InjectLogger Logger logger;

	@Parameter
	private TerminoFilterOptions options;
	
	@Inject TerminologyService termino;
	
	/**
	 * Sets the term history used to record events about watched terms.
	 *
	 * @param history the optional history to use
	 * @return this cleaner, for call chaining
	 */
	public TerminologyCleaner setHistory(Optional<TermHistory> history) {
		// NOTE(review): the history field is not declared in this class;
		// presumably it is inherited from SimpleEngine - confirm.
		this.history = history;
		return this;
	}
	
	/**
	 * Sets the filtering options driving this cleaner.
	 *
	 * @param options the filter options to apply
	 * @return this cleaner, for call chaining
	 */
	public TerminologyCleaner setOptions(TerminoFilterOptions options) {
		this.options = options;
		return this;
	}
	
	/**
	 * Flags the terms to filter, removes the matching variations, then removes
	 * the filtered terms. When debug logging is enabled, the number of terms
	 * and variations removed is logged at the end.
	 */
	@Override
	public void execute() {
		// Counting is only done when the figures will actually be logged.
		long termcount = 0;
		long relcount = 0;
		if(logger.isDebugEnabled()) {
			termcount = termino.termCount();
			relcount = termino.variations().count();
		}
		
		setTermFiltered(termino);
		cleanVariations(termino);
		cleanFilteredTerms(termino);
		
		if(logger.isDebugEnabled()) {
			long termcountAfter = termino.termCount();
			long relcountAfter = termino.variations().count();
			logger.debug("At end of filtering - Number of terms: {} (num of filtered terms: {})", termcountAfter, termcount-termcountAfter);
			logger.debug("At end of filtering - Number of variations: {} (num of filtered variations: {})", relcountAfter, relcount-relcountAfter);
		}
	}

	/**
	 * Removes the terms flagged as filtered from the terminology. When the
	 * options ask to keep variants, a filtered term that is the target of a
	 * remaining variation is kept.
	 *
	 * @param termino the terminology to clean
	 */
	private void cleanFilteredTerms(TerminologyService termino) {
		Stream<TermService> terms = termino.terms();
		if(!options.isKeepVariants()) {
			terms = terms
				.filter(TermService::isFiltered);
		} else {
			// Targets of the variations still present in the terminology.
			Set<TermService> keptTerms = termino.variations()
					.map(RelationService::getTo)
					.collect(toSet());

			terms = terms
				.filter(term -> term.isFiltered() && !keptTerms.contains(term));
		}
		
		// Materialize the selection before removing anything, so that the
		// terminology is not modified while its term stream is being consumed.
		terms
			.collect(toSet())
			.stream()
			.map(TermService::getTerm)
			.forEach(termino::removeTerm);
	}
	
	/**
	 * Removes the variations whose rank exceeds the configured maximum number
	 * of variants, then the variations whose source term is flagged as filtered.
	 *
	 * @param termino the terminology to clean
	 */
	private void cleanVariations(TerminologyService termino) {
		// Variations without a rank are left untouched by this first pass.
		termino.variations()
			.filter(variation -> variation.isPropertySet(RelationProperty.VARIATION_RANK))
			.filter(variation -> variation.getRank() > options.getMaxVariantNum())
			.collect(toSet())
			.forEach(termino::removeRelation);

		termino.variations()
			.filter(variation -> variation.getFrom().isFiltered())
			.collect(toSet())
			.forEach(termino::removeRelation);
	}

	/**
	 * Sets {@link TermProperty#FILTERED} on every term: first to {@code true}
	 * on all terms, then back to {@code false} on the terms selected by the
	 * configured filter type. For any filter type other than {@code TOP_N} or
	 * {@code THRESHOLD}, no selection is applied and the flag is cleared on
	 * every term.
	 *
	 * @param termino the terminology whose terms are flagged
	 */
	public void setTermFiltered(TerminologyService termino) {
		termino.terms().forEach(term -> {
			term.setProperty(TermProperty.FILTERED, true);
		});
		
		Stream<Term> terms = termino.terms().map(TermService::getTerm);
		if(options.getFilterType() == FilterType.TOP_N) {
			// Keep the first N terms in the order given by the filter property's comparator.
			terms = terms
				.filter(term -> term.isPropertySet(options.getFilterProperty()))
				.sorted(options.getFilterProperty().getComparator(true))
				.limit(options.getTopN());
		}
		
		if(options.getFilterType() == FilterType.THRESHOLD) {
			// Keep the terms whose filter property value reaches the threshold.
			terms = terms
				.filter(term -> term.isPropertySet(options.getFilterProperty()))
				.filter(term -> term.getNumber(options.getFilterProperty()).doubleValue() >= options.getThreshold().doubleValue());
		}

		terms.forEach(term -> {
			term.setProperty(TermProperty.FILTERED, false);
			watchMarkedAsFiltered(term);
		});
	}

	/**
	 * Saves an event in the history for the given term, provided a history is
	 * present and the term is watched by it.
	 *
	 * @param term the term the event is about
	 */
	private void watchMarkedAsFiltered(Term term) {
		// NOTE(review): the only visible caller invokes this right after
		// setting FILTERED to false, while the message says "marked as
		// filtered" - confirm the intended wording.
		history
			.filter(watcher -> watcher.isTermWatched(term))
			.ifPresent(watcher -> watcher.saveEvent(term, this.getClass(), String.format(
					"Term %s is marked as filtered because %s=%s",
					term,
					options.getFilterProperty(),
					term.get(options.getFilterProperty())
				)));
	}
}






© 2015 - 2024 Weber Informatics LLC | Privacy Policy