
fr.univnantes.termsuite.tools.opt.ClientHelper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of termsuite-core Show documentation
Show all versions of termsuite-core Show documentation
A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment
package fr.univnantes.termsuite.tools.opt;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import fr.univnantes.termsuite.api.ExtractorOptions;
import fr.univnantes.termsuite.api.ResourceConfig;
import fr.univnantes.termsuite.api.TXTCorpus;
import fr.univnantes.termsuite.api.TermSuite;
import fr.univnantes.termsuite.api.TermSuiteException;
import fr.univnantes.termsuite.engines.cleaner.TerminoFilterOptions;
import fr.univnantes.termsuite.engines.contextualizer.AssociationRate;
import fr.univnantes.termsuite.engines.contextualizer.ContextualizerOptions;
import fr.univnantes.termsuite.engines.gatherer.GathererOptions;
import fr.univnantes.termsuite.engines.postproc.TermRankingOptions;
import fr.univnantes.termsuite.engines.splitter.MorphologicalOptions;
import fr.univnantes.termsuite.framework.TermSuiteFactory;
import fr.univnantes.termsuite.io.tsv.TsvOptions;
import fr.univnantes.termsuite.metrics.SimilarityDistance;
import fr.univnantes.termsuite.model.Lang;
import fr.univnantes.termsuite.model.OccurrenceStore;
import fr.univnantes.termsuite.model.Property;
import fr.univnantes.termsuite.model.TermProperty;
import fr.univnantes.termsuite.resources.PostProcessorOptions;
import fr.univnantes.termsuite.tools.CliUtil;
import fr.univnantes.termsuite.tools.CommandLineClient;
import fr.univnantes.termsuite.tools.TermSuiteCliException;
import fr.univnantes.termsuite.utils.TermHistory;
public class ClientHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(ClientHelper.class);
private CommandLineClient client;
public ClientHelper(CommandLineClient client) {
super();
this.client = client;
}
public ResourceConfig getResourceConfig() {
ResourceConfig resourceConfig = new ResourceConfig();
if(client.isSet(TermSuiteCliOption.RESOURCE_DIR))
resourceConfig.addDirectory(client.asDir(TermSuiteCliOption.RESOURCE_DIR));
if(client.isSet(TermSuiteCliOption.RESOURCE_JAR))
resourceConfig.addJar(client.asPath(TermSuiteCliOption.RESOURCE_JAR));
if(client.isSet(TermSuiteCliOption.RESOURCE_URL_PREFIX))
try {
resourceConfig.addResourcePrefix(new URL(client.asString(TermSuiteCliOption.RESOURCE_URL_PREFIX)));
} catch (MalformedURLException e) {
LOGGER.error("Invalid url prefix: {}", client.asString(TermSuiteCliOption.RESOURCE_URL_PREFIX));
throw new TermSuiteException(e);
}
return resourceConfig;
}
public void declareResourceOpts() {
client.declareFacultative(TermSuiteCliOption.RESOURCE_DIR);
client.declareFacultative(TermSuiteCliOption.RESOURCE_JAR);
client.declareFacultative(TermSuiteCliOption.RESOURCE_URL_PREFIX);
}
public void declareHistory() {
client.declareFacultative(TermSuiteCliOption.WATCH);
}
public Optional getHistory() {
if(client.isSet(TermSuiteCliOption.WATCH)) {
TermHistory value = new TermHistory();
List asTermString = client.asTermString(TermSuiteCliOption.WATCH);
value.addWatchedTermString(asTermString);
return Optional.of(value);
} else
return Optional.empty();
}
public void declareExtractorOptions() {
client.declareFacultative(TermSuiteCliOption.PRE_FILTER_PROPERTY);
client.declareConditional(TermSuiteCliOption.PRE_FILTER_PROPERTY,
TermSuiteCliOption.PRE_FILTER_MAX_VARIANTS_NUM,
TermSuiteCliOption.PRE_FILTER_THRESHOLD,
TermSuiteCliOption.PRE_FILTER_TOP_N);
client.declareAtMostOneOf(TermSuiteCliOption.PRE_FILTER_TOP_N, TermSuiteCliOption.PRE_FILTER_THRESHOLD);
client.declareFacultative(TermSuiteCliOption.CONTEXTUALIZER_ENABLED);
client.declareConditional(TermSuiteCliOption.CONTEXTUALIZER_ENABLED,
TermSuiteCliOption.CONTEXTUALIZER_ASSOC_RATE,
TermSuiteCliOption.CONTEXTUALIZER_MIN_COOC_TH,
TermSuiteCliOption.CONTEXTUALIZER_SCOPE);
client.declareFacultative(TermSuiteCliOption.MORPHOLOGY_DISABLED);
client.declareFacultative(TermSuiteCliOption.MORPHOLOGY_DERIVATIVE_DISABLED);
client.declareFacultative(TermSuiteCliOption.MORPHOLOGY_NATIVE_DISABLED);
client.declareFacultative(TermSuiteCliOption.MORPHOLOGY_PREFIX_DISABLED);
client.declareFacultative(TermSuiteCliOption.GATHERER_DISABLE_MERGER);
client.declareFacultative(TermSuiteCliOption.GATHERER_DISABLE_GATHERING);
client.declareFacultative(TermSuiteCliOption.GATHERER_GRAPH_SIMILARITY_THRESHOLD);
client.declareFacultative(TermSuiteCliOption.GATHERER_ENABLE_SEMANTIC);
client.declareConditional(TermSuiteCliOption.GATHERER_ENABLE_SEMANTIC,
TermSuiteCliOption.GATHERER_SEMANTIC_DISTANCE,
TermSuiteCliOption.GATHERER_SEMANTIC_NB_CANDIDATES,
TermSuiteCliOption.GATHERER_SEMANTIC_SIMILAIRTY_TH,
TermSuiteCliOption.GATHERER_SEMANTIC_DISTANCE,
TermSuiteCliOption.GATHERER_SEMANTIC_SYNONYMS_DICO,
TermSuiteCliOption.GATHERER_SEMANTIC_DICO_ONLY
);
client.declareFacultative(TermSuiteCliOption.POSTPROC_DISABLED);
client.declareFacultative(TermSuiteCliOption.POSTPROC_INDEPENDANCE_TH);
client.declareFacultative(TermSuiteCliOption.POSTPROC_AFFIX_SCORE_TH);
client.declareFacultative(TermSuiteCliOption.POSTPROC_ORTHO_SCORE_TH);
client.declareFacultative(TermSuiteCliOption.POSTPROC_VARIATION_SCORE_TH);
client.declareCannotAppearWhenCondition(TermSuiteCliOption.POSTPROC_DISABLED,
TermSuiteCliOption.POSTPROC_INDEPENDANCE_TH,
TermSuiteCliOption.POSTPROC_AFFIX_SCORE_TH,
TermSuiteCliOption.POSTPROC_ORTHO_SCORE_TH,
TermSuiteCliOption.POSTPROC_VARIATION_SCORE_TH);
client.declareFacultative(TermSuiteCliOption.POST_FILTER_PROPERTY);
client.declareConditional(TermSuiteCliOption.POST_FILTER_PROPERTY,
TermSuiteCliOption.POST_FILTER_KEEP_VARIANTS,
TermSuiteCliOption.POST_FILTER_MAX_VARIANTS_NUM,
TermSuiteCliOption.POST_FILTER_THRESHOLD,
TermSuiteCliOption.POST_FILTER_TOP_N);
client.declareAtMostOneOf(TermSuiteCliOption.POST_FILTER_TOP_N, TermSuiteCliOption.POST_FILTER_THRESHOLD);
client.declareFacultative(TermSuiteCliOption.RANKING_DESC_PROPERTY);
client.declareFacultative(TermSuiteCliOption.RANKING_ASC_PROPERTY);
client.declareAtMostOneOf(TermSuiteCliOption.RANKING_DESC_PROPERTY, TermSuiteCliOption.RANKING_ASC_PROPERTY);
}
public ExtractorOptions getExtractorOptions(Lang lang) {
ExtractorOptions options = TermSuite.getDefaultExtractorConfig(lang);
configureFilter(options.getPreFilterConfig(),
TermSuiteCliOption.PRE_FILTER_PROPERTY,
TermSuiteCliOption.PRE_FILTER_MAX_VARIANTS_NUM,
TermSuiteCliOption.PRE_FILTER_THRESHOLD,
TermSuiteCliOption.PRE_FILTER_TOP_N
);
options.getPreFilterConfig().setKeepVariants(false);
configureContextualizer(options);
configureMorphology(options);
configureGatherer(options);
configurePostProc(options);
configureRanking(options);
configureFilter(options.getPostFilterConfig(),
TermSuiteCliOption.POST_FILTER_PROPERTY,
TermSuiteCliOption.POST_FILTER_MAX_VARIANTS_NUM,
TermSuiteCliOption.POST_FILTER_THRESHOLD,
TermSuiteCliOption.POST_FILTER_TOP_N
);
if(options.getPostFilterConfig().isEnabled())
options.getPostFilterConfig().setKeepVariants(client.asBoolean(TermSuiteCliOption.POST_FILTER_KEEP_VARIANTS));
return options;
}
private void configureRanking(ExtractorOptions options) {
TermRankingOptions config = options.getRankingConfig();
TermProperty termProperty = null;
if(client.isSet(TermSuiteCliOption.RANKING_ASC_PROPERTY)) {
termProperty = client.asTermProperty(TermSuiteCliOption.RANKING_ASC_PROPERTY);
config.setRankingProperty(termProperty);
config.setDesc(false);
ensureNumeric(TermSuiteCliOption.RANKING_ASC_PROPERTY, termProperty);
} else if(client.isSet(TermSuiteCliOption.RANKING_DESC_PROPERTY)) {
termProperty = client.asTermProperty(TermSuiteCliOption.RANKING_DESC_PROPERTY);
config.setRankingProperty(termProperty);
config.setDesc(true);
ensureNumeric(TermSuiteCliOption.RANKING_DESC_PROPERTY, termProperty);
}
}
private void ensureNumeric(CliOption opt, Property> property) {
if(property != null && !property.isNumeric()) {
CliUtil.throwException("\"%s\" is an invalid property for option --%s. Expected a number property.",
property.getJsonField(),
opt.getOptName()
);
}
}
private void configurePostProc(ExtractorOptions options) {
PostProcessorOptions config = options.getPostProcessorConfig();
config.setEnabled(!client.isSet(TermSuiteCliOption.POSTPROC_DISABLED));
if(config.isEnabled()) {
if(client.isSet(TermSuiteCliOption.POSTPROC_INDEPENDANCE_TH))
config.setTermIndependanceTh(client.asDouble(TermSuiteCliOption.POSTPROC_INDEPENDANCE_TH));
if(client.isSet(TermSuiteCliOption.POSTPROC_VARIATION_SCORE_TH))
config.setVariationScoreTh(client.asDouble(TermSuiteCliOption.POSTPROC_VARIATION_SCORE_TH));
}
}
private void configureGatherer(ExtractorOptions options) {
GathererOptions config = options.getGathererConfig();
config.setMergerEnabled(!client.isSet(TermSuiteCliOption.GATHERER_DISABLE_MERGER));
config.setSemanticEnabled(client.isSet(TermSuiteCliOption.GATHERER_ENABLE_SEMANTIC));
config.setSemanticDicoOnly(client.isSet(TermSuiteCliOption.GATHERER_SEMANTIC_DICO_ONLY));
if(client.isSet(TermSuiteCliOption.GATHERER_DISABLE_GATHERING))
config.setEnabled(false);
if(client.isSet(TermSuiteCliOption.GATHERER_GRAPH_SIMILARITY_THRESHOLD))
config.setGraphicalSimilarityThreshold(client.asDouble(TermSuiteCliOption.GATHERER_GRAPH_SIMILARITY_THRESHOLD));
if(config.isSemanticEnabled()) {
if(client.isSet(TermSuiteCliOption.GATHERER_SEMANTIC_DISTANCE)) {
Class extends SimilarityDistance> cls = SimilarityDistance.forName(client.asString(TermSuiteCliOption.GATHERER_SEMANTIC_DISTANCE));
config.setSemanticSimilarityDistance(cls);
}
if(client.isSet(TermSuiteCliOption.GATHERER_SEMANTIC_NB_CANDIDATES))
config.setSemanticNbCandidates(client.asInt(TermSuiteCliOption.GATHERER_SEMANTIC_NB_CANDIDATES));
if(client.isSet(TermSuiteCliOption.GATHERER_SEMANTIC_SIMILAIRTY_TH))
config.setSemanticSimilarityThreshold(client.asDouble(TermSuiteCliOption.GATHERER_SEMANTIC_SIMILAIRTY_TH));
}
}
private void configureMorphology(ExtractorOptions options) {
MorphologicalOptions config = options.getMorphologicalConfig();
config.setEnabled(!client.isSet(TermSuiteCliOption.MORPHOLOGY_DISABLED));
if(config.isEnabled()) {
config.setDerivativesDetecterEnabled(!client.isSet(TermSuiteCliOption.MORPHOLOGY_DERIVATIVE_DISABLED));
config.setNativeSplittingEnabled(!client.isSet(TermSuiteCliOption.MORPHOLOGY_NATIVE_DISABLED));
config.setPrefixSplitterEnabled(!client.isSet(TermSuiteCliOption.MORPHOLOGY_PREFIX_DISABLED));
}
}
private void configureContextualizer(ExtractorOptions options) {
ContextualizerOptions config = options.getContextualizerOptions();
config.setEnabled(client.isSet(TermSuiteCliOption.CONTEXTUALIZER_ENABLED));
if(config.isEnabled()) {
if(client.isSet(TermSuiteCliOption.CONTEXTUALIZER_SCOPE))
config.setScope(client.asInt(TermSuiteCliOption.CONTEXTUALIZER_SCOPE));
if(client.isSet(TermSuiteCliOption.CONTEXTUALIZER_MIN_COOC_TH))
config
.setMinimumCooccFrequencyThreshold(
client.asInt(TermSuiteCliOption.CONTEXTUALIZER_MIN_COOC_TH));
if(client.isSet(TermSuiteCliOption.CONTEXTUALIZER_ASSOC_RATE))
config.setAssociationRate(AssociationRate.forName(
client.asString(TermSuiteCliOption.CONTEXTUALIZER_ASSOC_RATE)));
}
}
private void configureFilter(TerminoFilterOptions filterConfig, CliOption filterProperty,
CliOption maxVariantsNum, CliOption threshold,
TermSuiteCliOption topN) {
filterConfig.setEnabled(client.isSet(filterProperty));
if(filterConfig.isEnabled()) {
TermProperty property = client.asTermProperty(filterProperty);
filterConfig.setProperty(property);
if(client.isSet(maxVariantsNum))
filterConfig.setMaxVariantNum(client.asInt(maxVariantsNum));
if(client.isSet(threshold))
filterConfig.keepOverTh(client.asDouble(threshold));
else if(client.isSet(topN))
filterConfig.keepTopN(client.asInt(topN));
else
CliUtil.throwAtLeast(topN, threshold);
}
}
public void declareTsvOptions() {
client.declareConditional(TermSuiteCliOption.TSV,
TermSuiteCliOption.TSV_PROPERTIES,
TermSuiteCliOption.TSV_HIDE_VARIANTS,
TermSuiteCliOption.TSV_HIDE_HEADER
);
}
public TsvOptions getTsvOptions() {
TsvOptions options = new TsvOptions();
options.setShowVariants(!client.isSet(TermSuiteCliOption.TSV_HIDE_VARIANTS));
if(client.isSet(TermSuiteCliOption.TSV_PROPERTIES)) {
Collection> properties = new ArrayList<>();
for(String str:client.asList(TermSuiteCliOption.TSV_PROPERTIES))
properties.add(client.asProperty(str));
options.properties(properties);
}
options.showHeaders(!client.isSet(TermSuiteCliOption.TSV_HIDE_HEADER));
return options;
}
public TXTCorpus getTxtCorpus() {
Path ascorpusPath = client.asDir(TermSuiteCliOption.FROM_TXT_CORPUS_PATH);
TXTCorpus txtCorpus = new TXTCorpus(client.getLang(), ascorpusPath);
if(txtCorpus.documents().findAny().isPresent())
return txtCorpus;
else
throw new TermSuiteCliException("No txt document found in corpus " + txtCorpus);
}
public int getCappedSize() {
if(client.isSet(TermSuiteCliOption.CAPPED_SIZE))
return client.asInt(TermSuiteCliOption.CAPPED_SIZE);
else
return 500000;
}
public OccurrenceStore getOccurrenceStore(Lang lang) {
if(client.isSet(TermSuiteCliOption.NO_OCCURRENCE)) {
return TermSuiteFactory.createEmptyOccurrenceStore(lang);
} else
return TermSuiteFactory.createMemoryOccurrenceStore(lang);
}
public void declareBigCorpusOptions() {
/*
* Big corpus options
*/
client.declareFacultative(TermSuiteCliOption.CAPPED_SIZE);
client.declareFacultative(TermSuiteCliOption.NO_OCCURRENCE);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy