eu.project.ttc.tools.TermSuitePipeline (termsuite-core) - Maven / Gradle / Ivy
A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment.
/*******************************************************************************
* Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.tools;
import java.io.Serializable;
import java.math.BigInteger;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.SecureRandom;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ExternalResourceDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import eu.project.ttc.engines.AffixCompoundSplitter;
import eu.project.ttc.engines.CasStatCounter;
import eu.project.ttc.engines.CompostAE;
import eu.project.ttc.engines.CompoundSplitter;
import eu.project.ttc.engines.Contextualizer;
import eu.project.ttc.engines.EvalEngine;
import eu.project.ttc.engines.ExtensionDetecter;
import eu.project.ttc.engines.GraphicalVariantGatherer;
import eu.project.ttc.engines.MateLemmaFixer;
import eu.project.ttc.engines.MateLemmatizerTagger;
import eu.project.ttc.engines.Merger;
import eu.project.ttc.engines.PipelineObserver;
import eu.project.ttc.engines.PrimaryOccurrenceDetector;
import eu.project.ttc.engines.Ranker;
import eu.project.ttc.engines.RegexSpotter;
import eu.project.ttc.engines.ScorerAE;
import eu.project.ttc.engines.StringRegexFilter;
import eu.project.ttc.engines.SyntacticTermGatherer;
import eu.project.ttc.engines.TermClassifier;
import eu.project.ttc.engines.TermIndexBlacklistWordFilterAE;
import eu.project.ttc.engines.TermSpecificityComputer;
import eu.project.ttc.engines.TreeTaggerLemmaFixer;
import eu.project.ttc.engines.cleaner.AbstractTermIndexCleaner;
import eu.project.ttc.engines.cleaner.FilterRules;
import eu.project.ttc.engines.cleaner.MaxSizeThresholdCleaner;
import eu.project.ttc.engines.cleaner.TermIndexThresholdCleaner;
import eu.project.ttc.engines.cleaner.TermIndexTopNCleaner;
import eu.project.ttc.engines.cleaner.TermProperty;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.engines.desc.TermSuiteCollection;
import eu.project.ttc.engines.desc.TermSuitePipelineException;
import eu.project.ttc.engines.exporter.CompoundExporter;
import eu.project.ttc.engines.exporter.EvalExporter;
import eu.project.ttc.engines.exporter.ExportVariationRuleExamples;
import eu.project.ttc.engines.exporter.JsonCasExporter;
import eu.project.ttc.engines.exporter.JsonExporter;
import eu.project.ttc.engines.exporter.SpotterTSVWriter;
import eu.project.ttc.engines.exporter.TBXExporter;
import eu.project.ttc.engines.exporter.TSVExporter;
import eu.project.ttc.engines.exporter.VariantEvalExporter;
import eu.project.ttc.engines.exporter.XmiCasExporter;
import eu.project.ttc.metrics.LogLikelihood;
import eu.project.ttc.models.OccurrenceStore;
import eu.project.ttc.models.OccurrenceType;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.index.MemoryTermIndex;
import eu.project.ttc.models.occstore.MemoryOccurrenceStore;
import eu.project.ttc.models.occstore.MongoDBOccurrenceStore;
import eu.project.ttc.readers.AbstractToTxtSaxHandler;
import eu.project.ttc.readers.CollectionDocument;
import eu.project.ttc.readers.EmptyCollectionReader;
import eu.project.ttc.readers.GenericXMLToTxtCollectionReader;
import eu.project.ttc.readers.QueueRegistry;
import eu.project.ttc.readers.StreamingCollectionReader;
import eu.project.ttc.readers.StringCollectionReader;
import eu.project.ttc.readers.TeiCollectionReader;
import eu.project.ttc.readers.TxtCollectionReader;
import eu.project.ttc.resources.BankResource;
import eu.project.ttc.resources.CharacterFootprintTermFilter;
import eu.project.ttc.resources.CompostInflectionRules;
import eu.project.ttc.resources.DictionaryResource;
import eu.project.ttc.resources.EvalTrace;
import eu.project.ttc.resources.GeneralLanguageResource;
import eu.project.ttc.resources.MateLemmatizerModel;
import eu.project.ttc.resources.MateTaggerModel;
import eu.project.ttc.resources.MemoryTermIndexManager;
import eu.project.ttc.resources.ObserverResource;
import eu.project.ttc.resources.ReferenceTermList;
import eu.project.ttc.resources.SimpleWordSet;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.resources.TermSuitePipelineObserver;
import eu.project.ttc.resources.YamlVariantRules;
import eu.project.ttc.stream.CasConsumer;
import eu.project.ttc.stream.ConsumerRegistry;
import eu.project.ttc.stream.DocumentProvider;
import eu.project.ttc.stream.DocumentStream;
import eu.project.ttc.stream.StreamingCasConsumer;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.OccurrenceBuffer;
import eu.project.ttc.utils.TermSuiteUtils;
import fr.free.rocheteau.jerome.engines.Stemmer;
import fr.univnantes.lina.uima.ChineseSegmenterResourceHelper;
import fr.univnantes.lina.uima.engines.ChineseSegmenter;
import fr.univnantes.lina.uima.engines.TreeTaggerWrapper;
import fr.univnantes.lina.uima.models.ChineseSegmentResource;
import fr.univnantes.lina.uima.models.TreeTaggerParameter;
import fr.univnantes.lina.uima.tkregex.ae.RegexListResource;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import uima.sandbox.filter.resources.DefaultFilterResource;
import uima.sandbox.filter.resources.FilterResource;
import uima.sandbox.lexer.engines.Lexer;
import uima.sandbox.lexer.resources.SegmentBank;
import uima.sandbox.lexer.resources.SegmentBankResource;
import uima.sandbox.mapper.engines.Mapper;
import uima.sandbox.mapper.resources.Mapping;
import uima.sandbox.mapper.resources.MappingResource;
/*
* TODO Integrate frozen expressions
* TODO Integrate Sonar runner
* TODO Add functional pipeline TestCases for each collection type and for different pipeline configs
*/
/**
* A collection reader and analysis engine (AE) aggregator, implemented as a
* chaining builder, that creates and runs a full TermSuite pipeline.
*
* @author Damien Cram
*
*/
public class TermSuitePipeline {
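/*
* Example usage: a minimal sketch assembled only from methods defined in this
* class. All paths below are hypothetical.
*
*   TermSuitePipeline.create("en")
*       .setCollection(TermSuiteCollection.TXT, "/path/to/corpus", "UTF-8")
*       .aeWordTokenizer()
*       .setTreeTaggerHome("/opt/treetagger")
*       .aeTreeTagger()
*       .haeTsvExporter("/path/to/terms.tsv")
*       .run();
*/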
/* The Logger */
private static final Logger LOGGER = LoggerFactory.getLogger(TermSuitePipeline.class);
/* ******************************
* MAIN PIPELINE PARAMETERS
*/
private OccurrenceStore occurrenceStore = new MemoryOccurrenceStore();
private Optional<? extends TermIndex> termIndex = Optional.absent();
private Lang lang;
private CollectionReaderDescription crDescription;
private String pipelineObserverName;
private AggregateBuilder aggregateBuilder;
private TermSuiteResourceHelper resFactory;
/*
* POS Tagger parameters
*/
private Optional<String> mateModelsPath = Optional.absent();
private Optional<String> treeTaggerPath = Optional.absent();
/*
* Regex Spotter params
*/
private boolean addSpottedAnnoToTermIndex = true;
@Deprecated
private boolean spotWithOccurrences = true;
private Optional<Boolean> logOverlappingRules = Optional.absent();
private Optional<String> postProcessingStrategy = Optional.absent();
private boolean enableSyntacticLabels = false;
@Deprecated
private Optional<String> syntacticRegexesFilePath = Optional.absent();
/*
* Contextualizer options
*/
private OccurrenceType contextualizeCoTermsType = OccurrenceType.SINGLE_WORD;
private boolean contextualizeWithTermClasses = false;
private int contextualizeWithCoOccurrenceFrequencyThreshhold = 1;
private String contextAssocRateMeasure = LogLikelihood.class.getName();
/*
* Cleaner properties
*/
private boolean keepVariantsWhileCleaning = false;
/*
* Compost Params
*/
private float alpha = 0.5f;
private float beta = 0.1f;
private float gamma = 0.1f;
private float delta = 0.3f;
private float compostScoreThreshold = 0.7f;
private int compostMinComponentSize = 3;
private int compostMaxComponentNum = 3;
private Object compostSegmentSimilarityThreshold = 0.7f;
/*
* Syntactic Variant Gatherer parameters
*/
@Deprecated
private Optional<String> yamlVariantRulesFilePath = Optional.absent();
/*
* Graphical Variant Gatherer parameters
*/
private Optional<Float> graphicalVariantSimilarityThreshold = Optional.absent();
/*
* Export Parameters
*/
private String exportFilteringRule = FilterRules.SpecificityThreshold.name();
private float exportFilteringThreshold = 0;
/* JSON */
private boolean exportJsonWithOccurrences = true;
private boolean exportJsonWithContext = false;
private boolean linkMongoStore = false;
/* TSV */
private String tsvExportProperties = "groupingKey,wr";
private boolean tsvWithVariantScores = false;
private boolean tsvWithHeaders = true;
/*
* Streaming parameters
*/
private Thread streamThread = null;
private DocumentProvider documentProvider;
/* *******************
* CONSTRUCTORS
*/
private TermSuitePipeline(String lang, String urlPrefix) {
this.lang = Lang.forName(lang);
if(urlPrefix == null)
this.resFactory = new TermSuiteResourceHelper(this.lang);
else
this.resFactory = new TermSuiteResourceHelper(this.lang, urlPrefix);
this.aggregateBuilder = new AggregateBuilder();
this.pipelineObserverName = PipelineObserver.class.getSimpleName() + "-" + Thread.currentThread().getId() + "-" + System.currentTimeMillis();
TermSuiteResourceManager.getInstance().register(pipelineObserverName, new TermSuitePipelineObserver(2,1));
}
public static TermSuitePipeline create(String lang) {
return new TermSuitePipeline(lang, null);
}
/**
*
* Starts a chaining {@link TermSuitePipeline} builder and overrides the default
* {@link URL} prefix (file:).
*
* @param lang
* The name of the language to process (see {@link Lang#forName(String)})
* @param urlPrefix
* The {@link URL} prefix to use for accessing TermSuite resources
* @return
* The chaining builder.
*
* @see TermSuiteResourceHelper#TermSuiteResourceHelper(Lang, String)
*/
public static TermSuitePipeline create(String lang, String urlPrefix) {
return new TermSuitePipeline(lang, urlPrefix);
}
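/*
* Example: override the default "file:" resource prefix so that TermSuite
* resources are fetched from another location (the URL below is illustrative
* only):
*
*   TermSuitePipeline p = TermSuitePipeline.create("fr", "http://example.org/termsuite-resources/");
*/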
public static TermSuitePipeline create(TermIndex termIndex, String urlPrefix) {
Preconditions.checkNotNull(termIndex.getName(), "The term index must have a name before it can be used in TermSuitePipeline");
TermSuitePipeline pipeline = create(termIndex.getLang().getCode(), urlPrefix);
pipeline.emptyCollection();
pipeline.setTermIndex(termIndex);
return pipeline;
}
/* *******************************
* RUNNERS
*/
/**
* Runs the pipeline with {@link SimplePipeline} on the {@link CollectionReader} that must have been defined.
*
* @throws TermSuitePipelineException if no {@link CollectionReader} has been declared on this pipeline
*/
public TermSuitePipeline run() {
checkCR();
runPipeline();
return this;
}
private void runPipeline() {
try {
SimplePipeline.runPipeline(this.crDescription, createDescription());
terminates();
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public DocumentStream stream(CasConsumer consumer) {
try {
String id = new BigInteger(130, new SecureRandom()).toString(8);
String casConsumerName = "pipeline-"+id+"-consumer";
ConsumerRegistry.getInstance().registerConsumer(casConsumerName, consumer);
String queueName = "pipeline-"+id+"-queue";
final BlockingQueue<CollectionDocument> q = QueueRegistry.getInstance().registerQueue(queueName, 10);
/*
* 1- Creates the streaming collection reader desc
*/
this.crDescription = CollectionReaderFactory.createReaderDescription(
StreamingCollectionReader.class,
StreamingCollectionReader.PARAM_LANGUAGE, this.lang.getCode(),
StreamingCollectionReader.PARAM_NAME, queueName,
StreamingCollectionReader.PARAM_QUEUE_NAME, queueName
);
/*
* 2- Aggregate the consumer AE
*/
AnalysisEngineDescription consumerAE = AnalysisEngineFactory.createEngineDescription(
StreamingCasConsumer.class,
StreamingCasConsumer.PARAM_CONSUMER_NAME, casConsumerName
);
this.aggregateBuilder.add(consumerAE);
/*
* 3- Starts the pipeline in a separate Thread
*/
this.streamThread = new Thread() {
@Override
public void run() {
runPipeline();
}
};
this.streamThread.start();
/*
* 4- Bind user inputs to the queue
*/
documentProvider = new DocumentProvider() {
@Override
public void provide(CollectionDocument doc) {
try {
q.put(doc);
} catch (InterruptedException e) {
LOGGER.warn("Interrupted while there were more documents waiting.");
}
}
};
return new DocumentStream(streamThread, documentProvider, consumer, queueName);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
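/*
* Streaming usage sketch. The CasConsumer implementation and the way
* CollectionDocument instances are built are application-specific and only
* assumed here; the sketch merely illustrates the control flow set up above
* (queue-backed reader, consumer AE appended to the aggregate, pipeline
* running in a background thread):
*
*   DocumentStream stream = pipeline.stream(myCasConsumer);
*   // Each CollectionDocument handed to the stream's DocumentProvider is put
*   // on the queue, processed by the background pipeline, and finally passed
*   // to myCasConsumer as a fully annotated CAS.
*/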
public Thread getStreamThread() {
return streamThread;
}
private void checkCR() {
if(crDescription == null)
throw new TermSuitePipelineException("No collection reader has been declared on this pipeline.");
}
private void terminates() {
if(termIndex.isPresent() && termIndex.get().getOccurrenceStore() instanceof MongoDBOccurrenceStore)
((MongoDBOccurrenceStore)termIndex.get().getOccurrenceStore()).close();
}
/**
* Registers a pipeline listener.
*
* @param pipelineListener
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline addPipelineListener(PipelineListener pipelineListener) {
TermSuiteResourceManager manager = TermSuiteResourceManager.getInstance();
((TermSuitePipelineObserver)manager.get(pipelineObserverName)).registerListener(pipelineListener);
return this;
}
/**
* Runs the pipeline with {@link SimplePipeline} without requiring a {@link CollectionReader}
* to be defined.
* @param cas the {@link JCas} on which the pipeline operates.
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline run(JCas cas) {
try {
SimplePipeline.runPipeline(cas, createDescription());
terminates();
return this;
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline setInlineString(String text) {
try {
this.crDescription = CollectionReaderFactory.createReaderDescription(
StringCollectionReader.class,
StringCollectionReader.PARAM_TEXT, text,
StringCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
);
return this;
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
* Creates a collection reader for this pipeline.
*
* @param termSuiteCollection
* @param collectionPath
* @param collectionEncoding
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setCollection(TermSuiteCollection termSuiteCollection, String collectionPath, String collectionEncoding) {
Preconditions.checkNotNull(termSuiteCollection);
Preconditions.checkNotNull(collectionPath);
Preconditions.checkNotNull(collectionEncoding);
try {
switch(termSuiteCollection) {
case TEI:
this.crDescription = CollectionReaderFactory.createReaderDescription(
TeiCollectionReader.class,
TeiCollectionReader.PARAM_INPUTDIR, collectionPath,
TxtCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
TeiCollectionReader.PARAM_ENCODING, collectionEncoding,
TeiCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
);
break;
case TXT:
this.crDescription = CollectionReaderFactory.createReaderDescription(
TxtCollectionReader.class,
TxtCollectionReader.PARAM_INPUTDIR, collectionPath,
TxtCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
TxtCollectionReader.PARAM_ENCODING, collectionEncoding,
TxtCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
);
break;
case EMPTY:
this.crDescription = CollectionReaderFactory.createReaderDescription(
EmptyCollectionReader.class
);
break;
default:
throw new IllegalArgumentException("No such collection: " + termSuiteCollection);
}
return this;
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
* Creates a collection reader of type {@link GenericXMLToTxtCollectionReader} for this pipeline.
*
* Requires a list of dropped tags and txt tags for collection parsing.
*
* @see AbstractToTxtSaxHandler
*
* @param termSuiteCollection
* @param collectionPath
* @param collectionEncoding
* @param droppedTags
* @param txtTags
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setCollection(TermSuiteCollection termSuiteCollection, String collectionPath, String collectionEncoding, String droppedTags, String txtTags) {
try {
this.crDescription = CollectionReaderFactory.createReaderDescription(
GenericXMLToTxtCollectionReader.class,
GenericXMLToTxtCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
GenericXMLToTxtCollectionReader.PARAM_DROPPED_TAGS, droppedTags,
GenericXMLToTxtCollectionReader.PARAM_TXT_TAGS, txtTags,
GenericXMLToTxtCollectionReader.PARAM_INPUTDIR, collectionPath,
GenericXMLToTxtCollectionReader.PARAM_ENCODING, collectionEncoding,
GenericXMLToTxtCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
);
return this;
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
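/*
* Example sketch for the XML-to-text reader. The collection-type constant and
* the tag-list syntax (assumed comma-separated) are assumptions; check
* GenericXMLToTxtCollectionReader for the exact parameter format:
*
*   pipeline.setCollection(TermSuiteCollection.XML, "/path/to/xml-corpus",
*       "UTF-8", "script,style", "p,head,title");
*/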
public TermSuitePipeline setResourcePath(String resourcePath) {
TermSuiteUtils.addToClasspath(resourcePath);
return this;
}
public TermSuitePipeline setContextAssocRateMeasure(String contextAssocRateMeasure) {
this.contextAssocRateMeasure = contextAssocRateMeasure;
return this;
}
public TermSuitePipeline emptyCollection() {
return setCollection(TermSuiteCollection.EMPTY, "", "UTF-8");
}
public AnalysisEngineDescription createDescription() {
try {
return this.aggregateBuilder.createAggregateDescription();
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline aeWordTokenizer() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
Lexer.class,
Lexer.PARAM_TYPE, "eu.project.ttc.types.WordAnnotation"
);
ExternalResourceDescription segmentBank = ExternalResourceFactory.createExternalResourceDescription(
SegmentBankResource.class,
resFactory.getSegmentBank().toString());
ExternalResourceFactory.bindResource(ae, SegmentBank.KEY_SEGMENT_BANK, segmentBank);
return aggregateAndReturn(ae, "Word tokenizer", 0);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
// private TermSuitePipeline aggregateAndReturn(AnalysisEngineDescription ae) {
// return aggregateAndReturn(ae, null, 0);
// }
private TermSuitePipeline aggregateAndReturn(AnalysisEngineDescription ae, String taskName, int ccWeight) {
Preconditions.checkNotNull(taskName);
// Add the pre-task observer
this.aggregateBuilder.add(aeObserver(taskName, ccWeight, PipelineObserver.TASK_STARTED));
// Add the ae itself
this.aggregateBuilder.add(ae);
// Add the post-task observer
this.aggregateBuilder.add(aeObserver(taskName, ccWeight, PipelineObserver.TASK_ENDED));
return this;
}
private AnalysisEngineDescription aeObserver(String taskName, int weight, String hook) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
PipelineObserver.class,
PipelineObserver.TASK_NAME, taskName,
PipelineObserver.HOOK, hook,
PipelineObserver.WEIGHT, weight
);
ExternalResourceFactory.bindResource(ae, resObserver());
return ae;
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline aeTreeTagger() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
TreeTaggerWrapper.class,
TreeTaggerWrapper.PARAM_ANNOTATION_TYPE, "eu.project.ttc.types.WordAnnotation",
TreeTaggerWrapper.PARAM_TAG_FEATURE, "tag",
TreeTaggerWrapper.PARAM_LEMMA_FEATURE, "lemma",
TreeTaggerWrapper.PARAM_UPDATE_ANNOTATION_FEATURES, true,
TreeTaggerWrapper.PARAM_TT_HOME_DIRECTORY, this.treeTaggerPath.get()
);
ExternalResourceFactory.createDependencyAndBind(
ae,
TreeTaggerParameter.KEY_TT_PARAMETER,
TreeTaggerParameter.class,
resFactory.getTTParameter().toString());
return aggregateAndReturn(ae, "POS Tagging (TreeTagger)", 0).ttLemmaFixer().ttNormalizer();
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline setMateModelPath(String path) {
Preconditions.checkArgument(Files.exists(Paths.get(path)), "Directory %s does not exist", path);
Preconditions.checkArgument(Files.isDirectory(Paths.get(path)), "File %s is not a directory", path);
this.mateModelsPath = Optional.of(path);
return this;
}
public TermSuitePipeline aeMateTaggerLemmatizer() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
MateLemmatizerTagger.class
);
Preconditions.checkState(mateModelsPath.isPresent(), "The path to mate models must be explicitly given. See method #setMateModelPath");
String lemmatizerModel = Paths.get(mateModelsPath.get(), resFactory.getMateLemmatizerModelFileName()).toString();
String taggerModel = Paths.get(mateModelsPath.get(), resFactory.getMateTaggerModelFileName()).toString();
Preconditions.checkArgument(Files.exists(Paths.get(lemmatizerModel)), "Lemmatizer model does not exist: %s", lemmatizerModel);
Preconditions.checkArgument(Files.exists(Paths.get(taggerModel)), "Tagger model does not exist: %s", taggerModel);
ExternalResourceFactory.createDependencyAndBind(
ae,
MateLemmatizerTagger.LEMMATIZER,
MateLemmatizerModel.class,
lemmatizerModel);
ExternalResourceFactory.createDependencyAndBind(
ae,
MateLemmatizerTagger.TAGGER,
MateTaggerModel.class,
taggerModel);
return aggregateAndReturn(ae, "POS Tagging (Mate)", 0)
.mateLemmaFixer()
.mateNormalizer();
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
* Defines the term properties that appear in the tsv export file
*
* @see #haeTsvExporter(String)
* @param properties
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setTsvExportProperties(TermProperty... properties) {
this.tsvExportProperties = Joiner.on(",").join(properties);
return this;
}
/**
* Exports the {@link TermIndex} in tsv format
*
* @see #setTsvExportProperties(TermProperty...)
* @param toFilePath
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline haeTsvExporter(String toFilePath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
TSVExporter.class,
TSVExporter.TO_FILE_PATH, toFilePath,
TSVExporter.TERM_PROPERTIES, this.tsvExportProperties,
TSVExporter.SHOW_HEADERS, tsvWithHeaders,
TSVExporter.SHOW_SCORES, tsvWithVariantScores
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting the terminology to " + toFilePath, 1);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
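/*
* Example: configure the TSV columns before adding the exporter, since the
* exporter AE reads the property list at creation time. The TermProperty
* constant names below are assumptions modeled on the default property list
* "groupingKey,wr":
*
*   pipeline.setTsvShowHeaders(true)
*       .setTsvExportProperties(TermProperty.GROUPING_KEY, TermProperty.FREQUENCY)
*       .haeTsvExporter("/path/to/terms.tsv");
*/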
/**
*
* Exports examples of matching pairs for each variation rule.
*
* @param toFilePath
* the file path where to write the examples for each variation rule
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline haeExportVariationRuleExamples(String toFilePath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
ExportVariationRuleExamples.class, ExportVariationRuleExamples.TO_FILE_PATH, toFilePath);
ExternalResourceFactory.bindResource(ae, resTermIndex());
ExternalResourceFactory.bindResource(ae, resSyntacticVariantRules());
return aggregateAndReturn(ae, "Exporting variation rules examples", 0);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* Exports all compound words of the terminology to the given file path.
*
* @param toFilePath
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline haeCompoundExporter(String toFilePath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
CompoundExporter.class,
CompoundExporter.TO_FILE_PATH,
toFilePath);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting compounds", 0);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline haeTbxExporter(String toFilePath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
TBXExporter.class,
TBXExporter.FILTERING_RULE, exportFilteringRule,
TBXExporter.FILTERING_THRESHOLD, exportFilteringThreshold,
TBXExporter.TO_FILE_PATH, toFilePath,
TBXExporter.LANGUAGE, this.lang
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting the terminology to " + toFilePath, 1);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline haeEvalExporter(String toFilePath, boolean withVariants) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
EvalExporter.class,
EvalExporter.FILTERING_RULE, FilterRules.SpecificityThreshold,
EvalExporter.FILTERING_THRESHOLD, 1.0f,
EvalExporter.TO_FILE_PATH, toFilePath,
EvalExporter.WITH_VARIANTS, withVariants
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting evaluation files", 0);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline setExportJsonWithOccurrences(boolean exportJsonWithOccurrences) {
this.exportJsonWithOccurrences = exportJsonWithOccurrences;
return this;
}
public TermSuitePipeline setExportJsonWithContext(boolean b) {
this.exportJsonWithContext = b;
return this;
}
public TermSuitePipeline haeJsonExporter(String toFilePath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
JsonExporter.class,
JsonExporter.TO_FILE_PATH, toFilePath,
JsonExporter.WITH_OCCURRENCE, exportJsonWithOccurrences,
JsonExporter.WITH_CONTEXTS, exportJsonWithContext,
JsonExporter.LINKED_MONGO_STORE, this.linkMongoStore
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting the terminology to " + toFilePath, 1);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* Creates a tsv output with:
* - the occurrence list of each term and their in-text contexts,
* - a json structure for the evaluation of each variant.
*
* @param toFilePath
* The output file path
* @param topN
* The number of variants to keep in the file
* @param maxVariantsPerTerm
* The maximum number of variants to eval for each term
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline haeVariantEvalExporter(String toFilePath, int topN, int maxVariantsPerTerm) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
VariantEvalExporter.class,
VariantEvalExporter.FILTERING_RULE, eu.project.ttc.engines.cleaner.FilterRules.SpecificityThreshold,
VariantEvalExporter.FILTERING_THRESHOLD, 1.0f,
VariantEvalExporter.TO_FILE_PATH, toFilePath,
VariantEvalExporter.TOP_N, topN,
VariantEvalExporter.NB_VARIANTS_PER_TERM, maxVariantsPerTerm
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting variant evaluation files", 0);
} catch (Exception e) {
throw new TermSuitePipelineException(e);
}
}
private void addParameters(AnalysisEngineDescription ae, Object... parameters) {
if(parameters.length % 2 == 1)
throw new IllegalArgumentException("Expecting an even number of arguments for key-value pairs: " + parameters.length);
// Sets each (key, value) pair as a configuration parameter on the AE
for(int i = 0; i < parameters.length; i += 2)
ae.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue((String) parameters[i], parameters[i + 1]);
}
/**
* Detects the extension relations between the terms of the {@link TermIndex},
* i.e. links each term to the terms that extend it, for terms of size >= 2.
*
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline aeExtensionDetector() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
ExtensionDetecter.class
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Detecting term extensions", 1);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
* Transforms the {@link TermIndex} into a flat one-n scored model.
*
*
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline aeScorer() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
ScorerAE.class
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
ExternalResourceFactory.bindResource(ae, resObserver());
return aggregateAndReturn(ae, ScorerAE.TASK_NAME, 1);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
* Merges the variants (only those that are extensions of the base term)
* of each term by graphical variation.
*
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline aeMerger() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
Merger.class,
Merger.SIMILARITY_THRESHOLD, 0.9f
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
ExternalResourceFactory.bindResource(ae, resObserver());
return aggregateAndReturn(ae, Merger.TASK_NAME, 1);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* Sets the {@link Term#setRank(int)} of all terms of the {@link TermIndex}
* given a {@link TermProperty}.
*
* @param property
* the {@link TermProperty} used to rank the terms
* @param desc
* true to rank in descending order
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline aeRanker(TermProperty property, boolean desc) {
Preconditions.checkArgument(property != TermProperty.RANK, "Cannot rank on property %s", TermProperty.RANK);
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
Ranker.class,
Ranker.RANKING_PROPERTY, property,
Ranker.DESC, desc
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
ExternalResourceFactory.bindResource(ae, resObserver());
return aggregateAndReturn(ae, Ranker.TASK_NAME, 1);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
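/*
* Example (the TermProperty constant name is an assumption): rank terms by
* specificity, highest first, so that rank 1 is the most domain-specific term:
*
*   pipeline.aeRanker(TermProperty.SPECIFICITY, true);
*/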
public TermSuitePipeline setExportFilteringRule(String exportFilteringRule) {
this.exportFilteringRule = exportFilteringRule;
return this;
}
public TermSuitePipeline setExportFilteringThreshold(float exportFilteringThreshold) {
this.exportFilteringThreshold = exportFilteringThreshold;
return this;
}
public TermSuitePipeline setTreeTaggerHome(String treeTaggerPath) {
this.treeTaggerPath = Optional.of(treeTaggerPath);
return this;
}
/**
* @deprecated Override the resources directly instead.
* @param syntacticRegexesFilePath
* @return
*/
@Deprecated
public TermSuitePipeline setSyntacticRegexesFilePath(String syntacticRegexesFilePath) {
this.syntacticRegexesFilePath = Optional.of(syntacticRegexesFilePath);
return this;
}
public TermSuitePipeline haeLogOverlappingRules() {
this.logOverlappingRules = Optional.of(true);
return this;
}
public TermSuitePipeline enableSyntacticLabels() {
this.enableSyntacticLabels = true;
return this;
}
/**
* @deprecated Override the resources directly instead.
* @param yamlVariantRulesFilePath
* @return
*/
@Deprecated
public TermSuitePipeline setYamlVariantRulesFilePath(String yamlVariantRulesFilePath) {
this.yamlVariantRulesFilePath = Optional.of(yamlVariantRulesFilePath);
return this;
}
public TermSuitePipeline setCompostCoeffs(float alpha, float beta, float gamma, float delta) {
Preconditions.checkArgument(alpha + beta + gamma + delta == 1.0f, "The sum of coeff must be 1.0");
this.alpha = alpha;
this.beta = beta;
this.gamma = gamma;
this.delta = delta;
return this;
}
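/*
* Example: 0.25f * 4 sums to exactly 1.0f, so the precondition holds. Note
* that the check uses exact float equality; coefficient sets whose float sum
* rounds away from 1.0f are rejected even if they sum to 1.0 on paper.
*
*   pipeline.setCompostCoeffs(0.25f, 0.25f, 0.25f, 0.25f);
*/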
public TermSuitePipeline setCompostMaxComponentNum(int compostMaxComponentNum) {
this.compostMaxComponentNum = compostMaxComponentNum;
return this;
}
public TermSuitePipeline setCompostMinComponentSize(int compostMinComponentSize) {
this.compostMinComponentSize = compostMinComponentSize;
return this;
}
public TermSuitePipeline setCompostScoreThreshold(float compostScoreThreshold) {
this.compostScoreThreshold = compostScoreThreshold;
return this;
}
public TermSuitePipeline setCompostSegmentSimilarityThreshold(
Object compostSegmentSimilarityThreshold) {
this.compostSegmentSimilarityThreshold = compostSegmentSimilarityThreshold;
return this;
}
public TermSuitePipeline aeCompostSplitter() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
CompostAE.class,
CompostAE.SCORE_THRESHOLD, this.compostScoreThreshold,
CompostAE.ALPHA, alpha,
CompostAE.BETA, beta,
CompostAE.GAMMA, gamma,
CompostAE.DELTA, delta,
CompostAE.MIN_COMPONENT_SIZE, this.compostMinComponentSize,
CompostAE.MAX_NUMBER_OF_COMPONENTS, this.compostMaxComponentNum,
CompostAE.SEGMENT_SIMILARITY_THRESHOLD, this.compostSegmentSimilarityThreshold
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
ExternalResourceFactory.bindResource(ae, resObserver());
ExternalResourceFactory.createDependencyAndBind(
ae,
CompostAE.LANGUAGE_DICO,
SimpleWordSet.class,
resFactory.getLanguageDico().toString());
ExternalResourceFactory.createDependencyAndBind(
ae,
CompostAE.INFLECTION_RULES,
CompostInflectionRules.class,
resFactory.getCompostInflectionRules().toString());
ExternalResourceFactory.createDependencyAndBind(
ae,
CompostAE.TRANSFORMATION_RULES,
CompostInflectionRules.class,
resFactory.getCompostTransformationRules().toString());
ExternalResourceFactory.createDependencyAndBind(
ae,
CompostAE.STOP_LIST,
SimpleWordSet.class,
resFactory.getCompostStopList().toString());
ExternalResourceFactory.createDependencyAndBind(
ae,
CompostAE.NEOCLASSICAL_PREFIXES,
SimpleWordSet.class,
resFactory.getNeoclassicalPrefixes().toString());
return aggregateAndReturn(ae, CompostAE.TASK_NAME, 2);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
public TermSuitePipeline haeCasStatCounter(String statName) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
CasStatCounter.class,
CasStatCounter.STAT_NAME, statName
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Counting stats ["+statName+"]", 0);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* Exports time progress to a TSV file.
*
* Columns are:
*
* - elapsed time from initialization in milliseconds
* - number of docs processed
* - cumulated size of data processed
* - number of terms in term index
* - number of {@link WordAnnotation} processed
*
*
*
* @param toFile
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline haeTraceTimePerf(String toFile) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
CasStatCounter.class,
CasStatCounter.DOCUMENT_PERIOD, 1,
CasStatCounter.TO_TRACE_FILE, toFile
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Exporting time performances to file " + toFile, 0);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* @see TermClassifier
* @param sortingProperty
* the term property used to order terms before they are classified.
* The first term of a class in this order is taken as the head of the class.
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline aeTermClassifier(TermProperty sortingProperty) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
TermClassifier.class,
TermClassifier.CLASSIFYING_PROPERTY, sortingProperty
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
return aggregateAndReturn(ae, "Classifying ters on property " + sortingProperty.toString().toLowerCase(), 0);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* @param refFileURI
* The path to the reference terminology
* @param outputFile
* The path to the output log file
* @param customLogHeader
* A custom string to add to the header of the output log file
* @param rFile
* The path to the output R file
* @param evalTraceName
* The name of the eval trace
* @param rtlWithVariants
* true if variants of the reference terminology should be kept during the eval
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline haeEval(String refFileURI, String outputFile, String customLogHeader, String rFile, String evalTraceName, boolean rtlWithVariants) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
EvalEngine.class,
EvalEngine.OUTPUT_LOG_FILE, outputFile,
EvalEngine.OUTPUT_R_FILE, rFile,
EvalEngine.CUSTOM_LOG_HEADER_STRING, customLogHeader,
// EvalEngine.LC_WITH_VARIANTS, extractedTerminoWithVariants,
EvalEngine.RTL_WITH_VARIANTS, rtlWithVariants
);
ExternalResourceFactory.bindResource(ae, resTermIndex());
ExternalResourceFactory.createDependencyAndBind(
ae,
EvalEngine.EVAL_TRACE,
EvalTrace.class,
evalTraceName);
ExternalResourceFactory.createDependencyAndBind(
ae,
EvalEngine.REFERENCE_LIST,
ReferenceTermList.class,
"file:" + refFileURI);
return aggregateAndReturn(ae, "Evaluating " + evalTraceName, 0);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* Stores occurrences in MongoDB.
*
* @param mongoDBUri
* the MongoDB connection URI
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setMongoDBOccurrenceStore(String mongoDBUri) {
this.occurrenceStore = new MongoDBOccurrenceStore(mongoDBUri);
return this;
}
/**
* @deprecated Use TermSuitePipeline#setOccurrenceStoreMode instead.
*
* @param activate
* @return
* This chaining {@link TermSuitePipeline} builder object
*
*/
@Deprecated
public TermSuitePipeline setSpotWithOccurrences(boolean activate) {
this.spotWithOccurrences = activate;
return this;
}
/**
* Configures {@link RegexSpotter}. If true,
* adds all spotted occurrences to the {@link TermIndex}.
*
* @see #aeRegexSpotter()
*
* @param addToTermIndex
* the value of the parameter
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setAddSpottedAnnoToTermIndex(boolean addToTermIndex) {
this.addSpottedAnnoToTermIndex = addToTermIndex;
return this;
}
/**
* Sets the post processing strategy for {@link RegexSpotter} analysis engine
*
* @see #aeRegexSpotter()
* @see OccurrenceBuffer#NO_CLEANING
* @see OccurrenceBuffer#KEEP_PREFIXES
* @see OccurrenceBuffer#KEEP_SUFFIXES
*
* @param postProcessingStrategy
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setPostProcessingStrategy(
String postProcessingStrategy) {
this.postProcessingStrategy = Optional.of(postProcessingStrategy);
return this;
}
/**
* Configures tsvExporter to (not) show headers on the
* first line.
*
* @param tsvWithHeaders
* the flag
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setTsvShowHeaders(boolean tsvWithHeaders) {
this.tsvWithHeaders = tsvWithHeaders;
return this;
}
/**
* Configures tsvExporter to (not) show variant scores with the
* "V" label
*
* @param tsvWithVariantScores
* the flag
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline setTsvShowScores(boolean tsvWithVariantScores) {
this.tsvWithVariantScores = tsvWithVariantScores;
return this;
}
public TermSuitePipeline haeJsonCasExporter(String toDirectoryPath ) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
JsonCasExporter.class,
JsonCasExporter.OUTPUT_DIRECTORY, toDirectoryPath
);
return aggregateAndReturn(ae, "Exporting CAS to JSON files", 0);
} catch(Exception e) {
throw new TermSuitePipelineException(e);
}
}
/**
*
* Configures the {@link JsonExporter} to not embed the occurrences
* in the json file, but to link the MongoDB occurrence store instead.
*
* @see #haeJsonExporter(String)
* @return
* This chaining {@link TermSuitePipeline} builder object
*/
public TermSuitePipeline linkMongoStore() {
this.linkMongoStore = true;
return this;
}
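/*
* Example (sketch; the connection URI is illustrative): store occurrences in
* MongoDB and have the JSON export reference that store instead of embedding
* occurrences inline:
*
*   pipeline.setMongoDBOccurrenceStore("mongodb://localhost:27017/termsuite")
*       .linkMongoStore()
*       .haeJsonExporter("/path/to/termino.json");
*/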
}