
/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package fr.univnantes.termsuite.framework;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.resource.ExternalResourceDescription;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.base.Preconditions;
import fr.free.rocheteau.jerome.engines.Stemmer;
import fr.univnantes.lina.uima.ChineseSegmenterResourceHelper;
import fr.univnantes.lina.uima.engines.ChineseSegmenter;
import fr.univnantes.lina.uima.engines.TreeTaggerWrapper;
import fr.univnantes.lina.uima.models.ChineseSegmentResource;
import fr.univnantes.lina.uima.models.TreeTaggerParameter;
import fr.univnantes.lina.uima.tkregex.ae.RegexListResource;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import fr.univnantes.termsuite.api.ResourceConfig;
import fr.univnantes.termsuite.framework.service.TermSuiteResourceManager;
import fr.univnantes.termsuite.model.Lang;
import fr.univnantes.termsuite.model.Tagger;
import fr.univnantes.termsuite.types.FixedExpression;
import fr.univnantes.termsuite.uima.DocumentLogger;
import fr.univnantes.termsuite.uima.PipelineClosingAE;
import fr.univnantes.termsuite.uima.PipelineListener;
import fr.univnantes.termsuite.uima.PipelineListenerAE;
import fr.univnantes.termsuite.uima.PipelineResourceMgrs;
import fr.univnantes.termsuite.uima.PreparationPipelineException;
import fr.univnantes.termsuite.uima.ResourceType;
import fr.univnantes.termsuite.uima.engines.preproc.CasStatCounter;
import fr.univnantes.termsuite.uima.engines.preproc.FixedExpressionSpotter;
import fr.univnantes.termsuite.uima.engines.preproc.MateLemmaFixer;
import fr.univnantes.termsuite.uima.engines.preproc.MateLemmatizerTagger;
import fr.univnantes.termsuite.uima.engines.preproc.RegexSpotter;
import fr.univnantes.termsuite.uima.engines.preproc.StringRegexFilter;
import fr.univnantes.termsuite.uima.engines.preproc.TreeTaggerLemmaFixer;
import fr.univnantes.termsuite.uima.resources.preproc.CharacterFootprintTermFilter;
import fr.univnantes.termsuite.uima.resources.preproc.FixedExpressionResource;
import fr.univnantes.termsuite.uima.resources.preproc.MateLemmatizerModel;
import fr.univnantes.termsuite.uima.resources.preproc.MateTaggerModel;
import fr.univnantes.termsuite.utils.TermHistory;
import uima.sandbox.filter.resources.DefaultFilterResource;
import uima.sandbox.lexer.engines.Lexer;
import uima.sandbox.lexer.resources.SegmentBank;
import uima.sandbox.lexer.resources.SegmentBankResource;
import uima.sandbox.mapper.engines.Mapper;
import uima.sandbox.mapper.resources.Mapping;
import uima.sandbox.mapper.resources.MappingResource;
/**
* A builder and launcher for the preprocessing (preparation) pipeline.
*
* @author Damien Cram
*
*/
public class PreprocessingPipelineBuilder {
private final String pipelineId = UUID.randomUUID().toString();
/* The UIMA aggregate builder used to assemble the pipeline */
private AggregateBuilder aggregateBuilder;
/* ******************************
* MAIN PIPELINE PARAMETERS
*/
private Lang lang;
private Optional<Long> nbDocuments = Optional.empty();
private Optional<Long> corpusSize = Optional.empty();
private List<AnalysisEngineDescription> customAEs = new ArrayList<>();
private Path taggerPath;
private List<PipelineListener> userPipelineListeners = new ArrayList<>();
private ResourceConfig resourceConfig = new ResourceConfig();
private Tagger tagger = Tagger.TREE_TAGGER;
private boolean fixedExpressionEnabled = false;
private boolean documentLoggingEnabled = true;
/* *******************
* CONSTRUCTORS
*/
private PreprocessingPipelineBuilder(Lang lang, Path taggerPath) {
this.aggregateBuilder = new AggregateBuilder();
this.lang = lang;
this.taggerPath = taggerPath;
}
/**
*
* Starts a chaining {@link PreprocessingPipelineBuilder} builder.
*
* @param lang
* The language of the documents to be preprocessed.
* @param taggerPath
* The path to the POS tagger installation (the TreeTagger home directory,
* or the Mate model directory when the Mate tagger is selected).
* @return
* The chaining builder.
*
*/
public static PreprocessingPipelineBuilder create(Lang lang, Path taggerPath) {
return new PreprocessingPipelineBuilder(lang, taggerPath);
}
/**
* Registers a pipeline listener.
*
* @param pipelineListener
* The listener to register with this pipeline.
* @return
* This chaining {@link PreprocessingPipelineBuilder} builder object
*/
public PreprocessingPipelineBuilder addPipelineListener(PipelineListener pipelineListener) {
this.userPipelineListeners.add(pipelineListener);
return this;
}
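/**
* Releases the resources registered for this pipeline
* (see {@link PipelineResourceMgrs}). Intended to be called once
* the pipeline has finished running.
*/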
public void terminates() {
PipelineResourceMgrs.clearPipeline(pipelineId);
}
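/**
* Assembles all configured analysis engines (document logger, tokenizer,
* POS tagger/lemmatizer, URL filter, stemmer, regex spotter, optional
* fixed-expression spotter and custom AEs) into a single aggregate UIMA
* analysis engine description.
*
* <p>A minimal usage sketch (the TreeTagger install path below is purely
* illustrative):</p>
*
* <pre>{@code
* AnalysisEngineDescription pipeline = PreprocessingPipelineBuilder
*         .create(Lang.EN, Paths.get("/opt/treetagger"))
*         .create();
* }</pre>
*
* @return
* The aggregate {@link AnalysisEngineDescription} for the whole preprocessing pipeline.
*/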
public AnalysisEngineDescription create() {
if(documentLoggingEnabled)
aeDocumentLogger(nbDocuments.orElse(0l), corpusSize.orElse(0l));
if(lang == Lang.ZH)
aeChineseTokenizer();
else
aeWordTokenizer();
switch (tagger) {
case TREE_TAGGER:
aeTreeTagger(taggerPath.toString());
break;
case MATE:
aeMateTaggerLemmatizer(taggerPath.toString());
break;
default:
throw new UnsupportedOperationException("Unknown tagger: " + tagger);
}
aeUrlFilter();
aeStemmer();
aeRegexSpotter();
if(fixedExpressionEnabled)
aeFixedExpressionSpotter();
for(AnalysisEngineDescription ae:customAEs)
aggregateAndReturn(ae);
haeCasStatCounter("At end of preparation");
aePipelineListener();
// aePipelineClosingAE();
try {
return this.aggregateBuilder.createAggregateDescription();
} catch (ResourceInitializationException e) {
throw new PreparationPipelineException(e);
}
}
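/**
* Registers a {@link TermHistory} with this pipeline's resource manager,
* so that downstream engines can trace the processing of watched terms.
*
* @param history
* The term history to register.
* @return
* This chaining {@link PreprocessingPipelineBuilder} builder object
*/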
public PreprocessingPipelineBuilder setHistory(TermHistory history) {
PipelineResourceMgrs.getResourceMgr(pipelineId).register(TermHistory.class, history);
return this;
}
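/*
* Adds the word tokenizer AE (uima.sandbox Lexer), bound to the
* language-specific segment bank resource.
*/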
private PreprocessingPipelineBuilder aeWordTokenizer() {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
Lexer.class,
Lexer.PARAM_TYPE, "fr.univnantes.termsuite.types.WordAnnotation"
);
ExternalResourceDescription segmentBank = ExternalResourceFactory.createExternalResourceDescription(
SegmentBankResource.class,
getResourceURL(ResourceType.SEGMENT_BANK)
);
ExternalResourceFactory.bindResource(
ae,
SegmentBank.KEY_SEGMENT_BANK,
segmentBank);
return aggregateAndReturn(ae);
} catch (Exception e) {
throw new PreparationPipelineException(e);
}
}
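/*
* Appends the given AE description to the aggregate under construction
* and returns this builder for chaining.
*/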
private PreprocessingPipelineBuilder aggregateAndReturn(AnalysisEngineDescription ae) {
this.aggregateBuilder.add(ae);
return this;
}
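/*
* Adds the TreeTagger wrapper AE, configured with the TreeTagger home
* directory and bound to the language-specific TreeTagger parameter file,
* followed by the lemma fixer and normalizer engines.
*/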
private PreprocessingPipelineBuilder aeTreeTagger(String taggerPath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
TreeTaggerWrapper.class,
TreeTaggerWrapper.PARAM_ANNOTATION_TYPE, "fr.univnantes.termsuite.types.WordAnnotation",
TreeTaggerWrapper.PARAM_TAG_FEATURE, "tag",
TreeTaggerWrapper.PARAM_LEMMA_FEATURE, "lemma",
TreeTaggerWrapper.PARAM_UPDATE_ANNOTATION_FEATURES, true,
TreeTaggerWrapper.PARAM_TT_HOME_DIRECTORY, taggerPath
);
ExternalResourceDescription ttParam = ExternalResourceFactory.createExternalResourceDescription(
TreeTaggerParameter.class,
getResourceURL(ResourceType.TREETAGGER_CONFIG, Tagger.TREE_TAGGER)
);
ExternalResourceFactory.bindResource(
ae,
TreeTaggerParameter.KEY_TT_PARAMETER,
ttParam
);
return aggregateAndReturn(ae).ttLemmaFixer().ttNormalizer();
} catch (Exception e) {
throw new PreparationPipelineException(e);
}
}
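/**
* Sets the resource configuration (list of URL prefixes) used to resolve
* linguistic resources before falling back to the classpath.
*
* @param resourceConfig
* The resource configuration to use.
* @return
* This chaining {@link PreprocessingPipelineBuilder} builder object
*/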
public PreprocessingPipelineBuilder setResourceConfig(ResourceConfig resourceConfig) {
this.resourceConfig = resourceConfig;
return this;
}
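/**
* Resolves the URL of a linguistic resource for the current language,
* trying the configured URL prefixes first and falling back to the
* resource bundled on the classpath.
*
* @param resourceType
* The type of resource to resolve.
* @return
* The resolved resource URL.
*/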
public URL getResourceURL(ResourceType resourceType) {
return getResourceURL(resourceType, null);
}
public URL getResourceURL(ResourceType resourceType, Tagger tagger) {
for(URL urlPrefix:resourceConfig.getURLPrefixes()) {
URL candidateURL = resourceType.fromUrlPrefix(urlPrefix, lang, tagger);
if(TermSuiteResourceManager.resourceExists(resourceType, urlPrefix, candidateURL))
return candidateURL;
}
return resourceType.fromClasspath(lang, tagger);
}
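/**
* Checks that the given Mate model directory exists and is a directory.
*
* @param path
* The path to the directory containing the Mate lemmatizer and tagger models.
* @return
* This chaining {@link PreprocessingPipelineBuilder} builder object
*/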
public PreprocessingPipelineBuilder setMateModelPath(String path) {
Preconditions.checkArgument(Files.exists(Paths.get(path)), "Directory %s does not exist", path);
Preconditions.checkArgument(Files.isDirectory(Paths.get(path)), "File %s is not a directory", path);
return this;
}
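/*
* Adds the Mate tagger/lemmatizer AE, binding the language-specific
* lemmatizer and POS tagger models found under the given model directory,
* followed by the Mate lemma fixer and normalizer engines.
*/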
private PreprocessingPipelineBuilder aeMateTaggerLemmatizer(String mateModelPath) {
try {
AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
MateLemmatizerTagger.class
);
String lemmatizerModel = Paths.get(mateModelPath, "mate-lemma-"+lang.getCode()+".model").toString();
String taggerModel = Paths.get(mateModelPath, "mate-pos-"+lang.getCode()+".model").toString();
Preconditions.checkArgument(Files.exists(Paths.get(lemmatizerModel)), "Lemmatizer model does not exist: %s", lemmatizerModel);
Preconditions.checkArgument(Files.exists(Paths.get(taggerModel)), "Tagger model does not exist: %s", taggerModel);
ExternalResourceFactory.createDependencyAndBind(
ae,
MateLemmatizerTagger.LEMMATIZER,
MateLemmatizerModel.class,
lemmatizerModel);
ExternalResourceFactory.createDependencyAndBind(
ae,
MateLemmatizerTagger.TAGGER,
MateTaggerModel.class,
taggerModel);
return aggregateAndReturn(ae)
.mateLemmaFixer()
.mateNormalizer();
} catch (Exception e) {
throw new PreparationPipelineException(e);
}
}
private void addParameters(AnalysisEngineDescription ae, Object... parameters) {
if(parameters.length % 2 == 1)
throw new IllegalArgumentException("Expecting even number of arguements for key-value pairs: " + parameters.length);
for(int i=0; i