All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fr.univnantes.termsuite.framework.PreprocessingPipelineBuilder Maven / Gradle / Ivy

Go to download

A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment

There is a newer version: 3.0.10
Show newest version
/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package fr.univnantes.termsuite.framework;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.UUID;

import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.resource.ExternalResourceDescription;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.base.Preconditions;

import fr.free.rocheteau.jerome.engines.Stemmer;
import fr.univnantes.lina.uima.ChineseSegmenterResourceHelper;
import fr.univnantes.lina.uima.engines.ChineseSegmenter;
import fr.univnantes.lina.uima.engines.TreeTaggerWrapper;
import fr.univnantes.lina.uima.models.ChineseSegmentResource;
import fr.univnantes.lina.uima.models.TreeTaggerParameter;
import fr.univnantes.lina.uima.tkregex.ae.RegexListResource;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import fr.univnantes.termsuite.api.ResourceConfig;
import fr.univnantes.termsuite.framework.service.TermSuiteResourceManager;
import fr.univnantes.termsuite.model.Lang;
import fr.univnantes.termsuite.model.Tagger;
import fr.univnantes.termsuite.types.FixedExpression;
import fr.univnantes.termsuite.uima.DocumentLogger;
import fr.univnantes.termsuite.uima.PipelineClosingAE;
import fr.univnantes.termsuite.uima.PipelineListener;
import fr.univnantes.termsuite.uima.PipelineListenerAE;
import fr.univnantes.termsuite.uima.PipelineResourceMgrs;
import fr.univnantes.termsuite.uima.PreparationPipelineException;
import fr.univnantes.termsuite.uima.ResourceType;
import fr.univnantes.termsuite.uima.engines.preproc.CasStatCounter;
import fr.univnantes.termsuite.uima.engines.preproc.FixedExpressionSpotter;
import fr.univnantes.termsuite.uima.engines.preproc.MateLemmaFixer;
import fr.univnantes.termsuite.uima.engines.preproc.MateLemmatizerTagger;
import fr.univnantes.termsuite.uima.engines.preproc.RegexSpotter;
import fr.univnantes.termsuite.uima.engines.preproc.StringRegexFilter;
import fr.univnantes.termsuite.uima.engines.preproc.TreeTaggerLemmaFixer;
import fr.univnantes.termsuite.uima.resources.preproc.CharacterFootprintTermFilter;
import fr.univnantes.termsuite.uima.resources.preproc.FixedExpressionResource;
import fr.univnantes.termsuite.uima.resources.preproc.MateLemmatizerModel;
import fr.univnantes.termsuite.uima.resources.preproc.MateTaggerModel;
import fr.univnantes.termsuite.utils.TermHistory;
import uima.sandbox.filter.resources.DefaultFilterResource;
import uima.sandbox.lexer.engines.Lexer;
import uima.sandbox.lexer.resources.SegmentBank;
import uima.sandbox.lexer.resources.SegmentBankResource;
import uima.sandbox.mapper.engines.Mapper;
import uima.sandbox.mapper.resources.Mapping;
import uima.sandbox.mapper.resources.MappingResource;

/**
 * A builder and launcher for the preprocessing (preparation) pipeline.
 *  
 * @author Damien Cram
 *
 */
public class PreprocessingPipelineBuilder {
	// Unique id used to scope per-pipeline resources in PipelineResourceMgrs.
	private final String pipelineId = UUID.randomUUID().toString();

	/* The aggregate builder accumulating all engine descriptions of the pipeline
	 * (the previous comment wrongly labeled this field "The Logger"). */
	private AggregateBuilder aggregateBuilder;
	
	/* ******************************
	 * MAIN PIPELINE PARAMETERS
	 */
	private Lang lang;
	// Raw types replaced with proper generics — no behavior change (erasure),
	// but restores compile-time type safety (Effective Java, "don't use raw types").
	private Optional<Long> nbDocuments = Optional.empty();
	private Optional<Long> corpusSize = Optional.empty();
	private List<AnalysisEngineDescription> customAEs = new ArrayList<>();
	private Path taggerPath;
	private List<PipelineListener> userPipelineListeners = new ArrayList<>();
	private ResourceConfig resourceConfig = new ResourceConfig();
	// Defaults: TreeTagger as POS tagger, no fixed-expression spotting, document logging on.
	private Tagger tagger = Tagger.TREE_TAGGER;
	private boolean fixedExpressionEnabled = false;
	private boolean documentLoggingEnabled = true;

	
	/* *******************
	 * CONSTRUCTORS
	 */
	/**
	 * Builds a preprocessing pipeline builder for the given language.
	 * Private: instances are obtained through {@link #create(Lang, Path)}.
	 * 
	 * @param lang
	 * 			the language of the documents to be processed
	 * @param taggerPath
	 * 			path to the POS tagger installation directory
	 */
	private PreprocessingPipelineBuilder(Lang lang, Path taggerPath) {
		this.aggregateBuilder = new AggregateBuilder();
		this.lang = lang;
		this.taggerPath = taggerPath;
	}


	/**
	 * 
	 * Starts a chaining {@link PreprocessingPipelineBuilder} builder. 
	 * 
	 * @param lang
	 * 			The language of the documents to be processed.
	 * @param taggerPath
	 * 			The path to the POS tagger installation directory.
	 * @return
	 * 			The chaining builder.
	 * 
	 */
	public static PreprocessingPipelineBuilder create(Lang lang, Path taggerPath) {
		return new PreprocessingPipelineBuilder(lang, taggerPath);
	}
	

	/**
	 * Registers a user-supplied pipeline listener that will be notified
	 * of pipeline events.
	 * 
	 * @param pipelineListener the listener to register
	 * @return
	 * 		This chaining {@link PreprocessingPipelineBuilder} builder object
	 */
	public PreprocessingPipelineBuilder addPipelineListener(PipelineListener pipelineListener) {
		userPipelineListeners.add(pipelineListener);
		return this;
	}
	
	/**
	 * Releases all pipeline-scoped resources registered under this
	 * builder's unique pipeline id.
	 */
	public void terminates() {
		PipelineResourceMgrs.clearPipeline(this.pipelineId);
	}

	/**
	 * Builds the aggregate {@link AnalysisEngineDescription} for the whole
	 * preprocessing pipeline, assembling the engines in order: document
	 * logging, tokenization, POS tagging/lemmatization, URL filtering,
	 * stemming, term spotting, optional fixed-expression spotting, user
	 * custom engines, statistics, and pipeline listeners.
	 * 
	 * @return the aggregate analysis engine description
	 * @throws PreparationPipelineException if UIMA fails to create the aggregate
	 * @throws UnsupportedOperationException if the configured tagger is unknown
	 */
	public AnalysisEngineDescription create() {
		if(documentLoggingEnabled)
			aeDocumentLogger(nbDocuments.orElse(0L), corpusSize.orElse(0L));
		
		// Chinese needs a dedicated segmenter; all other languages use the word tokenizer.
		if(lang == Lang.ZH)
			aeChineseTokenizer();
		else
			aeWordTokenizer();
		
		switch (tagger) {
		case TREE_TAGGER:
			aeTreeTagger(taggerPath.toString());
			break;
		case MATE:
			aeMateTaggerLemmatizer(taggerPath.toString());
			// BUGFIX: missing break caused a fall-through into default, so
			// selecting the MATE tagger always threw after adding the engine.
			break;
		default:
			throw new UnsupportedOperationException("Unknown tagger: " + tagger);
		}
		
		aeUrlFilter();
		
		aeStemmer();
		
		aeRegexSpotter();
		
		if(fixedExpressionEnabled) 
			aeFixedExpressionSpotter();
		
		// Append user-provided custom engines after the standard chain.
		for(AnalysisEngineDescription ae:customAEs)
			aggregateAndReturn(ae);
		
		
		haeCasStatCounter("At end of preparation");
		
		aePipelineListener();

//		aePipelineClosingAE();
		
		try {
			return this.aggregateBuilder.createAggregateDescription();
		} catch (ResourceInitializationException e) {
			throw new PreparationPipelineException(e);
		}
		
	}
	

	/**
	 * Registers a {@link TermHistory} instance as a pipeline-scoped resource
	 * for this builder's pipeline id.
	 * 
	 * @param history the term history to register
	 * @return this chaining builder
	 */
	public PreprocessingPipelineBuilder setHistory(TermHistory history) {
		PipelineResourceMgrs
			.getResourceMgr(pipelineId)
			.register(TermHistory.class, history);
		return this;
	}

	/**
	 * Adds the word tokenizer engine (Lexer) to the pipeline, producing
	 * WordAnnotation annotations, and binds it to the language's segment
	 * bank resource resolved via {@link #getResourceURL(ResourceType)}.
	 * 
	 * @return this chaining builder
	 * @throws PreparationPipelineException if the engine or resource
	 *         description cannot be created or bound
	 */
	private PreprocessingPipelineBuilder aeWordTokenizer() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					Lexer.class, 
					Lexer.PARAM_TYPE, "fr.univnantes.termsuite.types.WordAnnotation"
				);
			
			// Segment bank: language-specific tokenization segments resource.
			ExternalResourceDescription	segmentBank = ExternalResourceFactory.createExternalResourceDescription(
					SegmentBankResource.class,
					getResourceURL(ResourceType.SEGMENT_BANK)
				);
			
			

					
			ExternalResourceFactory.bindResource(
					ae, 
					SegmentBank.KEY_SEGMENT_BANK, 
					segmentBank);

			return aggregateAndReturn(ae);
		} catch (Exception e) {
			// Wrap any UIMA initialization failure in the pipeline's own exception type.
			throw new PreparationPipelineException(e);
		}
		
	}

	/**
	 * Appends the given engine description to the pipeline aggregate and
	 * returns this builder for chaining.
	 * 
	 * @param ae the engine description to append
	 * @return this chaining builder
	 */
	private PreprocessingPipelineBuilder aggregateAndReturn(AnalysisEngineDescription ae) {
		aggregateBuilder.add(ae);
		return this;
	}

	/**
	 * Adds the TreeTagger POS-tagging/lemmatization engine to the pipeline,
	 * bound to the language's TreeTagger parameter resource, then chains the
	 * TreeTagger-specific lemma fixer and normalizer engines.
	 * 
	 * @param taggerPath path to the TreeTagger installation directory
	 * @return this chaining builder
	 * @throws PreparationPipelineException if the engine or resource
	 *         description cannot be created or bound
	 */
	private PreprocessingPipelineBuilder aeTreeTagger(String taggerPath) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					TreeTaggerWrapper.class, 
					TreeTaggerWrapper.PARAM_ANNOTATION_TYPE, "fr.univnantes.termsuite.types.WordAnnotation",
					TreeTaggerWrapper.PARAM_TAG_FEATURE, "tag",
					TreeTaggerWrapper.PARAM_LEMMA_FEATURE, "lemma",
					TreeTaggerWrapper.PARAM_UPDATE_ANNOTATION_FEATURES, true,
					TreeTaggerWrapper.PARAM_TT_HOME_DIRECTORY, taggerPath
				);
			
			// Language-specific TreeTagger parameter file, resolved from configured
			// URL prefixes with classpath fallback.
			ExternalResourceDescription ttParam = ExternalResourceFactory.createExternalResourceDescription(
					TreeTaggerParameter.class,
					getResourceURL(ResourceType.TREETAGGER_CONFIG, Tagger.TREE_TAGGER)
				);
			
			ExternalResourceFactory.bindResource(
					ae,
					TreeTaggerParameter.KEY_TT_PARAMETER, 
					ttParam 
				);

			// Post-processing: fix lemmas produced by TreeTagger, then normalize.
			return aggregateAndReturn(ae).ttLemmaFixer().ttNormalizer();
		} catch (Exception e) {
			// Wrap any UIMA initialization failure in the pipeline's own exception type.
			throw new PreparationPipelineException(e);
		}
	}




	
	/**
	 * Overrides the resource configuration used to resolve linguistic
	 * resources (see {@link #getResourceURL(ResourceType, Tagger)}).
	 * 
	 * @param resourceConfig the resource configuration to use
	 * @return this chaining builder
	 */
	public PreprocessingPipelineBuilder setResourceConfig(ResourceConfig resourceConfig) {
		this.resourceConfig = resourceConfig;
		return this;
	}
	
	/**
	 * Resolves the URL of a resource that does not depend on a tagger flavour.
	 * 
	 * @param resourceType the type of resource to resolve
	 * @return the resolved resource URL
	 */
	public URL getResourceURL(ResourceType resourceType) {
		return this.getResourceURL(resourceType, null);
	}
	
	/**
	 * Resolves the URL of a resource, trying each configured URL prefix in
	 * order and falling back to the classpath when none of them provides
	 * the resource.
	 * 
	 * @param resourceType the type of resource to resolve
	 * @param tagger the tagger flavour, or {@code null} when irrelevant
	 * @return the resolved resource URL
	 */
	public URL getResourceURL(ResourceType resourceType, Tagger tagger) {
		for (URL prefix : resourceConfig.getURLPrefixes()) {
			URL candidate = resourceType.fromUrlPrefix(prefix, lang, tagger);
			boolean found = TermSuiteResourceManager.resourceExists(resourceType, prefix, candidate);
			if (found)
				return candidate;
		}
		// No configured prefix had the resource: use the embedded classpath one.
		return resourceType.fromClasspath(lang, tagger);
	}


	/**
	 * Validates that the given Mate model path exists and is a directory.
	 * 
	 * NOTE(review): this method validates the path but never stores it in
	 * any field — the value is discarded after the checks. This looks like
	 * a bug: callers presumably expect the path to be used later by the
	 * Mate lemmatizer/tagger ({@code aeMateTaggerLemmatizer} receives
	 * {@code taggerPath} instead). Verify against call sites.
	 * 
	 * @param path path to the directory containing the Mate models
	 * @return this chaining builder
	 * @throws IllegalArgumentException if the path does not exist or is not a directory
	 */
	public PreprocessingPipelineBuilder setMateModelPath(String path) {
		Preconditions.checkArgument(Files.exists(Paths.get(path)), "Directory %s does not exist", path);
		Preconditions.checkArgument(Files.isDirectory(Paths.get(path)), "File %s is not a directory", path);
		return this;
	}

	
	/**
	 * Adds the Mate POS-tagging/lemmatization engine to the pipeline, binding
	 * it to the language-specific lemmatizer and tagger model files found
	 * under the given model directory, then chains the Mate-specific lemma
	 * fixer and normalizer engines.
	 * 
	 * @param mateModelPath directory expected to contain
	 *        {@code mate-lemma-<lang>.model} and {@code mate-pos-<lang>.model}
	 * @return this chaining builder
	 * @throws IllegalArgumentException if either model file is missing
	 * @throws PreparationPipelineException if the engine description cannot
	 *         be created or the resources cannot be bound
	 */
	private PreprocessingPipelineBuilder aeMateTaggerLemmatizer(String mateModelPath)  {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					MateLemmatizerTagger.class
				);
			
			// Model file names follow the convention mate-{lemma|pos}-<langCode>.model.
			String lemmatizerModel = Paths.get(mateModelPath, "mate-lemma-"+lang.getCode()+".model").toString();
			String taggerModel = Paths.get(mateModelPath, "mate-pos-"+lang.getCode()+".model").toString();
			Preconditions.checkArgument(Files.exists(Paths.get(lemmatizerModel)), "Lemmatizer model does not exist: %s", lemmatizerModel);
			Preconditions.checkArgument(Files.exists(Paths.get(taggerModel)), "Tagger model does not exist: %s", taggerModel);
	
			ExternalResourceFactory.createDependencyAndBind(
					ae,
					MateLemmatizerTagger.LEMMATIZER, 
					MateLemmatizerModel.class, 
					lemmatizerModel);
			ExternalResourceFactory.createDependencyAndBind(
					ae,
					MateLemmatizerTagger.TAGGER, 
					MateTaggerModel.class, 
					taggerModel);
	
			// Post-processing: fix lemmas produced by Mate, then normalize.
			return aggregateAndReturn(ae)
					.mateLemmaFixer()
					.mateNormalizer();
		} catch (Exception e) {
			// Wrap any UIMA initialization failure in the pipeline's own exception type.
			throw new PreparationPipelineException(e);
		}
	}
	
	private void addParameters(AnalysisEngineDescription ae, Object... parameters) {
		if(parameters.length % 2 == 1)
			throw new IllegalArgumentException("Expecting even number of arguements for key-value pairs: " + parameters.length);
		for(int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy