All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.uima.namefind.NameFinderTrainer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */ 

package opennlp.uima.namefind;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import opennlp.maxent.GIS;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.ContainingConstraint;
import opennlp.uima.util.OpennlpUtil;
import opennlp.uima.util.UimaUtil;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

/**
 * OpenNLP NameFinder trainer.
 * 

* Mandatory parameters *

* * * * * * *
Type Name Description
String opennlp.uima.ModelName The name of the model file
String opennlp.uima.Language The language code
String opennlp.uima.SentenceType The full name of the sentence type
String opennlp.uima.TokenType The full name of the token type
String opennlp.uima.NameType The full name of the name type
* * Optional parameters * * * * * * *
Type Name Description
String opennlp.uima.AdditionalTrainingDataFile Training file which contains additional data in the OpenNLP format
String opennlp.uima.AdditionalTrainingDataEncoding Encoding of the additional training data
Integer opennlp.uima.Cutoff (default=5)
Integer opennlp.uima.Iterations (default=100)
*

*/ public final class NameFinderTrainer extends CasConsumer_ImplBase { private Logger logger; private String modelPath; private String additionalTrainingDataFile; private String additionalTrainingDataEncoding; private Type sentenceType; private Type tokenType; private Type nameType; private String language; private int cutoff; private int iterations; // TODO: Keeping all events in memory limits the size of the training corpus // Possible solutions: // - Write all events to disk // - Directly start indexing with a blocking sample stream, the indexer will then write everything // to disk or could store the events much more space efficient in memory private List nameFinderSamples = new ArrayList(); /** * Initializes the current instance. */ public void initialize() throws ResourceInitializationException { super.initialize(); logger = getUimaContext().getLogger(); if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Initializing the OpenNLP Name Trainer."); } modelPath = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.MODEL_PARAMETER); language = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.LANGUAGE_PARAMETER); cutoff = CasConsumerUtil.getOptionalIntegerParameter(getUimaContext(), UimaUtil.CUTOFF_PARAMETER, 5); iterations = CasConsumerUtil.getOptionalIntegerParameter(getUimaContext(), UimaUtil.ITERATIONS_PARAMETER, 100); additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter( getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE); // If the additional training data is specified, the encoding must be provided! if (additionalTrainingDataFile != null) { additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter( getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING); } } /** * Initialize the current instance with the given type system. 
*/ public void typeSystemInit(TypeSystem typeSystem) throws ResourceInitializationException { String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.SENTENCE_TYPE_PARAMETER); sentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.TOKEN_TYPE_PARAMETER); tokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); String nameTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), NameFinder.NAME_TYPE_PARAMETER); nameType = CasConsumerUtil.getType(typeSystem, nameTypeName); } /** * Creates a {@link List} from an {@link Iterator}. * * @param * @param it * @return */ private static List iteratorToList(Iterator it) { List list = new LinkedList(); while (it.hasNext()) { list.add(it.next()); } return list; } private static boolean isContaining(AnnotationFS annotation, AnnotationFS containtedAnnotation) { boolean isStartContaining = annotation.getBegin() <= containtedAnnotation .getBegin(); if (!isStartContaining) { return false; } boolean isEndContaining = annotation.getEnd() >= containtedAnnotation .getEnd(); if (!isEndContaining) { return false; } return true; } /** * Creates the name spans out of a list of token annotations and a list of entity annotations. *

* The name spans for the name finder use a token index and not on a character index which * is used by the entity annotations. * * @param tokenList * @param entityAnnotations * @return */ private static Span[] createNames(List tokenList, List entityAnnotations) { List nameList = new LinkedList(); AnnotationFS currentEntity = null; int startIndex = -1; int index = 0; for (Iterator tokenIterator = tokenList.iterator(); tokenIterator.hasNext();) { AnnotationFS token = (AnnotationFS) tokenIterator.next(); for (Iterator it = entityAnnotations.iterator(); it.hasNext();) { AnnotationFS entity = (AnnotationFS) it.next(); if (!isContaining(entity, token)) { // ... end of an entity if (currentEntity == entity) { nameList.add(new Span(startIndex, index)); startIndex = -1; currentEntity = null; // break; } else { continue; } } // is this token start of new entity if (currentEntity == null && isContaining(entity, token)) { startIndex = index; currentEntity = entity; } } index++; } if (currentEntity != null) { Span name = new Span(startIndex, index); nameList.add(name); } return nameList.toArray(new Span[nameList.size()]); } /** * Process the given CAS object. 
*/ public void processCas(CAS cas) { FSIndex sentenceIndex = cas.getAnnotationIndex(sentenceType); Iterator sentenceIterator = sentenceIndex.iterator(); while (sentenceIterator.hasNext()) { AnnotationFS sentenceAnnotation = sentenceIterator.next(); ContainingConstraint sentenceContainingConstraint = new ContainingConstraint( sentenceAnnotation); FSIndex tokenAnnotations = cas.getAnnotationIndex(tokenType); Iterator containingTokens = cas.createFilteredIterator(tokenAnnotations .iterator(), sentenceContainingConstraint); FSIndex allNames = cas.getAnnotationIndex(nameType); Iterator containingNames = cas.createFilteredIterator(allNames.iterator(), sentenceContainingConstraint); List tokenList = iteratorToList(containingTokens); Span names[] = createNames(tokenList, iteratorToList(containingNames)); // create token array String tokenArray[] = new String[tokenList.size()]; for (int i = 0; i < tokenArray.length; i++) { tokenArray[i] = ((AnnotationFS) tokenList.get(i)) .getCoveredText(); } NameSample traingSentence = new NameSample(tokenArray, names, null, false); if (traingSentence.getSentence().length != 0) { nameFinderSamples.add(traingSentence); } else { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Sentence without tokens: " + sentenceAnnotation.getCoveredText()); } } } } /** * Called if the processing is finished, this method * does the training. */ public void collectionProcessComplete(ProcessTrace trace) throws ResourceProcessException, IOException { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Collected " + nameFinderSamples.size() + " name samples."); } GIS.PRINT_MESSAGES = false; // create training stream ... 
ObjectStream samples = ObjectStreamUtils.createObjectStream(nameFinderSamples); InputStream additionalTrainingDataIn = null; TokenNameFinderModel nameModel; try { if (additionalTrainingDataFile != null) { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile); } additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile); // TODO: Make encoding configurable, otherwise use UTF-8 as default! ObjectStream additionalSamples = new NameSampleDataStream( new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding))); samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); } // TODO: Make sure its possible to pass custom feature generator // User could subclass this trainer to provide a custom feature generator nameModel = NameFinderME.train(language, null, samples, Collections.EMPTY_MAP, iterations, cutoff); } finally { if (additionalTrainingDataIn != null) additionalTrainingDataIn.close(); } // dereference to allow garbage collection nameFinderSamples = null; File modelFile = new File(getUimaContextAdmin().getResourceManager() .getDataPath() + File.separatorChar + modelPath); OpennlpUtil.serialize(nameModel, modelFile); if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Model was written to: " + modelFile.getAbsolutePath()); } } /** * The trainer is not stateless. */ public boolean isStateless() { return false; } /** * Destroys the current instance. */ public void destroy() { // dereference to allow garbage collection nameFinderSamples = null; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy