// All downloads are free. Search and download functionalities use the official Maven repository.
//
// chalk.uima.namefind.NameFinderTrainer (Maven / Gradle / Ivy)
//
// The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */ 

package chalk.uima.namefind;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;


import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

import nak.maxent.GIS;
import chalk.tools.cmdline.namefind.TokenNameFinderTrainerTool;
import chalk.tools.namefind.NameFinderME;
import chalk.tools.namefind.NameSample;
import chalk.tools.namefind.NameSampleDataStream;
import chalk.tools.namefind.TokenNameFinderModel;
import chalk.tools.util.ObjectStream;
import chalk.tools.util.ObjectStreamUtils;
import chalk.tools.util.PlainTextByLineStream;
import chalk.tools.util.Span;
import chalk.tools.util.TrainingParameters;
import chalk.uima.util.CasConsumerUtil;
import chalk.uima.util.ContainingConstraint;
import chalk.uima.util.OpennlpUtil;
import chalk.uima.util.SampleTraceStream;
import chalk.uima.util.UimaUtil;

/**
 * OpenNLP NameFinder trainer.
 * 

* Mandatory parameters *

* * * * * * *
Type Name Description
String opennlp.uima.ModelName The name of the model file
String opennlp.uima.Language The language code
String opennlp.uima.SentenceType The full name of the sentence type
String opennlp.uima.TokenType The full name of the token type
String opennlp.uima.NameType The full name of the name type
* * Optional parameters * * * * * * * * * *
Type Name Description
String opennlp.uima.opennlp.uima.TrainingParamsFile Training Parameters Properties file
String opennlp.uima.FeatureGeneratorFile Feature Generator definition file which contain the feature generator configuration
String opennlp.uima.FeatureGeneratorResources Feature Generator resources dictionary
String opennlp.uima.AdditionalTrainingDataFile Training file which contains additional data in the OpenNLP format
String opennlp.uima.AdditionalTrainingDataEncoding Encoding of the additional training data
String opennlp.uima.SampleTraceFile All training samples are traced to this file
String opennlp.uima.SampleTraceFileEncoding Encoding of the sample trace file
*

*/ public final class NameFinderTrainer extends CasConsumer_ImplBase { private static final String FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER = "opennlp.uima.FeatureGeneratorFile"; private static final String FEATURE_GENERATOR_RESOURCES_PARAMETER = "opennlp.uima.FeatureGeneratorResources"; private Logger logger; private String modelPath; private byte featureGeneratorDefinition[]; private File featureGeneratorResourceDir; private String additionalTrainingDataFile; private String additionalTrainingDataEncoding; private File sampleTraceFile = null; private String sampleTraceFileEncoding = null; private Type sentenceType; private Type tokenType; private Type nameType; private String language; // TODO: Keeping all events in memory limits the size of the training corpus // Possible solutions: // - Write all events to disk // - Directly start indexing with a blocking sample stream, the indexer will then write everything // to disk or could store the events much more space efficient in memory private List nameFinderSamples = new ArrayList(); private TrainingParameters trainingParams; /** * Initializes the current instance. 
*/ public void initialize() throws ResourceInitializationException { super.initialize(); logger = getUimaContext().getLogger(); if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Initializing the OpenNLP Name Trainer."); } modelPath = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.MODEL_PARAMETER); language = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.LANGUAGE_PARAMETER); trainingParams = OpennlpUtil.loadTrainingParams(CasConsumerUtil.getOptionalStringParameter( getUimaContext(), UimaUtil.TRAINING_PARAMS_FILE_PARAMETER), true); String featureGeneratorDefinitionFile = CasConsumerUtil.getOptionalStringParameter( getUimaContext(), FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER); if (featureGeneratorDefinitionFile != null) { try { featureGeneratorDefinition = OpennlpUtil.loadBytes(new File(featureGeneratorDefinitionFile)); } catch (IOException e) { throw new ResourceInitializationException(e); } String featureGeneratorResourcesDirName = CasConsumerUtil.getOptionalStringParameter( getUimaContext(), FEATURE_GENERATOR_RESOURCES_PARAMETER); if (featureGeneratorResourcesDirName != null) { featureGeneratorResourceDir = new File(featureGeneratorResourcesDirName); } } additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter( getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE); // If the additional training data is specified, the encoding must be provided! 
if (additionalTrainingDataFile != null) { additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter( getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING); } String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter( getUimaContext(), "opennlp.uima.SampleTraceFile"); if (sampleTraceFileName != null) { sampleTraceFile = new File(getUimaContextAdmin().getResourceManager() .getDataPath() + File.separatorChar + sampleTraceFileName); sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter( getUimaContext(), "opennlp.uima.SampleTraceFileEncoding"); } } /** * Initialize the current instance with the given type system. */ public void typeSystemInit(TypeSystem typeSystem) throws ResourceInitializationException { String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.SENTENCE_TYPE_PARAMETER); sentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.TOKEN_TYPE_PARAMETER); tokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); String nameTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), NameFinder.NAME_TYPE_PARAMETER); nameType = CasConsumerUtil.getType(typeSystem, nameTypeName); } /** * Creates a {@link List} from an {@link Iterator}. 
* * @param * @param it * @return */ private static List iteratorToList(Iterator it) { List list = new LinkedList(); while (it.hasNext()) { list.add(it.next()); } return list; } private static boolean isContaining(AnnotationFS annotation, AnnotationFS containtedAnnotation) { boolean isStartContaining = annotation.getBegin() <= containtedAnnotation .getBegin(); if (!isStartContaining) { return false; } boolean isEndContaining = annotation.getEnd() >= containtedAnnotation .getEnd(); if (!isEndContaining) { return false; } return true; } /** * Creates the name spans out of a list of token annotations and a list of entity annotations. *

* The name spans for the name finder use a token index and not on a character index which * is used by the entity annotations. * * @param tokenList * @param entityAnnotations * @return */ private static Span[] createNames(List tokenList, List entityAnnotations) { List nameList = new LinkedList(); AnnotationFS currentEntity = null; int startIndex = -1; int index = 0; for (AnnotationFS token : tokenList) { for (AnnotationFS entity : entityAnnotations) { if (!isContaining(entity, token)) { // ... end of an entity if (currentEntity == entity) { nameList.add(new Span(startIndex, index)); startIndex = -1; currentEntity = null; // break; } else { continue; } } // is this token start of new entity if (currentEntity == null && isContaining(entity, token)) { startIndex = index; currentEntity = entity; } } index++; } if (currentEntity != null) { Span name = new Span(startIndex, index); nameList.add(name); } return nameList.toArray(new Span[nameList.size()]); } /** * Process the given CAS object. */ /** * Process the given CAS object. 
*/ public void processCas(CAS cas) { FSIndex sentenceIndex = cas.getAnnotationIndex(sentenceType); boolean isClearAdaptiveData = true; for (AnnotationFS sentenceAnnotation : sentenceIndex) { ContainingConstraint sentenceContainingConstraint = new ContainingConstraint( sentenceAnnotation); FSIndex tokenAnnotations = cas.getAnnotationIndex(tokenType); Iterator containingTokens = cas.createFilteredIterator(tokenAnnotations .iterator(), sentenceContainingConstraint); FSIndex allNames = cas.getAnnotationIndex(nameType); Iterator containingNames = cas.createFilteredIterator(allNames.iterator(), sentenceContainingConstraint); List tokenList = iteratorToList(containingTokens); Span names[] = createNames(tokenList, iteratorToList(containingNames)); // create token array String tokenArray[] = new String[tokenList.size()]; for (int i = 0; i < tokenArray.length; i++) { tokenArray[i] = tokenList.get(i).getCoveredText(); } NameSample trainingSentence = new NameSample(tokenArray, names, null, isClearAdaptiveData); if (trainingSentence.getSentence().length != 0) { nameFinderSamples.add(trainingSentence); if (isClearAdaptiveData) { isClearAdaptiveData = false; } } else { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Sentence without tokens: " + sentenceAnnotation.getCoveredText()); } } } } /** * Called if the processing is finished, this method * does the training. */ public void collectionProcessComplete(ProcessTrace trace) throws ResourceProcessException, IOException { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Collected " + nameFinderSamples.size() + " name samples."); } GIS.PRINT_MESSAGES = false; // create training stream ... 
ObjectStream samples = ObjectStreamUtils.createObjectStream(nameFinderSamples); InputStream additionalTrainingDataIn = null; Writer samplesOut = null; TokenNameFinderModel nameModel; try { if (additionalTrainingDataFile != null) { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Using additional training data file: " + additionalTrainingDataFile); } additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile); ObjectStream additionalSamples = new NameSampleDataStream( new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding))); samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); } if (sampleTraceFile != null) { samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding); samples = new SampleTraceStream(samples, samplesOut); } Map resourceMap; if (featureGeneratorResourceDir != null) { resourceMap = TokenNameFinderTrainerTool.loadResources(featureGeneratorResourceDir); } else { resourceMap = Collections.emptyMap(); } nameModel = NameFinderME.train(language, null, samples, trainingParams, featureGeneratorDefinition, resourceMap); } finally { if (additionalTrainingDataIn != null) { additionalTrainingDataIn.close(); } if (samplesOut != null) { samplesOut.close(); } } // dereference to allow garbage collection nameFinderSamples = null; File modelFile = new File(getUimaContextAdmin().getResourceManager() .getDataPath() + File.separatorChar + modelPath); OpennlpUtil.serialize(nameModel, modelFile); if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Model was written to: " + modelFile.getAbsolutePath()); } } /** * The trainer is not stateless. */ public boolean isStateless() { return false; } /** * Destroys the current instance. */ public void destroy() { // dereference to allow garbage collection nameFinderSamples = null; } }





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy