/*
 * NOTE(review): removed injected "pay to download" spam text that sat outside any
 * comment (it broke compilation and contradicted the Apache-2.0 license below).
 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.chunker;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectStreamException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.util.BeamSearch;
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.Span;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
/**
 * The class represents a maximum-entropy-based chunker. Such a chunker can be used to
 * find flat structures based on sequence inputs such as noun phrases or named entities.
 */
public class ChunkerME implements Chunker {

  public static final int DEFAULT_BEAM_SIZE = 10;

  /**
   * The beam used to search for sequences of chunk tag assignments.
   */
  protected BeamSearch beam;

  /** The most recently decoded sequence; read back by {@link #probs()} and {@link #probs(double[])}. */
  private Sequence bestSequence;

  /**
   * The model used to assign chunk tags to a sequence of tokens.
   */
  protected MaxentModel model;

  /**
   * Initializes the current instance with the specified model, beam size,
   * sequence validator and context generator.
   *
   * @param model The model for this chunker.
   * @param beamSize The size of the beam that should be used when decoding sequences.
   * @param sequenceValidator The {@link SequenceValidator} to determine whether the outcome
   *        is valid for the preceding sequence. This can be used to implement constraints
   *        on what sequences are valid.
   * @param contextGenerator The context generator used to produce features for the model.
   */
  public ChunkerME(ChunkerModel model, int beamSize, SequenceValidator sequenceValidator,
      ChunkerContextGenerator contextGenerator) {
    this.model = model.getChunkerModel();
    // Cache size 0: no context caching inside the beam search.
    beam = new BeamSearch(beamSize, contextGenerator, this.model, sequenceValidator, 0);
  }

  /**
   * Initializes the current instance with the specified model, beam size and
   * sequence validator, using the default context generator.
   *
   * @param model The model for this chunker.
   * @param beamSize The size of the beam that should be used when decoding sequences.
   * @param sequenceValidator The {@link SequenceValidator} to determine whether the outcome
   *        is valid for the preceding sequence. This can be used to implement constraints
   *        on what sequences are valid.
   */
  public ChunkerME(ChunkerModel model, int beamSize,
      SequenceValidator sequenceValidator) {
    this(model, beamSize, sequenceValidator,
        new DefaultChunkerContextGenerator());
  }

  /**
   * Initializes the current instance with the specified model and
   * the specified beam size. No sequence validation is performed.
   *
   * @param model The model for this chunker.
   * @param beamSize The size of the beam that should be used when decoding sequences.
   */
  public ChunkerME(ChunkerModel model, int beamSize) {
    this(model, beamSize, null);
  }

  /**
   * Initializes the current instance with the specified model.
   * The default beam size is used.
   *
   * @param model The model for this chunker.
   */
  public ChunkerME(ChunkerModel model) {
    this(model, DEFAULT_BEAM_SIZE);
  }

  /**
   * Creates a chunker using the specified model.
   *
   * @param mod The maximum entropy model for this chunker.
   *
   * @deprecated Use one of the {@link ChunkerModel}-based constructors instead.
   */
  @Deprecated
  public ChunkerME(MaxentModel mod) {
    this(mod, new DefaultChunkerContextGenerator(), DEFAULT_BEAM_SIZE);
  }

  /**
   * Creates a chunker using the specified model and context generator.
   *
   * @param mod The maximum entropy model for this chunker.
   * @param cg The context generator to be used by the specified model.
   *
   * @deprecated Use one of the {@link ChunkerModel}-based constructors instead.
   */
  @Deprecated
  public ChunkerME(MaxentModel mod, ChunkerContextGenerator cg) {
    this(mod, cg, DEFAULT_BEAM_SIZE);
  }

  /**
   * Creates a chunker using the specified model and context generator and decodes the
   * model using a beam search of the specified size.
   *
   * @param mod The maximum entropy model for this chunker.
   * @param cg The context generator to be used by the specified model.
   * @param beamSize The size of the beam that should be used when decoding sequences.
   *
   * @deprecated Use one of the {@link ChunkerModel}-based constructors instead.
   */
  @Deprecated
  public ChunkerME(MaxentModel mod, ChunkerContextGenerator cg, int beamSize) {
    beam = new BeamSearch(beamSize, cg, mod);
    this.model = mod;
  }

  /**
   * Chunks the given token/tag lists.
   *
   * @param toks list of tokens (Strings).
   * @param tags list of POS tags (Strings), parallel to {@code toks}.
   * @return list of chunk outcomes for each token.
   *
   * @deprecated Use {@link #chunk(String[], String[])} instead.
   */
  @Deprecated
  public List chunk(List toks, List tags) {
    bestSequence =
        beam.bestSequence(toks.toArray(new String[toks.size()]),
            new Object[] { (String[]) tags.toArray(new String[tags.size()]) });
    return bestSequence.getOutcomes();
  }

  /**
   * Chunks the given tokens with their POS tags.
   *
   * @param toks the tokens of a sentence.
   * @param tags the POS tags of the tokens, parallel to {@code toks}.
   * @return an array of chunk outcomes, one per token.
   */
  public String[] chunk(String[] toks, String[] tags) {
    bestSequence = beam.bestSequence(toks, new Object[] {tags});
    List c = bestSequence.getOutcomes();
    return (String[]) c.toArray(new String[c.size()]);
  }

  /**
   * Chunks the given tokens and returns the result as {@link Span}s over the token array.
   *
   * @param toks the tokens of a sentence.
   * @param tags the POS tags of the tokens, parallel to {@code toks}.
   * @return the chunks as spans.
   */
  public Span[] chunkAsSpans(String[] toks, String[] tags) {
    String[] preds = chunk(toks, tags);
    return ChunkSample.phrasesAsSpanList(toks, tags, preds);
  }

  /**
   * Returns the top-k chunk sequences for the given sentence.
   *
   * @param sentence list of tokens (Strings).
   * @param tags list of POS tags (Strings), parallel to {@code sentence}.
   * @return the best sequences found by the beam search.
   *
   * @deprecated Use {@link #topKSequences(String[], String[])} instead.
   */
  @Deprecated
  public Sequence[] topKSequences(List sentence, List tags) {
    return topKSequences((String[]) sentence.toArray(new String[sentence.size()]),
        (String[]) tags.toArray(new String[tags.size()]));
  }

  /**
   * Returns the top-k chunk sequences for the given sentence.
   *
   * <p>Note: requests {@link #DEFAULT_BEAM_SIZE} sequences regardless of the
   * beam size this instance was constructed with.
   *
   * @param sentence the tokens of a sentence.
   * @param tags the POS tags of the tokens, parallel to {@code sentence}.
   * @return the best sequences found by the beam search.
   */
  public Sequence[] topKSequences(String[] sentence, String[] tags) {
    return beam.bestSequences(DEFAULT_BEAM_SIZE, sentence,
        new Object[] { tags });
  }

  /**
   * Returns the top-k chunk sequences for the given sentence whose score is at
   * least {@code minSequenceScore}.
   *
   * @param sentence the tokens of a sentence.
   * @param tags the POS tags of the tokens, parallel to {@code sentence}.
   * @param minSequenceScore the minimum score a sequence must have to be returned.
   * @return the best sequences found by the beam search.
   */
  public Sequence[] topKSequences(String[] sentence, String[] tags, double minSequenceScore) {
    return beam.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { tags }, minSequenceScore);
  }

  /**
   * Populates the specified array with the probabilities of the last decoded sequence. The
   * sequence was determined based on the previous call to chunk. The
   * specified array should be at least as large as the number of tokens in the previous call to chunk.
   *
   * @param probs An array used to hold the probabilities of the last decoded sequence.
   */
  public void probs(double[] probs) {
    bestSequence.getProbs(probs);
  }

  /**
   * Returns an array with the probabilities of the last decoded sequence. The
   * sequence was determined based on the previous call to chunk.
   *
   * @return An array with the same number of probabilities as tokens were sent to chunk
   *         when it was last called.
   */
  public double[] probs() {
    return bestSequence.getProbs();
  }

  /**
   * Trains a new GIS model for the {@link ChunkerME} using the given context generator.
   *
   * @param lang the language of the training data.
   * @param in the stream of {@link ChunkSample}s to train on.
   * @param cutoff the minimum number of times a feature must occur to be included.
   * @param iterations the number of GIS training iterations.
   * @param contextGenerator the context generator used to produce training events.
   *
   * @return the new model.
   *
   * @throws IOException if reading the training data fails.
   */
  public static ChunkerModel train(String lang, ObjectStream in,
      int cutoff, int iterations, ChunkerContextGenerator contextGenerator)
      throws IOException {
    Map manifestInfoEntries = new HashMap();
    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
    EventStream es = new ChunkerEventStream(in, contextGenerator);
    // Wrap the event stream so a hash of the training events can be stored
    // in the model manifest for later verification.
    HashSumEventStream hses = new HashSumEventStream(es);
    AbstractModel maxentModel = opennlp.maxent.GIS.trainModel(iterations,
        new TwoPassDataIndexer(hses, cutoff));
    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
        hses.calculateHashSum().toString(16));
    return new ChunkerModel(lang, maxentModel, manifestInfoEntries);
  }

  /**
   * Trains a new model for the {@link ChunkerME} using the default context generator.
   *
   * @param lang the language of the training data.
   * @param in the stream of {@link ChunkSample}s to train on.
   * @param cutoff the minimum number of times a feature must occur to be included.
   * @param iterations the number of GIS training iterations.
   *
   * @return the new model.
   *
   * @throws IOException if reading the training data fails.
   */
  public static ChunkerModel train(String lang, ObjectStream in, int cutoff, int iterations)
      throws IOException, ObjectStreamException {
    return train(lang, in, cutoff, iterations, new DefaultChunkerContextGenerator());
  }

  /** Prints command-line usage to stderr and terminates the VM with exit code 1. */
  @Deprecated
  private static void usage() {
    System.err.println("Usage: ChunkerME [-encoding charset] trainingFile modelFile");
    System.err.println();
    System.err.println("Training file should be one word per line where each line consists of a ");
    System.err.println("space-delimited triple of \"word pos outcome\". Sentence breaks are indicated by blank lines.");
    System.exit(1);
  }

  /**
   * Trains the chunker using the specified parameters.
   * Usage: ChunkerME trainingFile modelFile.
   * Training file should be one word per line where each line consists of a
   * space-delimited triple of "word pos outcome". Sentence breaks are indicated by blank lines.
   *
   * @param args The training file and the model file.
   * @throws IOException When the specified files can not be read.
   */
  @Deprecated
  public static void main(String[] args) throws IOException, ObjectStreamException {
    if (args.length == 0) {
      usage();
    }
    int ai = 0;
    String encoding = null;
    while (args[ai].startsWith("-")) {
      if (args[ai].equals("-encoding") && ai + 1 < args.length) {
        ai++;
        encoding = args[ai];
      }
      else {
        System.err.println("Unknown option: " + args[ai]);
        usage();
      }
      ai++;
    }
    java.io.File inFile = null;
    java.io.File outFile = null;
    if (ai < args.length) {
      inFile = new java.io.File(args[ai++]);
    }
    else {
      usage();
    }
    if (ai < args.length) {
      outFile = new java.io.File(args[ai++]);
    }
    else {
      usage();
    }
    int iterations = 100;
    int cutoff = 5;
    if (args.length > ai) {
      iterations = Integer.parseInt(args[ai++]);
    }
    if (args.length > ai) {
      cutoff = Integer.parseInt(args[ai++]);
    }
    ChunkerModel mod;
    ObjectStream es;
    if (encoding != null) {
      es = new ChunkSampleStream(new PlainTextByLineStream(
          new InputStreamReader(new FileInputStream(inFile), encoding)));
    }
    else {
      // NOTE(review): falls back to the platform default charset; kept for
      // backward compatibility with existing behavior.
      es = new ChunkSampleStream(new PlainTextByLineStream(new java.io.FileReader(inFile)));
    }
    mod = train("en", es, cutoff, iterations);
    // Fixed: previously printed args[1], which is not the model file when
    // the -encoding option is given; outFile is always the parsed model path.
    System.out.println("Saving the model as: " + outFile);
    OutputStream out = new FileOutputStream(outFile);
    try {
      mod.serialize(out);
    }
    finally {
      // Ensure the stream is closed even if serialization fails.
      out.close();
    }
  }
}