/**
*
*/
package justhalf.nlp.reader.acereader;
import static justhalf.nlp.reader.acereader.ACEDocument.unescape;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.xml.sax.SAXException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;
import justhalf.nlp.postagger.POSTagger;
import justhalf.nlp.postagger.StanfordPOSTagger;
import justhalf.nlp.reader.acereader.ACEEntity.ACEEntitySubType;
import justhalf.nlp.reader.acereader.ACEEntity.ACEEntityType;
import justhalf.nlp.reader.acereader.ACEEvent.ACEEventSubType;
import justhalf.nlp.reader.acereader.ACEEvent.ACEEventType;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationSubType;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationType;
import justhalf.nlp.reader.acereader.ACEValue.ACEValueSubType;
import justhalf.nlp.reader.acereader.ACEValue.ACEValueType;
import justhalf.nlp.sentencesplitter.SentenceSplitter;
import justhalf.nlp.sentencesplitter.StanfordSentenceSplitter;
import justhalf.nlp.tokenizer.RegexTokenizer;
import justhalf.nlp.tokenizer.StanfordTokenizer;
import justhalf.nlp.tokenizer.Tokenizer;
/**
* The main class to read raw ACE documents as {@link ACEDocument} objects.
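* <p>A minimal usage sketch (the directory paths are illustrative):</p>
* <pre>{@code
* List<ACEDocument> docs = ACEReader.readDocuments("/data/ace2004", "/data/ace2005");
* for(ACEDocument doc : docs){
*     System.out.println(doc.uri + ": " + doc.entities.size() + " entities");
* }
* }</pre>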
*/
public class ACEReader {
/** The complete list of domains in ACE 2004 */
public static final List<String> ACE2004_DOMAINS = Arrays.asList(
"arabic_treebank", "bnews", "chinese_treebank", "fisher_transcripts", "nwire"
);
/** The complete list of domains in ACE 2005 */
public static final List<String> ACE2005_DOMAINS = Arrays.asList(
"bc", "bn", "cts", "nw", "un", "wl"
);
public static void main(String[] args) throws FileNotFoundException{
String ace2004DirName = null;
String ace2005DirName = null;
HashSet<String> ace2004Domains = new LinkedHashSet<String>(ACE2004_DOMAINS);
HashSet<String> ace2005Domains = new LinkedHashSet<String>(ACE2005_DOMAINS);
double[] datasplit = null;
boolean convert = false;
boolean convertEntities = false;
String ace2004OutputDir = null;
String ace2005OutputDir = null;
boolean tokenize = false;
boolean posTag = false;
Tokenizer tokenizer = null;
POSTagger posTagger = null;
SentenceSplitter splitter = null;
boolean toCoNLL = false;
boolean ignoreOverlaps = false;
boolean useBILOU = false;
boolean splitByDocument = true;
boolean shuffle = false;
boolean excludeMetadata = false;
int shuffleSeed = 31;
int argIndex = 0;
while(argIndex < args.length){
switch(args[argIndex]){
case "-ace2004Dir":
ace2004DirName = args[argIndex+1];
argIndex += 2;
break;
case "-ace2005Dir":
ace2005DirName = args[argIndex+1];
argIndex += 2;
break;
case "-ace2004IncludeDomains":
ace2004Domains.clear();
ace2004Domains.addAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-ace2004ExcludeDomains":
ace2004Domains.removeAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-ace2005IncludeDomains":
ace2005Domains.clear();
ace2005Domains.addAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-ace2005ExcludeDomains":
ace2005Domains.removeAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-excludeMetadata":
excludeMetadata = true;
argIndex += 1;
break;
case "-convertEntitiesToInline":
convertEntities = true;
convert = true;
argIndex += 1;
break;
case "-dataSplit":
String[] tokens = args[argIndex+1].split(",");
datasplit = new double[3];
double sum = 0;
for(int i=0; i<tokens.length; i++){
datasplit[i] = Double.parseDouble(tokens[i]);
sum += datasplit[i];
}
if(tokens.length == 2){
// Two values mean train+test, with an empty dev set
datasplit[2] = datasplit[1];
datasplit[1] = 0.0;
}
// Normalize so the ratios sum to 1 (allows "90,10" as well as "0.9,0.1")
for(int i=0; i<datasplit.length; i++){
datasplit[i] /= sum;
}
argIndex += 2;
break;
// The following cases are reconstructed from the option list in printHelp();
// the constructors of the tokenizer/tagger/splitter wrappers are assumed
case "-ace2004OutputDir":
ace2004OutputDir = args[argIndex+1];
argIndex += 2;
break;
case "-ace2005OutputDir":
ace2005OutputDir = args[argIndex+1];
argIndex += 2;
break;
case "-tokenizer":
tokenize = true;
tokenizer = args[argIndex+1].equals("regex") ? new RegexTokenizer() : new StanfordTokenizer();
argIndex += 2;
break;
case "-posTagger":
posTag = true;
posTagger = new StanfordPOSTagger();
argIndex += 2;
break;
case "-splitter":
splitter = new StanfordSentenceSplitter();
argIndex += 2;
break;
case "-toCoNLLFormat":
toCoNLL = true;
argIndex += 1;
break;
case "-ignoreOverlaps":
ignoreOverlaps = true;
argIndex += 1;
break;
case "-useBILOU":
useBILOU = true;
argIndex += 1;
break;
case "-splitBySentences":
splitByDocument = false;
argIndex += 1;
break;
case "-shuffle":
shuffle = true;
argIndex += 1;
break;
case "-seed":
shuffleSeed = Integer.parseInt(args[argIndex+1]);
argIndex += 2;
break;
default:
printHelp("Unrecognized option: "+args[argIndex]);
return;
}
}
if(ace2004DirName == null && ace2005DirName == null){
printHelp("Please specify at least one of -ace2004Dir or -ace2005Dir");
return;
}
List<File> fileList = new ArrayList<File>();
if(ace2004DirName != null){
extractDocList(fileList, ace2004DirName, ace2004Domains);
}
if(ace2005DirName != null){
extractDocList(fileList, ace2005DirName, ace2005Domains, "/timex2norm");
}
String dataset = "";
if(ace2004DirName != null){
dataset += "ACE2004 ("+StringUtils.join(ace2004Domains, ",")+")";
}
if(ace2005DirName != null){
if(dataset != ""){
dataset += " and ";
}
dataset += "ACE2005 ("+StringUtils.join(ace2005Domains, ",")+")";
}
System.out.println("Extracting data from "+dataset);
// Start reading data
List<ACEDocument> ace2004Docs = new ArrayList<ACEDocument>();
List<ACEDocument> ace2005Docs = new ArrayList<ACEDocument>();
List<ACEDocument> docs = new ArrayList<ACEDocument>();
int docCount = 0;
int[] entityCount = new int[1];
int[] entityMentionCount = new int[1];
int[] valueCount = new int[1];
int[] valueMentionCount = new int[1];
int[] relationCount = new int[1];
int[] relationMentionCount = new int[1];
int[] eventCount = new int[1];
int[] eventMentionCount = new int[1];
int overlapCount = 0;
int allLowercaseCount = 0;
Map<Integer, Integer> wordCountInMention = new HashMap<Integer, Integer>();
Map<ACEEventArgumentType, Integer> entityTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> relationTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> valueTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> eventTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> entityTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> relationTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> valueTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> eventTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
for(File sgmFile: fileList){
try {
ACEDocument doc = new ACEDocument(sgmFile.getAbsolutePath(), excludeMetadata);
docCount++;
// printMentions(doc, doc.mentions);
// Count mentions and objects
count(doc, doc.entities, entityTypeCount, entityTypeMentionCount, entityCount, entityMentionCount);
count(doc, doc.relations, relationTypeCount, relationTypeMentionCount, relationCount, relationMentionCount);
count(doc, doc.values, valueTypeCount, valueTypeMentionCount, valueCount, valueMentionCount);
count(doc, doc.events, eventTypeCount, eventTypeMentionCount, eventCount, eventMentionCount);
// Count mention overlaps
// (reconstructed) per-mention statistics: casing, word counts, and overlaps
for(int i=0; i<doc.entityMentions.size(); i++){
ACEEntityMention mention = doc.entityMentions.get(i);
if(mention.text.equals(mention.text.toLowerCase())){
allLowercaseCount++;
}
int numWords = mention.text.split("\\s+").length;
wordCountInMention.put(numWords, wordCountInMention.getOrDefault(numWords, 0)+1);
for(int j=i+1; j<doc.entityMentions.size(); j++){
if(mention.overlapsWith(doc.entityMentions.get(j))){
overlapCount++;
}
}
}
boolean isACE2004 = ace2004DirName != null
&& sgmFile.getAbsolutePath().startsWith(new File(ace2004DirName).getAbsolutePath());
if(isACE2004){
ace2004Docs.add(doc);
} else {
ace2005Docs.add(doc);
}
docs.add(doc);
} catch (IOException | SAXException e){
System.err.println("Error reading "+sgmFile.getAbsolutePath());
e.printStackTrace();
}
}
// Print the collected statistics
System.out.println("Read "+docCount+" documents");
System.out.println("Entities: "+entityCount[0]+" ("+entityMentionCount[0]+" mentions): "+entityTypeMentionCount);
System.out.println("Relations: "+relationCount[0]+" ("+relationMentionCount[0]+" mentions): "+relationTypeMentionCount);
System.out.println("Values: "+valueCount[0]+" ("+valueMentionCount[0]+" mentions): "+valueTypeMentionCount);
System.out.println("Events: "+eventCount[0]+" ("+eventMentionCount[0]+" mentions): "+eventTypeMentionCount);
System.out.println("Overlapping entity mention pairs: "+overlapCount);
System.out.println("All-lowercase entity mentions: "+allLowercaseCount);
System.out.println("Entity mention word counts: "+wordCountInMention);
if(convert){
if(ace2004Docs.size() > 0){
System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data");
printDataset(ace2004OutputDir, ace2004Docs, datasplit, convertEntities,
(tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, splitter,
toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed);
}
if(ace2005Docs.size() > 0){
System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data");
printDataset(ace2005OutputDir, ace2005Docs, datasplit, convertEntities,
(tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null,
splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed);
}
}
}
private static void printDataset(String outputDir, List<ACEDocument> docs, double[] datasplit,
boolean convertEntities, Tokenizer tokenizer, POSTagger posTagger, SentenceSplitter splitter,
boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, boolean splitByDocument,
boolean shuffle, int shuffleSeed) throws FileNotFoundException {
List<ACESentence> trainSentences = new ArrayList<ACESentence>();
List<ACESentence> devSentences = new ArrayList<ACESentence>();
List<ACESentence> testSentences = new ArrayList<ACESentence>();
if(splitByDocument){
List<ACEDocument> trainDocs = new ArrayList<ACEDocument>();
List<ACEDocument> devDocs = new ArrayList<ACEDocument>();
List<ACEDocument> testDocs = new ArrayList<ACEDocument>();
splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed);
trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps);
devSentences = getSentences(devDocs, splitter, ignoreOverlaps);
testSentences = getSentences(testDocs, splitter, ignoreOverlaps);
} else {
List<ACESentence> aceSentences = getSentences(docs, splitter, ignoreOverlaps);
trainSentences = new ArrayList<ACESentence>();
devSentences = new ArrayList<ACESentence>();
testSentences = new ArrayList<ACESentence>();
splitData(aceSentences, trainSentences, devSentences, testSentences, datasplit, shuffle, shuffleSeed);
}
writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, toCoNLL, useBILOU);
writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, toCoNLL, useBILOU);
writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, toCoNLL, useBILOU);
}
/**
* Split documents into sentences with their corresponding annotations (entities, relations,
* events, timexes, values)
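* <p>A usage sketch, assuming the no-argument {@link StanfordSentenceSplitter} constructor:</p>
* <pre>{@code
* SentenceSplitter splitter = new StanfordSentenceSplitter();
* List<ACESentence> sentences = ACEReader.getSentences(docs, splitter, true);
* }</pre>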
* @param docs The list of ACEDocument to be split
* @param splitter The sentence splitter
* @param ignoreOverlappingEntities Whether to ignore overlapping entities by removing the
* shorter one when there is an overlap.
* @return The list of {@link ACESentence} objects with sentence-relative entity spans.
*/
public static List<ACESentence> getSentences(List<ACEDocument> docs, SentenceSplitter splitter,
boolean ignoreOverlappingEntities) {
List<ACESentence> aceSentences = new ArrayList<ACESentence>();
for(ACEDocument doc: docs){
for(CoreLabel sentence: fixSplit(splitter.split(doc.text))){
Span sentenceSpan = new Span(sentence.beginPosition(), sentence.endPosition());
ACESentence aceSentence = new ACESentence(doc, sentenceSpan, sentence.value());
for(ACEEntityMention mention: doc.entityMentions){
if(sentenceSpan.contains(mention.span)){
ACEEntityMention newMention = new ACEEntityMention(mention);
newMention.span.start -= sentenceSpan.start;
newMention.span.end -= sentenceSpan.start;
newMention.headSpan.start -= sentenceSpan.start;
newMention.headSpan.end -= sentenceSpan.start;
boolean add = true;
if(ignoreOverlappingEntities){
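// Resolve overlaps by keeping the longer mention: drop previously added
// mentions that are shorter, or skip the new mention if it is shorter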
for(int i=aceSentence.entities.size()-1; i >= 0; i--){
ACEEntityMention existingMention = aceSentence.entities.get(i);
if(newMention.overlapsWith(existingMention)){
if(newMention.span.length() > existingMention.span.length()){
aceSentence.entities.remove(i);
} else {
add = false;
break;
}
}
}
}
if(add){
aceSentence.addEntityMention(newMention);
}
}
}
for(ACERelationMention relation: doc.relationMentions){
if(sentenceSpan.contains(relation.span)){
aceSentence.addRelationMention(relation);
}
}
for(ACEEventMention event: doc.eventMentions){
if(sentenceSpan.contains(event.span)){
aceSentence.addEventMention(event);
}
}
for(ACETimexMention timex: doc.timexMentions){
if(sentenceSpan.contains(timex.span)){
aceSentence.addTimexMention(timex);
}
}
for(ACEValueMention value: doc.valueMentions){
if(sentenceSpan.contains(value.span)){
aceSentence.addValueMention(value);
}
}
aceSentences.add(aceSentence);
}
}
return aceSentences;
}
private static List<CoreLabel> fixSplit(List<CoreLabel> sentences){
List<CoreLabel> result = new ArrayList<CoreLabel>();
for(int i=0; i<sentences.size(); i++){
// Placeholder: the original sentence post-processing is not available in
// this source; sentences pass through unchanged
result.add(sentences.get(i));
}
return result;
}
private static List<CoreLabel> fixTokens(List<CoreLabel> tokens){
List<CoreLabel> result = new ArrayList<CoreLabel>();
for(int i=0; i<tokens.size(); i++){
// Placeholder: the original token post-processing is not available in
// this source; tokens pass through unchanged
result.add(tokens.get(i));
}
return result;
}
private static <T> void splitData(List<T> aceObjects, List<T> trainObjects, List<T> devObjects,
List<T> testObjects, double[] datasplit, boolean shuffle, int shuffleSeed){
int total = aceObjects.size();
int trainSize = (int)(datasplit[0]*total);
int devSize = (int)(datasplit[1]*total);
int testSize = (int)(datasplit[2]*total);
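// Integer truncation can make the three sizes sum to less than the total;
// give any remainder to the training set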
if(trainSize + devSize + testSize != total){
trainSize -= trainSize + devSize + testSize - total;
}
List<T> tmpObjects = new ArrayList<T>();
tmpObjects.addAll(aceObjects);
if(shuffle){
Collections.shuffle(tmpObjects, new Random(shuffleSeed));
}
trainObjects.addAll(tmpObjects.subList(0, trainSize));
devObjects.addAll(tmpObjects.subList(trainSize, trainSize + devSize));
testObjects.addAll(tmpObjects.subList(trainSize + devSize, total));
String typeName = tmpObjects.get(0).getClass().getName();
typeName = typeName.substring(typeName.lastIndexOf(".")+1);
System.out.println("Number of objects ("+typeName+"):");
System.out.println("Training: "+trainObjects.size());
System.out.println("Dev: "+devObjects.size());
System.out.println("Test: "+testObjects.size());
}
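/** Prints the number of entity mentions per entity type over the given sentences. */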
private static void printStatistics(List<ACESentence> sentences){
Map<String, Integer> counts = new HashMap<String, Integer>();
for(ACESentence sentence: sentences){
for(ACEEntityMention entity: sentence.entities){
if(!counts.containsKey(entity.entity.type.name())){
counts.put(entity.entity.type.name(), 0);
}
counts.put(entity.entity.type.name(), counts.get(entity.entity.type.name())+1);
}
}
System.out.println("Statistics:");
for(String type: sorted(counts.keySet())){
System.out.println(type+": "+counts.get(type));
}
}
private static <T extends Comparable<T>> List<T> sorted(Collection<T> coll){
List<T> result = new ArrayList<T>();
result.addAll(coll);
Collections.sort(result);
return result;
}
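/**
* Writes one block per sentence to outputDir+name: the (optionally tokenized)
* sentence text, an optional line of POS tags, and the entity mentions as
* "start,end,headStart,headEnd type" spans separated by '|'; or token-per-line
* CoNLL output when toCoNLL is set.
*/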
private static void writeData(List<ACESentence> sentences, String outputDir, String name,
Tokenizer tokenizer, POSTagger posTagger,
boolean toCoNLL, boolean useBILOU) throws FileNotFoundException{
PrintWriter printer = new PrintWriter(new File(outputDir+name));
for(ACESentence sentence: sentences){
if(tokenizer != null){
List<CoreLabel> tokens = fixTokens(tokenizer.tokenize(sentence.text));
if(posTagger != null){
posTagger.tagCoreLabels(tokens);
}
if(toCoNLL){
List<WordLabel> outputTokens = spansToLabels(sentence.entities, tokens, useBILOU);
if(posTagger != null){
for(int i=0; i<tokens.size(); i++){
// (reconstructed) CoNLL output with POS tags: word, tag, and label per line
printer.println(escapeBracket(tokens.get(i).word())+" "+tokens.get(i).tag()+" "+outputTokens.get(i).form);
}
} else {
for(int i=0; i<tokens.size(); i++){
// (reconstructed) CoNLL output without POS tags: word and label per line
printer.println(escapeBracket(tokens.get(i).word())+" "+outputTokens.get(i).form);
}
}
printer.println();
} else {
StringBuilder stringBuilder = new StringBuilder();
for(CoreLabel token: tokens){
if(stringBuilder.length() > 0){
stringBuilder.append(" ");
}
stringBuilder.append(token.value());
token.setWord(escapeBracket(token.word()));
}
printer.println(stringBuilder.toString());
if(posTagger != null){
stringBuilder = new StringBuilder();
for(CoreLabel token: tokens){
if(stringBuilder.length() > 0){
stringBuilder.append(" ");
}
stringBuilder.append(token.tag());
}
printer.println(stringBuilder.toString());
}
stringBuilder = new StringBuilder();
for(ACEEntityMention mention: sentence.entities){
Span wordSpan = findWordSpan(mention.span, tokens);
Span headWordSpan = findWordSpan(mention.headSpan, tokens);
if(stringBuilder.length() > 0){
stringBuilder.append("|");
}
stringBuilder.append(String.format("%s,%s,%s,%s %s", wordSpan.start, wordSpan.end, headWordSpan.start, headWordSpan.end, mention.label.form));
}
printer.println(stringBuilder.toString());
printer.println();
}
} else {
printer.println(sentence.text.replaceAll("[\n\t]", " "));
StringBuilder stringBuilder = new StringBuilder();
for(ACEEntityMention mention: sentence.entities){
Span span = mention.span;
Span headSpan = mention.headSpan;
if(stringBuilder.length() > 0){
stringBuilder.append("|");
}
stringBuilder.append(String.format("%s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form));
}
printer.println(stringBuilder.toString());
printer.println();
}
}
printer.close();
printStatistics(sentences);
}
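/**
* Converts character-based entity mention spans into one label per token,
* in BIO format, or BILOU when useBILOU is set.
*/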
private static List<WordLabel> spansToLabels(List<ACEEntityMention> mentions, List<CoreLabel> tokens, boolean useBILOU){
WordLabel[] result = new WordLabel[tokens.size()];
Arrays.fill(result, null);
for(ACEEntityMention mention: mentions){
Span span = findWordSpan(mention.span, tokens);
String type = mention.label.form;
for(int i=span.start; i<span.end && i<result.length; i++){
// (reconstructed) assumes a WordLabel(String) constructor; adjust to the actual API
result[i] = new WordLabel(((i == span.start) ? "B-" : "I-")+type);
}
}
List<WordLabel> labels = new ArrayList<WordLabel>();
for(int i=0; i<result.length; i++){
labels.add(result[i] == null ? new WordLabel("O") : result[i]);
}
if(useBILOU){
// Convert BIO to BILOU: a single-token B- becomes U-, and the last token
// of a mention becomes L-
for(int i=0; i<labels.size(); i++){
String form = labels.get(i).form;
boolean nextIsInside = i+1 < labels.size() && labels.get(i+1).form.startsWith("I-");
if(form.startsWith("B-") && !nextIsInside){
labels.set(i, new WordLabel("U-"+form.substring(2)));
} else if(form.startsWith("I-") && !nextIsInside){
labels.set(i, new WordLabel("L-"+form.substring(2)));
}
}
}
return labels;
}
private static Span findWordSpan(Span mention, List<CoreLabel> tokens){
int start = -1;
int end = -1;
for(int i=0; i<tokens.size(); i++){
CoreLabel token = tokens.get(i);
if(token.endPosition() > mention.start && start == -1){
start = i;
}
if(token.beginPosition() < mention.end && token.endPosition() >= mention.end){
end = i+1;
}
}
if(start == -1 || end == -1){
System.out.println("Mention ["+mention.start+","+mention.end+"] not found in ["+tokens.get(0).beginPosition()+","+tokens.get(tokens.size()-1).endPosition()+"]");
System.out.print("[");
for(CoreLabel token: tokens){
System.out.print(token.value()+"("+token.beginPosition()+","+token.endPosition()+") ");
}
System.out.println("]");
}
return new Span(start, end);
}
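/**
* Escapes round brackets into Penn Treebank token forms. Referenced by
* writeData; the original body is not visible in this source, so this is a
* minimal sketch assuming standard PTB bracket escaping.
*/
private static String escapeBracket(String word){
return word.replace("(", "-LRB-").replace(")", "-RRB-");
}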
/**
* Reads directories containing ACE 2004 and/or ACE 2005 data and returns them as {@link ACEDocument} objects.
* @param ace2004DirName The path to ACE 2004 directory. Can be null.
* @param ace2005DirName The path to ACE 2005 directory. Can be null.
* @return The list of {@link ACEDocument} objects read.
* @throws IOException
* @throws SAXException
*/
public static List<ACEDocument> readDocuments(String ace2004DirName, String ace2005DirName) throws IOException, SAXException{
return readDocuments(ace2004DirName, ace2005DirName, ACE2004_DOMAINS, ACE2005_DOMAINS);
}
/**
* Reads directories containing ACE 2004 and/or ACE 2005 data and returns them as {@link ACEDocument} objects.
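* <p>For example, to read only the newswire portions (a sketch, with illustrative paths):</p>
* <pre>{@code
* List<ACEDocument> docs = ACEReader.readDocuments("/data/ace2004", "/data/ace2005",
*         new String[]{"nwire"}, new String[]{"nw"});
* }</pre>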
* @param ace2004DirName The path to ACE 2004 directory. Can be null.
* @param ace2005DirName The path to ACE 2005 directory. Can be null.
* @param ace2004Domains The list of domains for ACE 2004 to be included.
* @param ace2005Domains The list of domains for ACE 2005 to be included.
* @return The list of {@link ACEDocument} objects read.
* @throws IOException
* @throws SAXException
*/
public static List<ACEDocument> readDocuments(String ace2004DirName, String ace2005DirName, String[] ace2004Domains, String[] ace2005Domains) throws IOException, SAXException{
return readDocuments(ace2004DirName, ace2005DirName, Arrays.asList(ace2004Domains), Arrays.asList(ace2005Domains));
}
/**
* Reads directories containing ACE 2004 and/or ACE 2005 data and returns them as {@link ACEDocument} objects.
* @param ace2004DirName The path to ACE 2004 directory. Can be null.
* @param ace2005DirName The path to ACE 2005 directory. Can be null.
* @param ace2004Domains The list of domains for ACE 2004 to be included.
* @param ace2005Domains The list of domains for ACE 2005 to be included.
* @return The list of {@link ACEDocument} objects read.
* @throws IOException
* @throws SAXException
*/
public static List<ACEDocument> readDocuments(String ace2004DirName, String ace2005DirName, List<String> ace2004Domains, List<String> ace2005Domains) throws IOException, SAXException{
List<ACEDocument> result = new ArrayList<ACEDocument>();
List<File> fileList = new ArrayList<File>();
if(ace2004DirName != null){
extractDocList(fileList, ace2004DirName, ace2004Domains);
}
if(ace2005DirName != null){
extractDocList(fileList, ace2005DirName, ace2005Domains, "/timex2norm");
}
for(File sgmFile: fileList){
result.add(new ACEDocument(sgmFile.getAbsolutePath()));
}
return result;
}
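/**
* Collects the .sgm files under each requested domain subdirectory of the
* given ACE directory, optionally descending into an additional subpath
* (e.g., "/timex2norm" for ACE 2005).
*/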
private static void extractDocList(List<File> fileList, String aceDirName, Collection<String> aceDomains, String... additionalPath) {
File aceDir = new File(aceDirName);
for(File subdir: aceDir.listFiles()){
if(!subdir.isDirectory()){
continue;
}
if(!aceDomains.contains(subdir.getName())) continue;
if(additionalPath.length > 0){
subdir = new File(subdir.getAbsolutePath()+additionalPath[0]);
}
for(File sgmFile: subdir.listFiles()){
if(!sgmFile.getName().endsWith(".sgm")){
continue;
}
fileList.add(sgmFile);
}
}
}
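/**
* Tallies per-type and per-subtype counts of objects and their mentions in
* one document, and verifies that each mention's recorded text matches the
* document text at its span.
*/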
private static void count(ACEDocument doc, List<? extends ACEObject> objects, Map<ACEEventArgumentType, Integer> objectCountMap, Map<ACEEventArgumentType, Integer> mentionCountMap, int[] objectCount, int[] mentionCount){
for(ACEObject object: objects){
if(object.mentions().isEmpty() && object.type() != ACERelation.ACERelationType.METONYMY){
System.out.println("Non-metonymy empty mention set at "+doc.uri+": "+object.id);
}
int count = objectCountMap.getOrDefault(object.type(), 0);
objectCountMap.put(object.type(), count+1);
count = objectCountMap.getOrDefault(object.subtype(), 0);
objectCountMap.put(object.subtype(), count+1);
count = mentionCountMap.getOrDefault(object.type(), 0);
// Metonymy relations do not have mentions
mentionCountMap.put(object.type(), count+Math.max(1, object.mentions().size()));
count = mentionCountMap.getOrDefault(object.subtype(), 0);
mentionCountMap.put(object.subtype(), count+Math.max(1, object.mentions().size()));
objectCount[0] += 1;
mentionCount[0] += object.mentions().size();
for(ACEObjectMention<?> mention: object.mentions()){
if(!mention.text.equals(unescape(mention.getText(doc.text)))){
System.err.println("===TEXT===");
System.err.println(doc.text);
System.err.println("===FULL TEXT===");
System.err.println(doc.fullText);
System.err.println("===SGM===");
System.err.println(doc.uri);
System.err.println("===TEXT LENGTH===");
System.err.println(doc.text.length());
System.err.println("===OFFSET===");
System.err.println(doc.offset);
System.err.println("===MENTION===");
System.err.println(mention.text);
System.err.println(mention.span);
throw new RuntimeException(mention.text+" != "+unescape(mention.getText(doc.text)));
}
}
}
}
private static void printHelp(){
printHelp(null);
}
private static void printHelp(String message){
if(message != null){
System.out.println(message);
System.out.println();
}
System.out.println(
"Usage: java -jar acereader-0.1.jar -ace2004Dir -ace2005Dir \n"
+ "\t[-ace2004IncludeDomains (arabic_treebank,bnews,chinese_treebank,fisher_transcripts,nwire)]\n"
+ "\t[-ace2004ExcludeDomains (arabic_treebank,bnews,chinese_treebank,fisher_transcripts,nwire)]\n"
+ "\t[-ace2005IncludeDomains (bc,bn,cts,nw,un,bl)]\n"
+ "\t[-ace2005ExcludeDomains (bc,bn,cts,nw,un,bl)]\n"
+ "\t[-convertEntitiesToInline]\n"
+ "\t[-ace2004OutputBasePath]\n"
+ "\t[-ace2005OutputBasePath]\n"
+ "\t[-dataSplit ]\n"
+ "\t[-tokenizer (stanford|regex)]\n"
+ "\t[-posTagger (stanford)]\n"
+ "\t[-splitter (stanford)]\n"
+ "\t[-toCoNLLFormat]\n"
+ "\t[-ignoreOverlaps]\n"
+ "\t[-useBILOU]\n"
+ "\t[-splitBySentences]\n"
+ "\n"
+ "-ace2004Dir \n"
+ "\tPath to ACE2004 directory containing the domain subdirectories.\n"
+ "\n"
+ "-ace2005Dir \n"
+ "\tPath to ACE2004 directory containing the domain subdirectories.\n"
+ "\tOnly the data from timex2norm version will be used.\n"
+ "\n"
+ "-ace2004{Include,Exclude}Domains \n"
+ "\tTo include/exclude certain domains from ACE2004.\n"
+ "\tOnly one of -ace2004IncludeDomains -ace2004ExcludeDomains will take effect.\n"
+ "\tIf -ace2004IncludeDomains is specified, only those domains will be included.\n"
+ "\tIf -ace2004ExcludeDomains is specified, all except those domains will be included.\n"
+ "\tPut a subset of these separated by comma:\n"
+ "\t- arabic_treebank\n"
+ "\t- bnews\n"
+ "\t- chinese_treebank\n"
+ "\t- fisher_transcripts\n"
+ "\t- nwire\n"
+ "\n"
+ "-ace2005{Include,Exclude}Domains \n"
+ "\tTo include/exclude certain domains from ACE2005.\n"
+ "\tOnly one of -ace2005IncludeDomains -ace2005ExcludeDomains will take effect.\n"
+ "\tIf -ace2005IncludeDomains is specified, only those domains will be included.\n"
+ "\tIf -ace2005ExcludeDomains is specified, all except those domains will be included.\n"
+ "\tPut a subset of these separated by comma:\n"
+ "\t- bc\n"
+ "\t- bn\n"
+ "\t- cts\n"
+ "\t- nw\n"
+ "\t- un\n"
+ "\t- bl\n"
+ "\n"
+ "-excludeMetadata\n"
+ "\tExclude the text that comes before the tag, which includes date and article ID.\n"
+ "\n"
+ "-convertEntitiesToInline\n"
+ "\tPrint the entities into files.\n"
+ "\tNeed -ace2004OutputBasePath, -ace2005OutputBasePath, and -dataSplit options.\n"
+ "\n"
+ "-ace{2004,2005}OutputDir \n"
+ "\tThe directory for ACE2004 and ACE2005 inline output.\n"
+ "\n"
+ "-dataSplit \n"
+ "\tSplit into multiple files according to the ratio given.\n"
+ "\tYou can give two (train+test) or three (train+dev+test) values.\n"
+ "\tExamples:\n"
+ "\t-dataSplit 90,10 to split into 90% training and 10% test\n"
+ "\t-dataSplit 0.8,0.1,0.1 to split into 80% training, 10% dev, and 10% test\n"
+ "\n"
+ "-tokenizer (stanford,regex)\n"
+ "\tIf specified, the sentences will be tokenized, and the spans will be token-based.\n"
+ "\tCurrently there are two tokenizers supported: Stanford and regex-based.\n"
+ "\n"
+ "-posTagger (stanford)\n"
+ "\tIf specified, the output files will contain POS tags.\n"
+ "\tCurrently only Stanford POS tagger is supported.\n"
+ "\n"
+ "-splitter (stanford)\n"
+ "\tThe sentence splitter to split the data.\n"
+ "\tCurrently only Stanford Splitter is supported.\n"
+ "\n"
+ "-toCoNLLFormat\n"
+ "\tOutput conversion in CoNLL format.\n"
+ "\n"
+ "-ignoreOverlaps\n"
+ "\tIgnore overlapping entities by removing the shorter entity in an overlap.\n"
+ "\n"
+ "-useBILOU\n"
+ "\tTo use BILOU (Begin, Inside, Last, Outside, Unit) format instead of BIO.\n"
+ "\tOnly applicable when -toCoNLLFormat is used.\n"
+ "\n"
+ "-splitBySentences\n"
+ "\tSplit into training, development, and test based on sentences instead of documents.\n"
+ "\n"
+ "-shuffle\n"
+ "\tWhen splitting dataset, shuffle the order.\n"
+ "\n"
+ "-seed \n"
+ "\tThe seed used to initialize the Random object used to shuffle the dataset.\n"
);
if(message != null){
System.out.println("===");
System.out.println(message);
}
}
}