/**
*
*/
package justhalf.nlp.reader.acereader;
import static justhalf.nlp.reader.acereader.ACEDocument.unescape;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.xml.sax.SAXException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;
import justhalf.nlp.postagger.POSTagger;
import justhalf.nlp.postagger.StanfordPOSTagger;
import justhalf.nlp.reader.acereader.ACEEntity.ACEEntitySubType;
import justhalf.nlp.reader.acereader.ACEEntity.ACEEntityType;
import justhalf.nlp.reader.acereader.ACEEvent.ACEEventSubType;
import justhalf.nlp.reader.acereader.ACEEvent.ACEEventType;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationSubType;
import justhalf.nlp.reader.acereader.ACERelation.ACERelationType;
import justhalf.nlp.reader.acereader.ACEValue.ACEValueSubType;
import justhalf.nlp.reader.acereader.ACEValue.ACEValueType;
import justhalf.nlp.sentencesplitter.SentenceSplitter;
import justhalf.nlp.sentencesplitter.StanfordSentenceSplitter;
import justhalf.nlp.tokenizer.RegexTokenizer;
import justhalf.nlp.tokenizer.StanfordTokenizer;
import justhalf.nlp.tokenizer.Tokenizer;
/**
* The main class to read raw ACE documents as {@link ACEDocument} objects.
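* <p>A minimal usage sketch (the directory paths are illustrative):</p>
* <pre>{@code
* List<ACEDocument> docs = ACEReader.readDocuments("/data/ace2004", "/data/ace2005");
* for(ACEDocument doc : docs){
*     System.out.println(doc.uri + ": " + doc.entities.size() + " entities");
* }
* }</pre>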
*/
public class ACEReader {
/** The complete list of domains in ACE 2004 */
public static final List<String> ACE2004_DOMAINS = Arrays.asList(
"arabic_treebank", "bnews", "chinese_treebank", "fisher_transcripts", "nwire"
);
/** The complete list of domains in ACE 2005 */
public static final List<String> ACE2005_DOMAINS = Arrays.asList(
"bc", "bn", "cts", "nw", "un", "wl"
);
public static void main(String[] args) throws FileNotFoundException{
String ace2004DirName = null;
String ace2005DirName = null;
HashSet<String> ace2004Domains = new LinkedHashSet<String>(ACE2004_DOMAINS);
HashSet<String> ace2005Domains = new LinkedHashSet<String>(ACE2005_DOMAINS);
double[] datasplit = null;
boolean convert = false;
boolean convertEntities = false;
String ace2004OutputDir = null;
String ace2005OutputDir = null;
boolean tokenize = false;
boolean posTag = false;
Tokenizer tokenizer = null;
POSTagger posTagger = null;
SentenceSplitter splitter = null;
boolean toCoNLL = false;
boolean ignoreOverlaps = false;
boolean useBILOU = false;
boolean splitByDocument = true;
boolean shuffle = false;
boolean excludeMetadata = false;
int shuffleSeed = 31;
int argIndex = 0;
while(argIndex < args.length){
switch(args[argIndex]){
case "-ace2004Dir":
ace2004DirName = args[argIndex+1];
argIndex += 2;
break;
case "-ace2005Dir":
ace2005DirName = args[argIndex+1];
argIndex += 2;
break;
case "-ace2004IncludeDomains":
ace2004Domains.clear();
ace2004Domains.addAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-ace2004ExcludeDomains":
ace2004Domains.removeAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-ace2005IncludeDomains":
ace2005Domains.clear();
ace2005Domains.addAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-ace2005ExcludeDomains":
ace2005Domains.removeAll(Arrays.asList(args[argIndex+1].split(",")));
argIndex += 2;
break;
case "-excludeMetadata":
excludeMetadata = true;
argIndex += 1;
break;
case "-convertEntitiesToInline":
convertEntities = true;
convert = true;
argIndex += 1;
break;
case "-dataSplit":
String[] tokens = args[argIndex+1].split(",");
datasplit = new double[3];
double sum = 0;
for(int i=0; i<tokens.length; i++){
datasplit[i] = Double.parseDouble(tokens[i]);
sum += datasplit[i];
}
if(tokens.length == 2){
// Two values mean train+test, with an empty dev set
datasplit[2] = datasplit[1];
datasplit[1] = 0.0;
}
// Normalize so the ratios sum to 1 (allows "90,10" as well as "0.9,0.1")
for(int i=0; i<datasplit.length; i++){
datasplit[i] /= sum;
}
argIndex += 2;
break;
// The following cases are reconstructed from the option list in printHelp();
// the constructors of the tokenizer/tagger/splitter wrappers are assumed
case "-ace2004OutputDir":
ace2004OutputDir = args[argIndex+1];
argIndex += 2;
break;
case "-ace2005OutputDir":
ace2005OutputDir = args[argIndex+1];
argIndex += 2;
break;
case "-tokenizer":
tokenize = true;
tokenizer = args[argIndex+1].equals("regex") ? new RegexTokenizer() : new StanfordTokenizer();
argIndex += 2;
break;
case "-posTagger":
posTag = true;
posTagger = new StanfordPOSTagger();
argIndex += 2;
break;
case "-splitter":
splitter = new StanfordSentenceSplitter();
argIndex += 2;
break;
case "-toCoNLLFormat":
toCoNLL = true;
argIndex += 1;
break;
case "-ignoreOverlaps":
ignoreOverlaps = true;
argIndex += 1;
break;
case "-useBILOU":
useBILOU = true;
argIndex += 1;
break;
case "-splitBySentences":
splitByDocument = false;
argIndex += 1;
break;
case "-shuffle":
shuffle = true;
argIndex += 1;
break;
case "-seed":
shuffleSeed = Integer.parseInt(args[argIndex+1]);
argIndex += 2;
break;
default:
printHelp("Unrecognized option: "+args[argIndex]);
return;
}
}
if(ace2004DirName == null && ace2005DirName == null){
printHelp("Please specify at least one of -ace2004Dir or -ace2005Dir");
return;
}
List<File> fileList = new ArrayList<File>();
if(ace2004DirName != null){
extractDocList(fileList, ace2004DirName, ace2004Domains);
}
if(ace2005DirName != null){
extractDocList(fileList, ace2005DirName, ace2005Domains, "/timex2norm");
}
String dataset = "";
if(ace2004DirName != null){
dataset += "ACE2004 ("+StringUtils.join(ace2004Domains, ",")+")";
}
if(ace2005DirName != null){
if(dataset != ""){
dataset += " and ";
}
dataset += "ACE2005 ("+StringUtils.join(ace2005Domains, ",")+")";
}
System.out.println("Extracting data from "+dataset);
// Start reading data
List<ACEDocument> ace2004Docs = new ArrayList<ACEDocument>();
List<ACEDocument> ace2005Docs = new ArrayList<ACEDocument>();
List<ACEDocument> docs = new ArrayList<ACEDocument>();
int docCount = 0;
int[] entityCount = new int[1];
int[] entityMentionCount = new int[1];
int[] valueCount = new int[1];
int[] valueMentionCount = new int[1];
int[] relationCount = new int[1];
int[] relationMentionCount = new int[1];
int[] eventCount = new int[1];
int[] eventMentionCount = new int[1];
int overlapCount = 0;
int allLowercaseCount = 0;
Map<Integer, Integer> wordCountInMention = new HashMap<Integer, Integer>();
Map<ACEEventArgumentType, Integer> entityTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> relationTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> valueTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> eventTypeCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> entityTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> relationTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> valueTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
Map<ACEEventArgumentType, Integer> eventTypeMentionCount = new HashMap<ACEEventArgumentType, Integer>();
for(File sgmFile: fileList){
try {
ACEDocument doc = new ACEDocument(sgmFile.getAbsolutePath(), excludeMetadata);
docCount++;
// printMentions(doc, doc.mentions);
// Count mentions and objects
count(doc, doc.entities, entityTypeCount, entityTypeMentionCount, entityCount, entityMentionCount);
count(doc, doc.relations, relationTypeCount, relationTypeMentionCount, relationCount, relationMentionCount);
count(doc, doc.values, valueTypeCount, valueTypeMentionCount, valueCount, valueMentionCount);
count(doc, doc.events, eventTypeCount, eventTypeMentionCount, eventCount, eventMentionCount);
// Count mention overlaps
// (reconstructed) per-mention statistics: casing, word counts, and overlaps
for(int i=0; i<doc.entityMentions.size(); i++){
ACEEntityMention mention = doc.entityMentions.get(i);
if(mention.text.equals(mention.text.toLowerCase())){
allLowercaseCount++;
}
int numWords = mention.text.split("\\s+").length;
wordCountInMention.put(numWords, wordCountInMention.getOrDefault(numWords, 0)+1);
for(int j=i+1; j<doc.entityMentions.size(); j++){
if(mention.overlapsWith(doc.entityMentions.get(j))){
overlapCount++;
}
}
}
boolean isACE2004 = ace2004DirName != null
&& sgmFile.getAbsolutePath().startsWith(new File(ace2004DirName).getAbsolutePath());
if(isACE2004){
ace2004Docs.add(doc);
} else {
ace2005Docs.add(doc);
}
docs.add(doc);
} catch (IOException | SAXException e){
System.err.println("Error reading "+sgmFile.getAbsolutePath());
e.printStackTrace();
}
}
// Print the collected statistics
System.out.println("Read "+docCount+" documents");
System.out.println("Entities: "+entityCount[0]+" ("+entityMentionCount[0]+" mentions): "+entityTypeMentionCount);
System.out.println("Relations: "+relationCount[0]+" ("+relationMentionCount[0]+" mentions): "+relationTypeMentionCount);
System.out.println("Values: "+valueCount[0]+" ("+valueMentionCount[0]+" mentions): "+valueTypeMentionCount);
System.out.println("Events: "+eventCount[0]+" ("+eventMentionCount[0]+" mentions): "+eventTypeMentionCount);
System.out.println("Overlapping entity mention pairs: "+overlapCount);
System.out.println("All-lowercase entity mentions: "+allLowercaseCount);
System.out.println("Entity mention word counts: "+wordCountInMention);
if(convert){
if(ace2004Docs.size() > 0){
System.out.println("Printing ACE2004 dataset to "+ace2004OutputDir+"/{train,dev,test}.data");
printDataset(ace2004OutputDir, ace2004Docs, datasplit, convertEntities,
(tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null, splitter,
toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed);
}
if(ace2005Docs.size() > 0){
System.out.println("Printing ACE2005 dataset to "+ace2005OutputDir+"/{train,dev,test}.data");
printDataset(ace2005OutputDir, ace2005Docs, datasplit, convertEntities,
(tokenize || toCoNLL) ? tokenizer : null, posTag ? posTagger : null,
splitter, toCoNLL, ignoreOverlaps, useBILOU, splitByDocument, shuffle, shuffleSeed);
}
}
}
private static void printDataset(String outputDir, List<ACEDocument> docs, double[] datasplit,
boolean convertEntities, Tokenizer tokenizer, POSTagger posTagger, SentenceSplitter splitter,
boolean toCoNLL, boolean ignoreOverlaps, boolean useBILOU, boolean splitByDocument,
boolean shuffle, int shuffleSeed) throws FileNotFoundException {
List<ACESentence> trainSentences = new ArrayList<ACESentence>();
List<ACESentence> devSentences = new ArrayList<ACESentence>();
List<ACESentence> testSentences = new ArrayList<ACESentence>();
if(splitByDocument){
List<ACEDocument> trainDocs = new ArrayList<ACEDocument>();
List<ACEDocument> devDocs = new ArrayList<ACEDocument>();
List<ACEDocument> testDocs = new ArrayList<ACEDocument>();
splitData(docs, trainDocs, devDocs, testDocs, datasplit, shuffle, shuffleSeed);
trainSentences = getSentences(trainDocs, splitter, ignoreOverlaps);
devSentences = getSentences(devDocs, splitter, ignoreOverlaps);
testSentences = getSentences(testDocs, splitter, ignoreOverlaps);
} else {
List<ACESentence> aceSentences = getSentences(docs, splitter, ignoreOverlaps);
trainSentences = new ArrayList<ACESentence>();
devSentences = new ArrayList<ACESentence>();
testSentences = new ArrayList<ACESentence>();
splitData(aceSentences, trainSentences, devSentences, testSentences, datasplit, shuffle, shuffleSeed);
}
writeData(trainSentences, outputDir, "/train.data", tokenizer, posTagger, toCoNLL, useBILOU);
writeData(devSentences, outputDir, "/dev.data", tokenizer, posTagger, toCoNLL, useBILOU);
writeData(testSentences, outputDir, "/test.data", tokenizer, posTagger, toCoNLL, useBILOU);
}
/**
* Split documents into sentences with their corresponding annotations (entities, relations,
* events, timexes, values)
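* <p>A usage sketch, assuming the no-argument {@link StanfordSentenceSplitter} constructor:</p>
* <pre>{@code
* SentenceSplitter splitter = new StanfordSentenceSplitter();
* List<ACESentence> sentences = ACEReader.getSentences(docs, splitter, true);
* }</pre>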
* @param docs The list of ACEDocument to be split
* @param splitter The sentence splitter
* @param ignoreOverlappingEntities Whether to ignore overlapping entities by removing the
* shorter one when there is an overlap.
* @return The list of {@link ACESentence} objects with sentence-relative entity spans.
*/
public static List<ACESentence> getSentences(List<ACEDocument> docs, SentenceSplitter splitter,
boolean ignoreOverlappingEntities) {
List<ACESentence> aceSentences = new ArrayList<ACESentence>();
for(ACEDocument doc: docs){
for(CoreLabel sentence: fixSplit(splitter.split(doc.text))){
Span sentenceSpan = new Span(sentence.beginPosition(), sentence.endPosition());
ACESentence aceSentence = new ACESentence(doc, sentenceSpan, sentence.value());
for(ACEEntityMention mention: doc.entityMentions){
if(sentenceSpan.contains(mention.span)){
ACEEntityMention newMention = new ACEEntityMention(mention);
newMention.span.start -= sentenceSpan.start;
newMention.span.end -= sentenceSpan.start;
newMention.headSpan.start -= sentenceSpan.start;
newMention.headSpan.end -= sentenceSpan.start;
boolean add = true;
if(ignoreOverlappingEntities){
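// Resolve overlaps by keeping the longer mention: drop previously added
// mentions that are shorter, or skip the new mention if it is shorter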
for(int i=aceSentence.entities.size()-1; i >= 0; i--){
ACEEntityMention existingMention = aceSentence.entities.get(i);
if(newMention.overlapsWith(existingMention)){
if(newMention.span.length() > existingMention.span.length()){
aceSentence.entities.remove(i);
} else {
add = false;
break;
}
}
}
}
if(add){
aceSentence.addEntityMention(newMention);
}
}
}
for(ACERelationMention relation: doc.relationMentions){
if(sentenceSpan.contains(relation.span)){
aceSentence.addRelationMention(relation);
}
}
for(ACEEventMention event: doc.eventMentions){
if(sentenceSpan.contains(event.span)){
aceSentence.addEventMention(event);
}
}
for(ACETimexMention timex: doc.timexMentions){
if(sentenceSpan.contains(timex.span)){
aceSentence.addTimexMention(timex);
}
}
for(ACEValueMention value: doc.valueMentions){
if(sentenceSpan.contains(value.span)){
aceSentence.addValueMention(value);
}
}
aceSentences.add(aceSentence);
}
}
return aceSentences;
}
private static List<CoreLabel> fixSplit(List<CoreLabel> sentences){
List<CoreLabel> result = new ArrayList<CoreLabel>();
for(int i=0; i<sentences.size(); i++){
// Placeholder: the original sentence post-processing is not available in
// this source; sentences pass through unchanged
result.add(sentences.get(i));
}
return result;
}
private static List<CoreLabel> fixTokens(List<CoreLabel> tokens){
List<CoreLabel> result = new ArrayList<CoreLabel>();
for(int i=0; i<tokens.size(); i++){
// Placeholder: the original token post-processing is not available in
// this source; tokens pass through unchanged
result.add(tokens.get(i));
}
return result;
}
private static <T> void splitData(List<T> aceObjects, List<T> trainObjects, List<T> devObjects,
List<T> testObjects, double[] datasplit, boolean shuffle, int shuffleSeed){
int total = aceObjects.size();
int trainSize = (int)(datasplit[0]*total);
int devSize = (int)(datasplit[1]*total);
int testSize = (int)(datasplit[2]*total);
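// Integer truncation can make the three sizes sum to less than the total;
// give any remainder to the training set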
if(trainSize + devSize + testSize != total){
trainSize -= trainSize + devSize + testSize - total;
}
List<T> tmpObjects = new ArrayList<T>();
tmpObjects.addAll(aceObjects);
if(shuffle){
Collections.shuffle(tmpObjects, new Random(shuffleSeed));
}
trainObjects.addAll(tmpObjects.subList(0, trainSize));
devObjects.addAll(tmpObjects.subList(trainSize, trainSize + devSize));
testObjects.addAll(tmpObjects.subList(trainSize + devSize, total));
String typeName = tmpObjects.get(0).getClass().getName();
typeName = typeName.substring(typeName.lastIndexOf(".")+1);
System.out.println("Number of objects ("+typeName+"):");
System.out.println("Training: "+trainObjects.size());
System.out.println("Dev: "+devObjects.size());
System.out.println("Test: "+testObjects.size());
}
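/** Prints the number of entity mentions per entity type over the given sentences. */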
private static void printStatistics(List<ACESentence> sentences){
Map<String, Integer> counts = new HashMap<String, Integer>();
for(ACESentence sentence: sentences){
for(ACEEntityMention entity: sentence.entities){
if(!counts.containsKey(entity.entity.type.name())){
counts.put(entity.entity.type.name(), 0);
}
counts.put(entity.entity.type.name(), counts.get(entity.entity.type.name())+1);
}
}
System.out.println("Statistics:");
for(String type: sorted(counts.keySet())){
System.out.println(type+": "+counts.get(type));
}
}
private static <T extends Comparable<T>> List<T> sorted(Collection<T> coll){
List<T> result = new ArrayList<T>();
result.addAll(coll);
Collections.sort(result);
return result;
}
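/**
* Writes one block per sentence to outputDir+name: the (optionally tokenized)
* sentence text, an optional line of POS tags, and the entity mentions as
* "start,end,headStart,headEnd type" spans separated by '|'; or token-per-line
* CoNLL output when toCoNLL is set.
*/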
private static void writeData(List<ACESentence> sentences, String outputDir, String name,
Tokenizer tokenizer, POSTagger posTagger,
boolean toCoNLL, boolean useBILOU) throws FileNotFoundException{
PrintWriter printer = new PrintWriter(new File(outputDir+name));
for(ACESentence sentence: sentences){
if(tokenizer != null){
List<CoreLabel> tokens = fixTokens(tokenizer.tokenize(sentence.text));
if(posTagger != null){
posTagger.tagCoreLabels(tokens);
}
if(toCoNLL){
List<WordLabel> outputTokens = spansToLabels(sentence.entities, tokens, useBILOU);
if(posTagger != null){
for(int i=0; i<tokens.size(); i++){
// (reconstructed) CoNLL output with POS tags: word, tag, and label per line
printer.println(escapeBracket(tokens.get(i).word())+" "+tokens.get(i).tag()+" "+outputTokens.get(i).form);
}
} else {
for(int i=0; i<tokens.size(); i++){
// (reconstructed) CoNLL output without POS tags: word and label per line
printer.println(escapeBracket(tokens.get(i).word())+" "+outputTokens.get(i).form);
}
}
printer.println();
} else {
StringBuilder stringBuilder = new StringBuilder();
for(CoreLabel token: tokens){
if(stringBuilder.length() > 0){
stringBuilder.append(" ");
}
stringBuilder.append(token.value());
token.setWord(escapeBracket(token.word()));
}
printer.println(stringBuilder.toString());
if(posTagger != null){
stringBuilder = new StringBuilder();
for(CoreLabel token: tokens){
if(stringBuilder.length() > 0){
stringBuilder.append(" ");
}
stringBuilder.append(token.tag());
}
printer.println(stringBuilder.toString());
}
stringBuilder = new StringBuilder();
for(ACEEntityMention mention: sentence.entities){
Span wordSpan = findWordSpan(mention.span, tokens);
Span headWordSpan = findWordSpan(mention.headSpan, tokens);
if(stringBuilder.length() > 0){
stringBuilder.append("|");
}
stringBuilder.append(String.format("%s,%s,%s,%s %s", wordSpan.start, wordSpan.end, headWordSpan.start, headWordSpan.end, mention.label.form));
}
printer.println(stringBuilder.toString());
printer.println();
}
} else {
printer.println(sentence.text.replaceAll("[\n\t]", " "));
StringBuilder stringBuilder = new StringBuilder();
for(ACEEntityMention mention: sentence.entities){
Span span = mention.span;
Span headSpan = mention.headSpan;
if(stringBuilder.length() > 0){
stringBuilder.append("|");
}
stringBuilder.append(String.format("%s,%s,%s,%s %s", span.start, span.end, headSpan.start, headSpan.end, mention.label.form));
}
printer.println(stringBuilder.toString());
printer.println();
}
}
printer.close();
printStatistics(sentences);
}
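/**
* Converts character-based entity mention spans into one label per token,
* in BIO format, or BILOU when useBILOU is set.
*/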
private static List<WordLabel> spansToLabels(List<ACEEntityMention> mentions, List<CoreLabel> tokens, boolean useBILOU){
WordLabel[] result = new WordLabel[tokens.size()];
Arrays.fill(result, null);
for(ACEEntityMention mention: mentions){
Span span = findWordSpan(mention.span, tokens);
String type = mention.label.form;
for(int i=span.start; i<span.end && i<result.length; i++){
// (reconstructed) assumes a WordLabel(String) constructor; adjust to the actual API
result[i] = new WordLabel(((i == span.start) ? "B-" : "I-")+type);
}
}
List<WordLabel> labels = new ArrayList<WordLabel>();
for(int i=0; i<result.length; i++){
labels.add(result[i] == null ? new WordLabel("O") : result[i]);
}
if(useBILOU){
// Convert BIO to BILOU: a single-token B- becomes U-, and the last token
// of a mention becomes L-
for(int i=0; i<labels.size(); i++){
String form = labels.get(i).form;
boolean nextIsInside = i+1 < labels.size() && labels.get(i+1).form.startsWith("I-");
if(form.startsWith("B-") && !nextIsInside){
labels.set(i, new WordLabel("U-"+form.substring(2)));
} else if(form.startsWith("I-") && !nextIsInside){
labels.set(i, new WordLabel("L-"+form.substring(2)));
}
}
}
return labels;
}
private static Span findWordSpan(Span mention, List<CoreLabel> tokens){
int start = -1;
int end = -1;
for(int i=0; i<tokens.size(); i++){
CoreLabel token = tokens.get(i);
if(token.endPosition() > mention.start && start == -1){
start = i;
}
if(token.beginPosition() < mention.end && token.endPosition() >= mention.end){
end = i+1;
}
}
if(start == -1 || end == -1){
System.out.println("Mention ["+mention.start+","+mention.end+"] not found in ["+tokens.get(0).beginPosition()+","+tokens.get(tokens.size()-1).endPosition()+"]");
System.out.print("[");
for(CoreLabel token: tokens){
System.out.print(token.value()+"("+token.beginPosition()+","+token.endPosition()+") ");
}
System.out.println("]");
}
return new Span(start, end);
}
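/**
* Escapes round brackets into Penn Treebank token forms. Referenced by
* writeData; the original body is not visible in this source, so this is a
* minimal sketch assuming standard PTB bracket escaping.
*/
private static String escapeBracket(String word){
return word.replace("(", "-LRB-").replace(")", "-RRB-");
}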
/**
* Reads directories containing ACE 2004 and/or ACE 2005 data and returns them as {@link ACEDocument} objects.
* @param ace2004DirName The path to ACE 2004 directory. Can be null.
* @param ace2005DirName The path to ACE 2005 directory. Can be null.
* @return The list of {@link ACEDocument} objects read.
* @throws IOException
* @throws SAXException
*/
public static List<ACEDocument> readDocuments(String ace2004DirName, String ace2005DirName) throws IOException, SAXException{
return readDocuments(ace2004DirName, ace2005DirName, ACE2004_DOMAINS, ACE2005_DOMAINS);
}
/**
* Reads directories containing ACE 2004 and/or ACE 2005 data and returns them as {@link ACEDocument} objects.
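* <p>For example, to read only the newswire portions (a sketch, with illustrative paths):</p>
* <pre>{@code
* List<ACEDocument> docs = ACEReader.readDocuments("/data/ace2004", "/data/ace2005",
*         new String[]{"nwire"}, new String[]{"nw"});
* }</pre>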
* @param ace2004DirName The path to ACE 2004 directory. Can be null.
* @param ace2005DirName The path to ACE 2005 directory. Can be null.
* @param ace2004Domains The list of domains for ACE 2004 to be included.
* @param ace2005Domains The list of domains for ACE 2005 to be included.
* @return The list of {@link ACEDocument} objects read.
* @throws IOException
* @throws SAXException
*/
public static List<ACEDocument> readDocuments(String ace2004DirName, String ace2005DirName, String[] ace2004Domains, String[] ace2005Domains) throws IOException, SAXException{
return readDocuments(ace2004DirName, ace2005DirName, Arrays.asList(ace2004Domains), Arrays.asList(ace2005Domains));
}
/**
* Reads directories containing ACE 2004 and/or ACE 2005 data and returns them as {@link ACEDocument} objects.
* @param ace2004DirName The path to ACE 2004 directory. Can be null.
* @param ace2005DirName The path to ACE 2005 directory. Can be null.
* @param ace2004Domains The list of domains for ACE 2004 to be included.
* @param ace2005Domains The list of domains for ACE 2005 to be included.
* @return The list of {@link ACEDocument} objects read.
* @throws IOException
* @throws SAXException
*/
public static List<ACEDocument> readDocuments(String ace2004DirName, String ace2005DirName, List<String> ace2004Domains, List<String> ace2005Domains) throws IOException, SAXException{
List<ACEDocument> result = new ArrayList<ACEDocument>();
List<File> fileList = new ArrayList<File>();
if(ace2004DirName != null){
extractDocList(fileList, ace2004DirName, ace2004Domains);
}
if(ace2005DirName != null){
extractDocList(fileList, ace2005DirName, ace2005Domains, "/timex2norm");
}
for(File sgmFile: fileList){
result.add(new ACEDocument(sgmFile.getAbsolutePath()));
}
return result;
}
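/**
* Collects the .sgm files under each requested domain subdirectory of the
* given ACE directory, optionally descending into an additional subpath
* (e.g., "/timex2norm" for ACE 2005).
*/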
private static void extractDocList(List<File> fileList, String aceDirName, Collection<String> aceDomains, String... additionalPath) {
File aceDir = new File(aceDirName);
for(File subdir: aceDir.listFiles()){
if(!subdir.isDirectory()){
continue;
}
if(!aceDomains.contains(subdir.getName())) continue;
if(additionalPath.length > 0){
subdir = new File(subdir.getAbsolutePath()+additionalPath[0]);
}
for(File sgmFile: subdir.listFiles()){
if(!sgmFile.getName().endsWith(".sgm")){
continue;
}
fileList.add(sgmFile);
}
}
}
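/**
* Tallies per-type and per-subtype counts of objects and their mentions in
* one document, and verifies that each mention's recorded text matches the
* document text at its span.
*/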
private static void count(ACEDocument doc, List<? extends ACEObject> objects, Map<ACEEventArgumentType, Integer> objectCountMap, Map<ACEEventArgumentType, Integer> mentionCountMap, int[] objectCount, int[] mentionCount){
for(ACEObject object: objects){
if(object.mentions().isEmpty() && object.type() != ACERelation.ACERelationType.METONYMY){
System.out.println("Non-metonymy empty mention set at "+doc.uri+": "+object.id);
}
int count = objectCountMap.getOrDefault(object.type(), 0);
objectCountMap.put(object.type(), count+1);
count = objectCountMap.getOrDefault(object.subtype(), 0);
objectCountMap.put(object.subtype(), count+1);
count = mentionCountMap.getOrDefault(object.type(), 0);
// Metonymy relations do not have mentions
mentionCountMap.put(object.type(), count+Math.max(1, object.mentions().size()));
count = mentionCountMap.getOrDefault(object.subtype(), 0);
mentionCountMap.put(object.subtype(), count+Math.max(1, object.mentions().size()));
objectCount[0] += 1;
mentionCount[0] += object.mentions().size();
for(ACEObjectMention<?> mention: object.mentions()){
if(!mention.text.equals(unescape(mention.getText(doc.text)))){
System.err.println("===TEXT===");
System.err.println(doc.text);
System.err.println("===FULL TEXT===");
System.err.println(doc.fullText);
System.err.println("===SGM===");
System.err.println(doc.uri);
System.err.println("===TEXT LENGTH===");
System.err.println(doc.text.length());
System.err.println("===OFFSET===");
System.err.println(doc.offset);
System.err.println("===MENTION===");
System.err.println(mention.text);
System.err.println(mention.span);
throw new RuntimeException(mention.text+" != "+unescape(mention.getText(doc.text)));
}
}
}
}
private static void printHelp(){
printHelp(null);
}
private static void printHelp(String message){
if(message != null){
System.out.println(message);
System.out.println();
}
System.out.println(
"Usage: java -jar acereader-0.1.jar -ace2004Dir -ace2005Dir \n"
+ "\t[-ace2004IncludeDomains (arabic_treebank,bnews,chinese_treebank,fisher_transcripts,nwire)]\n"
+ "\t[-ace2004ExcludeDomains (arabic_treebank,bnews,chinese_treebank,fisher_transcripts,nwire)]\n"
+ "\t[-ace2005IncludeDomains (bc,bn,cts,nw,un,bl)]\n"
+ "\t[-ace2005ExcludeDomains (bc,bn,cts,nw,un,bl)]\n"
+ "\t[-convertEntitiesToInline]\n"
+ "\t[-ace2004OutputBasePath]\n"
+ "\t[-ace2005OutputBasePath]\n"
+ "\t[-dataSplit ]\n"
+ "\t[-tokenizer (stanford|regex)]\n"
+ "\t[-posTagger (stanford)]\n"
+ "\t[-splitter (stanford)]\n"
+ "\t[-toCoNLLFormat]\n"
+ "\t[-ignoreOverlaps]\n"
+ "\t[-useBILOU]\n"
+ "\t[-splitBySentences]\n"
+ "\n"
+ "-ace2004Dir \n"
+ "\tPath to ACE2004 directory containing the domain subdirectories.\n"
+ "\n"
+ "-ace2005Dir \n"
+ "\tPath to ACE2004 directory containing the domain subdirectories.\n"
+ "\tOnly the data from timex2norm version will be used.\n"
+ "\n"
+ "-ace2004{Include,Exclude}Domains \n"
+ "\tTo include/exclude certain domains from ACE2004.\n"
+ "\tOnly one of -ace2004IncludeDomains -ace2004ExcludeDomains will take effect.\n"
+ "\tIf -ace2004IncludeDomains is specified, only those domains will be included.\n"
+ "\tIf -ace2004ExcludeDomains is specified, all except those domains will be included.\n"
+ "\tPut a subset of these separated by comma:\n"
+ "\t- arabic_treebank\n"
+ "\t- bnews\n"
+ "\t- chinese_treebank\n"
+ "\t- fisher_transcripts\n"
+ "\t- nwire\n"
+ "\n"
+ "-ace2005{Include,Exclude}Domains \n"
+ "\tTo include/exclude certain domains from ACE2005.\n"
+ "\tOnly one of -ace2005IncludeDomains -ace2005ExcludeDomains will take effect.\n"
+ "\tIf -ace2005IncludeDomains is specified, only those domains will be included.\n"
+ "\tIf -ace2005ExcludeDomains is specified, all except those domains will be included.\n"
+ "\tPut a subset of these separated by comma:\n"
+ "\t- bc\n"
+ "\t- bn\n"
+ "\t- cts\n"
+ "\t- nw\n"
+ "\t- un\n"
+ "\t- bl\n"
+ "\n"
+ "-excludeMetadata\n"
+ "\tExclude the text that comes before the tag, which includes date and article ID.\n"
+ "\n"
+ "-convertEntitiesToInline\n"
+ "\tPrint the entities into files.\n"
+ "\tNeed -ace2004OutputBasePath, -ace2005OutputBasePath, and -dataSplit options.\n"
+ "\n"
+ "-ace{2004,2005}OutputDir \n"
+ "\tThe directory for ACE2004 and ACE2005 inline output.\n"
+ "\n"
+ "-dataSplit \n"
+ "\tSplit into multiple files according to the ratio given.\n"
+ "\tYou can give two (train+test) or three (train+dev+test) values.\n"
+ "\tExamples:\n"
+ "\t-dataSplit 90,10 to split into 90% training and 10% test\n"
+ "\t-dataSplit 0.8,0.1,0.1 to split into 80% training, 10% dev, and 10% test\n"
+ "\n"
+ "-tokenizer (stanford,regex)\n"
+ "\tIf specified, the sentences will be tokenized, and the spans will be token-based.\n"
+ "\tCurrently there are two tokenizers supported: Stanford and regex-based.\n"
+ "\n"
+ "-posTagger (stanford)\n"
+ "\tIf specified, the output files will contain POS tags.\n"
+ "\tCurrently only Stanford POS tagger is supported.\n"
+ "\n"
+ "-splitter (stanford)\n"
+ "\tThe sentence splitter to split the data.\n"
+ "\tCurrently only Stanford Splitter is supported.\n"
+ "\n"
+ "-toCoNLLFormat\n"
+ "\tOutput conversion in CoNLL format.\n"
+ "\n"
+ "-ignoreOverlaps\n"
+ "\tIgnore overlapping entities by removing the shorter entity in an overlap.\n"
+ "\n"
+ "-useBILOU\n"
+ "\tTo use BILOU (Begin, Inside, Last, Outside, Unit) format instead of BIO.\n"
+ "\tOnly applicable when -toCoNLLFormat is used.\n"
+ "\n"
+ "-splitBySentences\n"
+ "\tSplit into training, development, and test based on sentences instead of documents.\n"
+ "\n"
+ "-shuffle\n"
+ "\tWhen splitting dataset, shuffle the order.\n"
+ "\n"
+ "-seed \n"
+ "\tThe seed used to initialize the Random object used to shuffle the dataset.\n"
);
if(message != null){
System.out.println("===");
System.out.println(message);
}
}
}