edu.stanford.nlp.trees.treebank.ConfigParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.trees.treebank;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
/**
 * Parses a dataset configuration file into one {@link Properties} object per dataset.
 *
 * <p>The file is read line by line. Lines that start with {@code #} and blank lines are
 * skipped. A line consisting solely of {@code ;;} closes the dataset currently being
 * built. Every other line must start with one of the registered parameter names and has
 * the form {@code PARAM=value}; the name and the value are trimmed and stored in the
 * current dataset. Parameters registered with a value pattern (the boolean flags and
 * {@code OUTPUT_ENCODING}) are only stored when the value matches that pattern.
 *
 * <p>Iterating over a {@code ConfigParser} yields the datasets collected by
 * {@link #parse()}, in file order, through a read-only iterator.
 *
 * @author Spence Green
 */
public class ConfigParser implements Iterable<Properties> {

  //The parameter names and delimiter
  private static final String DELIM = "=";

  public static final String paramName = "NAME";                  //Name of the dataset
  public static final String paramPath = "PATH";                  //Path to files in the dataset
  public static final String paramOutputPath = "OUTPUT_PATH";     // Where to output the results
  public static final String paramSplit = "SPLIT";                //A file listing filenames in a split
  public static final String paramEncode = "OUTPUT_ENCODING";     //Preferred output encoding [Buckwalter | UTF8]
  public static final String paramMapping = "MAPPING";            //Path to an LDC-format POS tag mapping file
  public static final String paramDistrib = "DISTRIB";            //Add to distribution or not [true | false]
  public static final String paramType = "TYPE";                  //Specify the Dataset type to use
  public static final String paramFlat = "FLAT";                  //Output terminals only
  public static final String paramDT = "USEDET";                  //Add a determiner to the Bies tag ("Stanfordization")
  public static final String paramTagDelim = "TAGDELIM";          //Delimiter for separating words and tags in tagger datasets
  public static final String paramFileExt = "FILEEXT";            //File extension for the treebank files
  public static final String paramLexMapper = "LEXMAPPER";        //Class name for the LexMapper to use
  public static final String paramLexMapOptions = "LEXOPTS";      //Option string for the lexmapper (comma-separated)
  public static final String paramNoDashTags = "NODASHTAGS";      //Remove ATB dash tags
  public static final String paramAddRoot = "ADDROOT";            //Add a node "ROOT" to every tree
  public static final String paramUnEscape = "UNESCAPE";          //Remove LDC/ATB special characters from flattened output
  public static final String paramPosMapper = "POSMAPPER";        //Class name for POS mapper to use
  public static final String paramPosMapOptions = "POSOPTS";      //Options string for the posmapper (comma-separated)
  public static final String paramMaxLen = "MAXLEN";              //Max yield of the trees in the data set
  public static final String paramMorph = "MORPH";                //Add the pre-terminal morphological analysis to the leaf (using the delimiter)
  public static final String paramTransform = "TVISITOR";         //Apply a custom TreeVisitor to each tree in the dataset
  public static final String paramCCTagset = "CC_TAGSET";         // specific to French. TODO: move it to the French dataset

  //Absolute parameters: the line must start with the exact name followed by the delimiter
  private static final Pattern matchName = Pattern.compile(paramName + DELIM);
  private static final Pattern matchSplit = Pattern.compile(paramSplit + DELIM);
  private static final Pattern matchDistrib = Pattern.compile(paramDistrib + DELIM);
  private static final Pattern matchType = Pattern.compile(paramType + DELIM);
  private static final Pattern matchFlat = Pattern.compile(paramFlat + DELIM);
  private static final Pattern matchDT = Pattern.compile(paramDT + DELIM);
  private static final Pattern matchTagDelim = Pattern.compile(paramTagDelim + DELIM);
  private static final Pattern matchFileExt = Pattern.compile(paramFileExt + DELIM);
  private static final Pattern matchLexMapper = Pattern.compile(paramLexMapper + DELIM);
  private static final Pattern matchNoDashTags = Pattern.compile(paramNoDashTags + DELIM);
  private static final Pattern matchAddRoot = Pattern.compile(paramAddRoot + DELIM);
  private static final Pattern matchUnEscape = Pattern.compile(paramUnEscape + DELIM);
  private static final Pattern matchLexMapOptions = Pattern.compile(paramLexMapOptions + DELIM);
  private static final Pattern matchPosMapper = Pattern.compile(paramPosMapper + DELIM);
  private static final Pattern matchPosMapOptions = Pattern.compile(paramPosMapOptions + DELIM);
  private static final Pattern matchMaxLen = Pattern.compile(paramMaxLen + DELIM);
  private static final Pattern matchMorph = Pattern.compile(paramMorph + DELIM);
  private static final Pattern matchTransform = Pattern.compile(paramTransform + DELIM);
  private static final Pattern matchEncode = Pattern.compile(paramEncode + DELIM);
  private static final Pattern matchEncodeArgs = Pattern.compile("Buckwalter|UTF8");
  private static final Pattern matchCCTagset = Pattern.compile(paramCCTagset + DELIM);
  private static final Pattern booleanArgs = Pattern.compile("true|false");

  //Pre-fix parameters: only the start of the line is matched, so the name may carry a suffix
  public static final Pattern matchPath = Pattern.compile(paramPath);
  public static final Pattern matchOutputPath = Pattern.compile(paramOutputPath);
  public static final Pattern matchMapping = Pattern.compile(paramMapping);

  //Patterns for the parser
  private static final Pattern setDelim = Pattern.compile(";;");
  private static final Pattern skipLine = Pattern.compile("^#|^\\s*$");

  /**
   * Recognition rule for one parameter: a pattern that must match the start of the line,
   * plus an optional pattern that the trimmed value must match in full.
   */
  private static final class ParamTemplate {
    private final Pattern namePattern;
    private final Pattern valuePattern; // null means any value is accepted

    private ParamTemplate(Pattern namePattern, Pattern valuePattern) {
      this.namePattern = namePattern;
      this.valuePattern = valuePattern;
    }
  }

  //Other members
  private final List<Properties> datasetList;
  private final Map<String, ParamTemplate> patternsMap;
  private final String configFile;

  /**
   * Creates a parser for the given configuration file. The file is not opened until
   * {@link #parse()} is called.
   *
   * @param filename path of the configuration file
   */
  public ConfigParser(String filename) {
    configFile = filename;
    datasetList = new ArrayList<>();

    //Insertion-ordered so that parameters are always checked in registration order
    patternsMap = new LinkedHashMap<>();
    register(paramName, matchName, null);
    register(paramType, matchType, null);
    register(paramPath, matchPath, null);
    register(paramOutputPath, matchOutputPath, null);
    register(paramSplit, matchSplit, null);
    register(paramTagDelim, matchTagDelim, null);
    register(paramFileExt, matchFileExt, null);
    register(paramEncode, matchEncode, matchEncodeArgs);
    register(paramMapping, matchMapping, null);
    register(paramDistrib, matchDistrib, booleanArgs);
    register(paramFlat, matchFlat, booleanArgs);
    register(paramDT, matchDT, booleanArgs);
    register(paramLexMapper, matchLexMapper, null);
    register(paramNoDashTags, matchNoDashTags, booleanArgs);
    register(paramAddRoot, matchAddRoot, booleanArgs);
    register(paramUnEscape, matchUnEscape, booleanArgs);
    register(paramLexMapOptions, matchLexMapOptions, null);
    register(paramPosMapper, matchPosMapper, null);
    register(paramPosMapOptions, matchPosMapOptions, null);
    register(paramMaxLen, matchMaxLen, null);
    register(paramMorph, matchMorph, null);
    register(paramTransform, matchTransform, null);
    register(paramCCTagset, matchCCTagset, null);
  }

  /** Adds one parameter recognition rule to the lookup table. */
  private void register(String param, Pattern namePattern, Pattern valuePattern) {
    patternsMap.put(param, new ParamTemplate(namePattern, valuePattern));
  }

  /**
   * Returns a read-only iterator over the datasets collected so far.
   *
   * @return an iterator whose {@code remove()} is unsupported
   */
  @Override
  public Iterator<Properties> iterator() {
    return Collections.unmodifiableList(datasetList).iterator();
  }

  /**
   * Reads the configuration file and appends one {@link Properties} object per dataset
   * to this parser.
   *
   * <p>Malformed parameters (not exactly one name and one value around the delimiter)
   * and values rejected by the parameter's value pattern are reported on standard error
   * and skipped. A missing or unreadable file is also reported on standard error
   * without throwing. The reader is closed on every exit path.
   *
   * @throws IllegalArgumentException if a line that is neither a comment, blank, nor a
   *     closing dataset delimiter starts with no registered parameter name
   */
  public void parse() {
    int lineNum = 0;
    try (LineNumberReader reader = new LineNumberReader(new FileReader(configFile))) {
      Properties paramsForDataset = null;

      //readLine() returning null is the end-of-file signal; ready() is not
      for (String line; (line = reader.readLine()) != null; ) {
        lineNum = reader.getLineNumber(); //For exception handling

        if (skipLine.matcher(line).lookingAt()) {
          continue;
        }

        //A delimiter only closes a dataset when one is open
        if (setDelim.matcher(line).matches() && paramsForDataset != null) {
          datasetList.add(paramsForDataset);
          paramsForDataset = null;
          continue;
        }

        if (paramsForDataset == null) {
          paramsForDataset = new Properties();
        }

        if (!parseParameter(line, lineNum, paramsForDataset)) {
          //Plain text: this string is also the exception message, so no format specifiers
          String error = this.getClass().getName() + ": Unknown token in " + configFile
              + " (line " + lineNum + ")";
          System.err.println(error);
          throw new IllegalArgumentException(error);
        }
      }

      //The last dataset need not be followed by a delimiter
      if (paramsForDataset != null) {
        datasetList.add(paramsForDataset);
      }

    } catch (FileNotFoundException e) {
      System.err.printf("%s: Cannot open file %s%n", this.getClass().getName(), configFile);
    } catch (IOException e) {
      System.err.printf("%s: Error reading %s (line %d)%n", this.getClass().getName(), configFile, lineNum);
    }
  }

  /**
   * Tries to interpret one line as a parameter assignment and store it.
   *
   * @param line the raw line from the configuration file
   * @param lineNum line number used in diagnostics
   * @param params dataset that receives the parameter when it is well formed and legal
   * @return {@code true} if the line starts with a registered parameter name (even when
   *     the assignment was then skipped as malformed or illegal), {@code false} otherwise
   */
  private boolean parseParameter(String line, int lineNum, Properties params) {
    for (ParamTemplate template : patternsMap.values()) {
      if (!template.namePattern.matcher(line).lookingAt()) {
        continue;
      }

      //Exactly one name and one value are required; a value containing the delimiter is rejected
      String[] tokens = line.split(DELIM);
      if (tokens.length != 2) {
        System.err.printf("%s: Skipping malformed parameter in %s (line %d)%n", this.getClass().getName(), configFile, lineNum);
        return true;
      }

      String actualParam = tokens[0].trim();
      String paramValue = tokens[1].trim();

      if (template.valuePattern != null && !template.valuePattern.matcher(paramValue).matches()) {
        System.err.printf("%s: Skipping illegal parameter value in %s (line %d)%n", this.getClass().getName(), configFile, lineNum);
        return true;
      }

      params.setProperty(actualParam, paramValue);
      return true;
    }
    return false;
  }

  /**
   * Summarizes the loaded datasets, one numbered line per dataset, using the dataset's
   * {@code NAME} parameter or {@code UNKNOWN NAME} when it has none.
   */
  @Override
  public String toString() {
    final int numDatasets = datasetList.size();
    StringBuilder sb = new StringBuilder(String.format("Loaded %d datasets: %n", numDatasets));

    int dataSetNum = 1;
    for (Properties sm : datasetList) {
      String name = sm.containsKey(paramName) ? sm.getProperty(paramName) : "UNKNOWN NAME";
      sb.append(String.format(" %d: %s%n", dataSetNum++, name));
    }
    return sb.toString();
  }

  /**
   * Parses {@code configurations/sample.conf} and prints every dataset's parameters.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    ConfigParser cp = new ConfigParser("configurations/sample.conf");
    cp.parse();

    System.out.println(cp.toString());

    for (Properties sm : cp) {
      System.out.println("--------------------");
      for (String key : sm.stringPropertyNames()) {
        System.out.printf(" %s: %s%n", key, sm.get(key));
      }
    }
  }
}