All downloads are free. The search and download functionality uses the official Maven repository.

edu.stanford.nlp.trees.treebank.AbstractDataset Maven / Gradle / Ivy

package edu.stanford.nlp.trees.treebank;

import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.DataFilePaths;
import edu.stanford.nlp.util.Generics;

/**
 * Skeleton for {@link Dataset} implementations. It turns a {@link Properties}
 * configuration into the protected fields below (see {@link #setOptions(Properties)})
 * and derives the output file names; the actual work is left to the abstract
 * {@link #build()} method.
 *
 * @author Spence Green
 */
public abstract class AbstractDataset implements Dataset {

  /** Backing list for {@link #getFilenames()}; this class never adds to it itself. */
  protected final List<String> outputFileList;
  /** Mapper instantiated from the {@code ConfigParser.paramPosMapper} class name; null if unset or if loading failed. */
  protected Mapper posMapper = null;
  /** Raw value of the {@code ConfigParser.paramPosMapOptions} option. */
  protected String posMapOptions = "";
  /** Mapper instantiated from the {@code ConfigParser.paramLexMapper} class name; null if unset or if loading failed. */
  protected Mapper lexMapper = null;
  /** Raw value of the {@code ConfigParser.paramLexMapOptions} option. */
  protected String lexMapOptions = "";
  /** Output encoding; selects the {@code .utf8} / {@code .bw} part of the output file name. */
  protected Encoding encoding = Encoding.UTF8;
  /** One entry per option whose key matches {@code ConfigParser.matchPath}, in sorted key order. */
  protected final List<File> pathsToData;
  /** One entry per option whose key matches {@code ConfigParser.matchMapping}, in sorted key order. */
  protected final List<File> pathsToMappings;
  /** Filter built from the split file named by {@code ConfigParser.paramSplit}; null if unset. */
  protected FileFilter splitFilter = null;
  protected boolean addDeterminer = false;
  protected boolean removeDashTags = false;
  protected boolean addRoot = false;
  protected boolean removeEscapeTokens = false;
  protected int maxLen = Integer.MAX_VALUE;
  protected String morphDelim = null;
  /** Visitor instantiated from the {@code ConfigParser.paramTransform} class name; null if unset or if loading failed. */
  protected TreeVisitor customTreeVisitor = null;

  /** Output file name; finalized at the end of a successful {@link #setOptions(Properties)}. */
  protected String outFileName;
  /** Flat-file name; only assigned when {@link #makeFlatFile} is true. */
  protected String flatFileName;
  protected boolean makeFlatFile = false;
  /** Matches runs of whitespace; each run in the dataset name becomes a single '-'. */
  protected final Pattern fileNameNormalizer = Pattern.compile("\\s+");

  /** Never assigned in this class; left for sub-classes to populate. */
  protected Treebank treebank;
  /** Every option key seen by {@link #setOptions(Properties)}, plus the generic path/mapping markers. */
  protected final Set<String> configuredOptions;
  /** Options that must be present for {@link #setOptions(Properties)} to succeed. */
  protected final Set<String> requiredOptions;
  /** Accumulates the text returned by {@link #toString()}. */
  protected final StringBuilder toStringBuffer;

  protected String treeFileExtension = "tree";    //Current LDC releases use this extension

  /**
   * Provides access for sub-classes to the data set parameters
   */
  protected Properties options;

  /**
   * Creates an unconfigured dataset whose required options are the name, path
   * and encoding parameters defined by {@code ConfigParser}.
   */
  public AbstractDataset() {
    outputFileList = new ArrayList<>();
    pathsToData = new ArrayList<>();
    pathsToMappings = new ArrayList<>();
    toStringBuffer = new StringBuilder();

    //Read the raw file as UTF-8 irrespective of output encoding
//    treebank = new DiskTreebank(new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true), "UTF-8");

    configuredOptions = Generics.newHashSet();

    requiredOptions = Generics.newHashSet();
    requiredOptions.add(ConfigParser.paramName);
    requiredOptions.add(ConfigParser.paramPath);
    requiredOptions.add(ConfigParser.paramEncode);
  }

  public abstract void build();

  /**
   * Loads the named class through the system class loader and instantiates it
   * with its no-argument constructor.
   *
   * @param className fully qualified name of a {@link Mapper} implementation
   * @return the new mapper, or null if the class could not be found,
   *         instantiated or accessed (a message is printed to stderr)
   */
  private Mapper loadMapper(String className) {
    Mapper m = null;
    try {
      Class<?> c = ClassLoader.getSystemClassLoader().loadClass(className);
      m = (Mapper) c.newInstance();
    } catch (ClassNotFoundException e) {
      System.err.printf("%s: Mapper type %s does not exist\n", this.getClass().getName(), className);
    } catch (InstantiationException e) {
      System.err.printf("%s: Unable to instantiate mapper type %s\n", this.getClass().getName(), className);
      e.printStackTrace();
    } catch (IllegalAccessException e) {
      System.err.printf("%s: Unable to access mapper type %s\n", this.getClass().getName(), className);
    }

    return m;
  }

  /**
   * Configures this dataset from {@code opts} and derives the output file names.
   * Keys are processed in sorted order, so path and mapping entries are stored
   * in that order. Fields assigned before a failure keep their new values.
   *
   * <p>On success {@link #outFileName} is
   * {@code [outputPath + File.separator] + name + (".utf8" | ".bw") + ".txt"}, where
   * {@code name} is the trimmed dataset name with whitespace runs replaced by '-'.
   * If a flat file was requested, {@link #flatFileName} is the same stem followed
   * by {@code ".flat.txt"}.
   *
   * @param opts the dataset parameters; also stored in {@link #options}
   * @return false if any required option is missing, true otherwise
   * @throws NumberFormatException if the maximum-length option is not an integer
   */
  public boolean setOptions(Properties opts) {
    options = opts;
    List<String> sortedKeys = new ArrayList<>(opts.stringPropertyNames());
    Collections.sort(sortedKeys);
    for (String param : sortedKeys) {
      String value = opts.getProperty(param);
      configuredOptions.add(param);

      //Make matchers for the pre-fix parameters
      Matcher pathMatcher = ConfigParser.matchPath.matcher(param);
      Matcher mapMatcher = ConfigParser.matchMapping.matcher(param);

      if (pathMatcher.lookingAt()) {
        pathsToData.add(new File(value));
        // Record the generic marker so the required-options check can see that a path was given
        configuredOptions.add(ConfigParser.paramPath);
      } else if (mapMatcher.lookingAt()) {
        pathsToMappings.add(new File(value));
        configuredOptions.add(ConfigParser.paramMapping);
      } else if (param.equals(ConfigParser.paramEncode)) {
        encoding = Encoding.valueOf(value);
      } else if (param.equals(ConfigParser.paramName)) {
        Matcher inThisFilename = fileNameNormalizer.matcher(value.trim());
        outFileName = inThisFilename.replaceAll("-");
        toStringBuffer.append(String.format("Dataset Name: %s\n",value.trim()));
      } else if (param.equals(ConfigParser.paramDT)) {
        addDeterminer = Boolean.parseBoolean(value);
      } else if (param.equals(ConfigParser.paramSplit)) {
        Set<String> sm = buildSplitMap(value);
        splitFilter = new SplitFilter(sm);
      } else if (param.equals(ConfigParser.paramFlat) && Boolean.parseBoolean(value)) {
        makeFlatFile = true;
      } else if (param.equals(ConfigParser.paramFileExt)) {
        treeFileExtension = value;
      } else if (param.equals(ConfigParser.paramLexMapper)) {
        lexMapper = loadMapper(value);
      } else if (param.equals(ConfigParser.paramNoDashTags)) {
        removeDashTags = Boolean.parseBoolean(value);
      } else if (param.equals(ConfigParser.paramAddRoot)) {
        addRoot = Boolean.parseBoolean(value);
      } else if (param.equals(ConfigParser.paramUnEscape)) {
        // NOTE(review): unlike the other boolean options, the value is ignored here, so the
        // mere presence of the key enables this flag. Confirm whether that is intended.
        removeEscapeTokens = true;
      } else if (param.equals(ConfigParser.paramLexMapOptions)) {
        lexMapOptions = value;
      } else if (param.equals(ConfigParser.paramPosMapper)) {
        posMapper = loadMapper(value);
      } else if (param.equals(ConfigParser.paramPosMapOptions)) {
        posMapOptions = value;
      } else if (param.equals(ConfigParser.paramMaxLen)) {
        maxLen = Integer.parseInt(value);
      } else if (param.equals(ConfigParser.paramMorph)) {
        morphDelim = value;
      } else if (param.equals(ConfigParser.paramTransform)) {
        customTreeVisitor = loadTreeVisitor(value);
      }
    }

    if (!configuredOptions.containsAll(requiredOptions)) {
      return false;
    }

    //Finalize the output file names
    if (encoding == Encoding.UTF8) {
      outFileName += ".utf8";
    } else {
      outFileName += ".bw";
    }

    String outputPath = opts.getProperty(ConfigParser.paramOutputPath);
    if (outputPath != null) {
      outFileName = outputPath + File.separator + outFileName;
    }

    // The flat-file name must be derived before ".txt" is appended to outFileName
    if (makeFlatFile) {
      flatFileName = outFileName + ".flat.txt";
    }
    outFileName += ".txt";

    return true;
  }

  /**
   * Loads the named class through the system class loader and instantiates it
   * with its no-argument constructor.
   *
   * @param value fully qualified name of a {@link TreeVisitor} implementation
   * @return the new visitor, or null if the class could not be found,
   *         instantiated or accessed (the stack trace is printed)
   */
  private static TreeVisitor loadTreeVisitor(String value) {
    try {
      Class<?> c = ClassLoader.getSystemClassLoader().loadClass(value);

      return (TreeVisitor) c.newInstance();

    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
      e.printStackTrace();
    }

    return null;
  }

  /**
   * Reads a split file and collects its lines, each trimmed of surrounding
   * whitespace (blank lines therefore contribute the empty string).
   * I/O problems are reported on stderr rather than thrown; whatever was read
   * before the failure is still returned.
   *
   * @param path location of the split file, passed through {@code DataFilePaths.convert}
   * @return the set of trimmed lines; empty if the file could not be opened
   */
  protected Set<String> buildSplitMap(String path) {
    path = DataFilePaths.convert(path);
    Set<String> fileSet = Generics.newHashSet();
    // try-with-resources closes the reader on every path, including read failures
    try (LineNumberReader reader = new LineNumberReader(new FileReader(path))) {
      try {
        // readLine() returning null is the end-of-file signal; ready() only says
        // whether a read would block and is not a reliable loop condition
        String line;
        while ((line = reader.readLine()) != null) {
          fileSet.add(line.trim());
        }
      } catch (IOException e) {
        // Handled inside the resource scope so the current line number is still available
        System.err.printf("%s: Error reading split file %s (line %d)\n",this.getClass().getName(),path,reader.getLineNumber());
      }
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open split file %s\n", this.getClass().getName(),path);
    } catch (IOException e) {
      // Only reachable when closing the reader fails
      System.err.printf("%s: Error closing split file %s\n", this.getClass().getName(),path);
    }
    return fileSet;
  }

  /**
   * Filenames of the stuff that was created.
   *
   * @return a read-only view of {@link #outputFileList}
   */
  public List<String> getFilenames() {
    return Collections.unmodifiableList(outputFileList);
  }

  /**
   * @return the description text accumulated so far (the dataset name line once it has been configured)
   */
  @Override
  public String toString() {
    return toStringBuffer.toString();
  }

  /**
   * Accepts a file if its name (without directory) is present in filterSet. Rejects the file otherwise.
   */
  protected static class SplitFilter implements FileFilter {
    private final Set<String> filterSet;

    /**
     * @param sm the file names to accept; stored by reference, not copied
     */
    public SplitFilter(Set<String> sm) {
      filterSet = sm;
    }

    @Override
    public boolean accept(File f) {
      return filterSet.contains(f.getName());
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy