All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.international.arabic.pipeline.ATBCorrector Maven / Gradle / Ivy

package edu.stanford.nlp.international.arabic.pipeline;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.Pair;

/**
 * Makes ATB trees consistent with ArabicTreebankLanguagePack. Specifically, it removes
 * sentence-initial punctuation, and constraints sentence-final punctuation to be one of
 * [.!?].
 * 

* Also cleans up some of the headlines, and other weirdly tokenized sentences. * * @author Spence Green * */ public class ATBCorrector implements TreeTransformer { private static final boolean DEBUG = false; private final List> ops; public ATBCorrector() { ops = loadOps(); } private List> loadOps() { List> ops = new ArrayList>(); String line = null; try { BufferedReader br = new BufferedReader(new StringReader(editStr)); List tsp = new ArrayList(); while ((line = br.readLine()) != null) { if (DEBUG) System.err.print("Pattern is " + line); TregexPattern matchPattern = TregexPattern.compile(line); if (DEBUG) System.err.println(" [" + matchPattern + "]"); tsp.clear(); while (continuing(line = br.readLine())) { TsurgeonPattern p = Tsurgeon.parseOperation(line); if (DEBUG) System.err.println("Operation is " + line + " [" + p + "]"); tsp.add(p); } if ( ! tsp.isEmpty()) { TsurgeonPattern tp = Tsurgeon.collectOperations(tsp); ops.add(new Pair(matchPattern, tp)); } } // while not at end of file } catch (IOException ioe) { ioe.printStackTrace(); } return ops; } private static boolean continuing(String str) { return str != null && ! str.matches("\\s*"); } public Tree transformTree(Tree t) { return Tsurgeon.processPatternsOnTree(ops, t); } /** * The Tsurgeon patterns */ private static final String editStr = //Delete sentence-initial punctuation ("@PUNC=punc <: __ >>, (/^S/ > @ROOT) \n" + "prune punc\n" + "\n") + //Delete sentence-initial punctuation (again) ("@PUNC=punc <: __ >>, (/^S/ > @ROOT) \n" + "prune punc\n" + "\n") + //Delete sentence final punctuation that is preceded by punctuation (first time) ("@PUNC=punc >>- (/^S/ > @ROOT) <: __ $, @PUNC \n" + "prune punc\n" + "\n") + //Delete sentence final punctuation that is preceded by punctuation (second time) ("@PUNC=punc >>- (/^S/ > @ROOT) <: __ $, @PUNC \n" + "prune punc\n" + "\n") + //Convert remaining sentence-final punctuation to . if it is not [.!?] ("@PUNC=pos >>- (/^S/ > @ROOT) <: /[^\\.\\?!]/=term !$, @PUNC \n" + "relabel pos PUNC\n" + "relabel term /./\n" + "\n") + //Delete medial, sentence-final punctuation // ("@PUNC=punc <: /[!\\.\\?]+/ $. __\n" // + "prune punc\n" // + "\n") + //Now move the sentence-final mark under the top-level node ("@PUNC=punc <: /^[\\.!\\?]+$/ >>- (/^S/ > @ROOT <- __=sfpos) !> (/^S/ > @ROOT)\n" + "move punc $- sfpos\n" + "\n"); //For those trees that lack a sentence-final punc, add one. // ("/^[^\\.!\\?]$/ >>- (__ > @ROOT <- __=loc) <: __\n" // + "insert (PUNC .) $- loc\n" // + "\n"); /** * @param args */ public static void main(String[] args) { if(args.length != 1) { System.err.println("Usage: java " + ATBCorrector.class.getName() + " filename\n"); System.exit(-1); } TreeTransformer tt = new ATBCorrector(); File f = new File(args[0]); try { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(); TreeReader tr = trf.newTreeReader(br); int nTrees = 0; for(Tree t; (t = tr.readTree()) != null;nTrees++) { Tree fixedT = tt.transformTree(t); System.out.println(fixedT.toString()); } tr.close(); System.err.printf("Wrote %d trees%n",nTrees); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy