All downloads are free. The search and download functionality uses the official Maven repository.

edu.stanford.nlp.parser.tools.PunctEquivalenceClasser Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.parser.tools;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.util.Generics;

/**
 * Performs equivalence classing of punctuation per PTB guidelines. Many of the multilingual
 * treebanks mark all punctuation with a single POS tag, which is bad for parsing.
 *
 * <p>PTB punctuation POS tag set (12 tags):
 *
 * <pre>
 * 37. #   Pound sign
 * 38. $   Dollar sign
 * 39. .   Sentence-final punctuation
 * 40. ,   Comma
 * 41. :   Colon, semi-colon
 * 42. (   Left bracket character
 * 43. )   Right bracket character
 * 44. "   Straight double quote
 * 45. `   Left open single quote
 * 46. ``  Left open double quote
 * 47. '   Right close single quote
 * 48. ''  Right close double quote
 * </pre>
 *
 * <p>See http://www.ldc.upenn.edu/Catalog/docs/LDC95T7/cl93.html
 *
 * <p>All lookup tables are unmodifiable and built once at class-initialization time.
 *
 * @author Spence Green
 */
public class PunctEquivalenceClasser {

  /** Sentence-final punctuation. */
  private static final Set<String> sfClass = setOf(".", "?", "!");

  /** Colon-like punctuation; runs of two or more dots are handled by {@link #pEllipsis}. */
  private static final Set<String> colonClass = setOf(":", ";", "-", "_");

  // NOTE(review): the second entry is the Arabic letter reh (U+0631) exactly as it appears in
  // this file. If the Arabic comma (U+060C) was intended, confirm against the treebank data
  // before changing it.
  /** Comma-like punctuation. */
  private static final Set<String> commaClass = setOf(",", "ر");

  /** Currency-like symbols. */
  private static final Set<String> currencyClass = setOf("$", "#", "=");

  /** Matches a token consisting solely of two or more periods (an ellipsis). */
  private static final Pattern pEllipsis = Pattern.compile("\\.\\.+");

  /** Forward slash and backslash. */
  private static final Set<String> slashClass = setOf("/", "\\");

  /** Opening brackets, including the escaped form {@code -LRB-}. */
  private static final Set<String> lBracketClass = setOf("-LRB-", "(", "[", "<");

  /** Closing brackets, including the escaped form {@code -RRB-}. */
  private static final Set<String> rBracketClass = setOf("-RRB-", ")", "]", ">");

  /** Straight, opening and closing quote tokens. */
  private static final Set<String> quoteClass = setOf("\"", "``", "''", "'", "`");

  /** Builds an unmodifiable set containing the given tokens. */
  private static Set<String> setOf(String... tokens) {
    return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(tokens)));
  }

  /**
   * Returns the equivalence class of the argument. If the argument is not contained in
   * an equivalence class, then an empty string is returned.
   *
   * <p>Checks are applied in a fixed order: percent / {@code -PLUS-}, bullet (any token starting
   * with {@code *}), sentence-final, colon (including ellipses), comma, currency, slash,
   * left bracket, right bracket, quote.
   *
   * @param punc the punctuation token to classify; {@code null} is treated as unclassified
   * @return The class name if found. Otherwise, an empty string.
   */
  public static String getPunctClass(String punc) {
    if (punc == null) {
      // A missing token belongs to no class; report it the same way as any unknown token.
      return "";
    }

    // -PLUS- is an escape for "+" in the ATB
    if (punc.equals("%") || punc.equals("-PLUS-")) {
      return "perc";
    }
    if (punc.startsWith("*")) {
      return "bullet";
    }
    if (sfClass.contains(punc)) {
      return "sf";
    }
    if (colonClass.contains(punc) || pEllipsis.matcher(punc).matches()) {
      return "colon";
    }
    if (commaClass.contains(punc)) {
      return "comma";
    }
    if (currencyClass.contains(punc)) {
      return "curr";
    }
    if (slashClass.contains(punc)) {
      return "slash";
    }
    if (lBracketClass.contains(punc)) {
      return "lrb";
    }
    if (rBracketClass.contains(punc)) {
      return "rrb";
    }
    if (quoteClass.contains(punc)) {
      return "quote";
    }
    return "";
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy