edu.stanford.nlp.international.arabic.IBMArabicEscaper Maven / Gradle / Ivy

Go to download
package edu.stanford.nlp.international.arabic;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;

import edu.stanford.nlp.international.arabic.pipeline.DefaultLexicalMapper;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import java.util.function.Function;


/**
 * This escaper is intended for use on flat input to be parsed by LexicalizedParser.
 * It performs these functions functions:
 * 
 *  Deletes the clitic markers inserted by the IBM segmenter ('#' and '+')
 *  
Deletes IBM classing for numbers
 *  
Replaces tokens that must be escaped with the appropriate LDC escape sequences
 *  
Applies the same orthographic normalization performed by {@link edu.stanford.nlp.trees.international.arabic.ArabicTreeNormalizer}
 *  
intern()'s strings
 * 
 * 
 * This class supports both Buckwalter and UTF-8 encoding.
 * 

 * IMPORTANT: This class must implement Function, List>
 * in order to run with the parser.
 *
 * @author Christopher Manning
 * @author Spence Green
 */
public class IBMArabicEscaper implements Function, List> {

  private static final Pattern pEnt = Pattern.compile("\\$[a-z]+_\\((.*?)\\)");
  private boolean warnedEntityEscaping = false;
  private boolean warnedProcliticEnclitic = false;
  private final DefaultLexicalMapper lexMapper;
  private final boolean annotationsAndClassingOnly;

  public IBMArabicEscaper() {
    this(false);
  }

  public IBMArabicEscaper(boolean annoteAndClassOnly) {
    annotationsAndClassingOnly = annoteAndClassOnly;
    lexMapper = new DefaultLexicalMapper();
  }

  /**
   * Disable warnings generated when tokens are escaped.
   */
  public void disableWarnings() {
    warnedEntityEscaping = true;
    warnedProcliticEnclitic = true;
  }


  /**
   * Escapes a word. This method will *not* map a word to the null string.
   *
   * @return The escaped string
   */
  private String escapeString(String word) {

    String firstStage = stripAnnotationsAndClassing(word);

    String secondStage = ATBTreeUtils.escape(firstStage);
    if(secondStage.length() == 0)
      return firstStage;
    else if(!firstStage.equals(secondStage))
      return secondStage;

    String thirdStage = lexMapper.map(null, secondStage);
    if(thirdStage.length() == 0)
      return secondStage;
    return thirdStage;

    //    Matcher mAM = pAM.matcher(w);
    //    if (mAM.find()) {
    //      if ( ! warnedNormalization) {
    //        System.err.println("IBMArabicEscaper Note: equivalence classing certain characters, such as Alef with madda/hamza, e.g., in: " + w);
    //        warnedNormalization = true;
    //      }
    //      // 'alif maqSuura mapped to yaa
    //      w = mAM.replaceAll("\u064A");
    //    }
    //    Matcher mYH = pYaaHamza.matcher(w);
    //    if (mYH.find()) {
    //      if ( ! warnedNormalization) {
    //        System.err.println("IBMArabicEscaper Note: equivalence classing certain characters, such as Alef with madda/hamza, e.g., in: " + w);
    //        warnedNormalization = true;
    //      }
    //      // replace yaa followed by hamza with hamza on kursi (yaa)
    //      w = mYH.replaceAll("\u0626");
    //    }
    //    w = StringUtils.tr(w, "\u060C\u061B\u061F\u066A\u066B\u066C\u066D\u06D4\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669\u0966\u0967\u0968\u0969\u096A\u096B\u096C\u096D\u096E\u096F\u2013\u2014\u0091\u0092\u2018\u2019\u0093\u0094\u201C\u201D",
    //    ",;%.,*.01234567890123456789--''''\"\"\"\"");
  }

  /**
   * Removes IBM clitic annotations and classing from a word.
   * 
   * Note: We do not want to nullify a word, so we only perform these operations
   * on words of length 1 or more.
   *
   * @param word The unescaped word
   * @return The escaped word
   */
  private String stripAnnotationsAndClassing(String word) {
    String w = word;
    final int wLen = w.length();

    if (wLen > 1) {  // only for two or more letter words
      Matcher m2 = pEnt.matcher(w);
      if (m2.matches()) {
        if ( ! warnedEntityEscaping) {
          System.err.printf("%s: Removing IBM MT-style classing: %s --> %s\n", this.getClass().getName(), m2.group(0), m2.group(1));
          warnedEntityEscaping = true;
        }
        w = m2.replaceAll("$1");

      } else if (w.charAt(0) == '+') {
        if ( ! warnedProcliticEnclitic) {
          warnedProcliticEnclitic = true;
          System.err.printf("%s: Removing IBM MT-style proclitic/enclitic indicators\n",this.getClass().getName());
        }
        w = w.substring(1);

      } else if (w.charAt(wLen - 1) == '#') {
        if ( ! warnedProcliticEnclitic) {
          warnedProcliticEnclitic = true;
          System.err.printf("%s: Removing IBM MT-style proclitic/enclitic indicators\n",this.getClass().getName());
        }
        w = w.substring(0, wLen - 1);
      }
    }

    //Don't map a word to null
    if(w.length() == 0)
      return word;
    return w;
  }


  /** Converts an input list of {@link HasWord} in IBM Arabic to
   *  LDC ATBv3 representation. The method safely copies the input object
   *  prior to escaping.
   *
   *  @param sentence A collection of type {@link edu.stanford.nlp.ling.Word}
   *  @return A copy of the input with each word escaped.
   *  @throws RuntimeException If a word is mapped to null
   */
  public List apply(List sentence) {
    List newSentence = new ArrayList<>(sentence);

    for (HasWord wd : newSentence)
      wd.setWord(apply(wd.word()));

    return newSentence;
  }

  /**
   * Applies escaping to a single word. Interns the escaped string.
   *
   * @param w The word
   * @return The escaped word
   * @throws RuntimeException If a word is nullified (which is really bad for the parser and
   * for MT)
   */
  public String apply(String w) {

    String escapedWord = (annotationsAndClassingOnly) ?
        stripAnnotationsAndClassing(w) : escapeString(w);

    if(escapedWord.equals(""))
      throw new RuntimeException(String.format("Word (%s) mapped to null",w));

    return escapedWord.intern();
  }

  /** This main method preprocesses one-sentence-per-line input, making the
   *  same changes as the Function.  By default it writes the output to files
   *  with the same name as the files passed in on the command line but with 
   *  .sent appended to their names.  If you give the flag 
   *  -f then output is instead sent to stdout.  Input and output
   *  is always in UTF-8.
   *
   *  @param args A list of filenames.  The files must be UTF-8 encoded.
   *  @throws IOException If there are any issues
   */
  public static void main(String[] args) throws IOException {
    IBMArabicEscaper escaper = new IBMArabicEscaper();
    boolean printToStdout = false;
    for (String arg : args) {
      if ("-f".equals(arg)) {
        printToStdout = true;
        continue;
      }
      BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(arg), "UTF-8"));
      PrintWriter pw;
      if (printToStdout) {
        pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(System.out, "UTF-8")));
      } else {
        String outFile = arg + ".sent";
        pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8")));
      }
      for (String line ; (line = br.readLine()) != null; ) {
        String[] words = line.split("\\s+");
        for (int i = 0; i < words.length; i++) {
          String w = escaper.escapeString(words[i]);
          pw.print(w);
          if (i != words.length - 1) {
            pw.print(" ");
          }
        }
        pw.println();
      }
      br.close();
      pw.close();
    }
  }

}