All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.jhu.hlt.tift.Rewriter Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2012-2014 Johns Hopkins University HLTCOE. All rights reserved.
 * This software is released under the 2-clause BSD license.
 * See LICENSE in the project root directory.
 */

package edu.jhu.hlt.tift;

import java.util.Set;
import java.util.regex.Pattern;

import com.google.common.collect.ImmutableSet;

/**
 * Enumeration of available "text rewriting" tools.
 *
 * <p>Each constant owns an ordered table of (regular expression, replacement)
 * rules. {@link #rewrite(String)} applies every rule of the table, in order,
 * to the whole input via {@link java.util.regex.Matcher#replaceAll(String)}
 * and finally trims the result.
 */
public enum Rewriter {
  /** Rewrites text with {@link #PTB_PATTERNS}. */
  PTB {
    @Override
    public String rewrite(String text) {
      return Rewriter.rewrite(text, PTB_PATTERNS);
    }
  },
  /** Rewrites text with {@link #BASIC_PATTERNS}. */
  BASIC {
    @Override
    public String rewrite(String text) {
      return Rewriter.rewrite(text, BASIC_PATTERNS);
    }
  },
  /** Rewrites text with {@link #COMMON_UNICODE_PATTERNS}. */
  COMMON_UNICODE {
    @Override
    public String rewrite(String text) {
      return Rewriter.rewrite(text, COMMON_UNICODE_PATTERNS);
    }
  };

  /**
   * Applies this rewriter's rule table to {@code text}.
   *
   * @param text the text to rewrite; must not be {@code null}
   * @return the rewritten text with leading and trailing whitespace trimmed
   */
  public abstract String rewrite(String text);

  /** Rule table used by {@link #PTB}; see {@link #getPTBPatterns()}. */
  public static final Set<PatternStringTuple> PTB_PATTERNS = getPTBPatterns();

  /** Rule table used by {@link #BASIC}. */
  public static final Set<PatternStringTuple> BASIC_PATTERNS = getBasicPatterns();

  /** Rule table used by {@link #COMMON_UNICODE}. */
  public static final Set<PatternStringTuple> COMMON_UNICODE_PATTERNS = getCommonUnicodePatterns();

  /**
   * Builds the rules that map frequently seen non-ASCII characters to plain
   * ASCII stand-ins.
   *
   * @return the compiled rules, in table order
   */
  private static Set<PatternStringTuple> getCommonUnicodePatterns() {
    // vandurme: I went through the top 100 unicode characters in a large
    // collection of Spanish tweets, looking for the most common things we would
    // want to rewrite. For the most useful that resulted, below are the
    // frequencies, the unicode, a suggested mapping, and a text description.
    // 88740 \u201c " double-quote
    // 78883 \u201d " right-double-quote
    // 55270 \u2665 <3 black-heart
    // 33534 \u2014 - EM dash
    // 29702 \u263a :) smiley face
    // 20527 \u2026 ... horizontal ellipsis
    // 12903 \u0336 - COMBINING LONG STROKE OVERLAY
    // 11983 \u2588 || full block
    // 11684 \u2639 :( white frowning face
    // 11490 \u00a0   no break space
    // 10251 \ud83d poo-symbol pile of poo
    // 8362 \u266b music-symbol beamed eighth notes
    // 8254 \u2591 light-shade-symbol LIGHT SHADE
    // 7201 \u00bb >> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    // 7189 \u2022 * bullet
    // 7035 \u00b0 o degree sign
    // 6990 \u266a music-symbol eighth-note
    // 6801 \u2013 - en dash
    // 6515 \u2019 ' single right quotation
    // 6230 \u0338 / COMBINING LONG SOLIDUS OVERLAY
    // 5387 \u00ab << LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    // 4049 \u2508 ---- BOX DRAWINGS LIGHT QUADRUPLE DASH HORIZONTAL
    // 3933 \u2501 - BOX DRAWINGS HEAVY HORIZONTAL
    // 3736 \u2001   EM QUAD
    // 3665 \u2003   EM SPACE
    // 3594 \u25b8 => BLACK RIGHT-POINTING SMALL TRIANGLE
    // 3544 \u200b   zero width space
    // 3499 \u2500 - BOX DRAWINGS LIGHT HORIZONTAL
    // 3474 \u2611 checkmark-symbol  BALLOT BOX WITH CHECK
    // 3349 \u2503 | BOX DRAWINGS HEAVY VERTICAL
    // 2958 \ud83c game-die-symbol
    // 2884 \u2580 ^ UPPER HALF BLOCK
    // #2833 \u20ac euro-symbol Euro symbol
    // 2744 \u2018 ' single left quotation
    // 2722 \u2661 <3 white heart
    // 2690 \u2605 star-symbol black star
    // 2534 \u2600 sun-symbol BLACK SUN WITH RAYS
    // 2346 \u2550 = BOX DRAWINGS DOUBLE HORIZONTAL
    // 2094 \u0305 - COMBINING OVERLINE
    //
    // NOTE(review): U+D83D and U+D83C below are UTF-16 high surrogates, i.e.
    // only the first half of a supplementary character, not a complete
    // character. How java.util.regex matches a lone surrogate against
    // well-formed surrogate pairs should be confirmed before relying on the
    // " poo-symbol " / " gamedie-symbol " rules.

    // Flat table: even index = regex, following odd index = its replacement.
    String[] p = {
        "\u201c", "\"",
        "\u201d", "\"",
        "\u2665", "<3",
        "\u2014", "-",
        "\u263a", ":)",
        "\u2026", "...",
        "\u0336", "-",
        "\u2588", "||",
        "\u2639", ":(",
        "\u00a0", " ",
        "\ud83d", " poo-symbol ",
        "\u266b", " music-symbol ",
        "\u2591", " light-shade-symbol ",
        "\u00bb", "==>",
        "\u300b", "==>", // based on example in twokenize example tweets
        "\u2022", "*",
        "\u00b0", "o",
        "\u266a", " music-symbol ",
        "\u2013", "-",
        "\u2019", "\'",
        "\u0338", "/",
        "\u00ab", "<==",
        "\u2508", "----",
        "\u2501", "-",
        "\u2001", " ",
        "\u2003", " ",
        "\u25b8", "==>",
        "\u200b", " ",
        "\u2500", "-",
        "\u2611", " checkmark-symbol ",
        "\u2503", "|",
        "\ud83c", " gamedie-symbol ",
        "\u2580", "^",
        "\u2018", "\'",
        "\u2661", "<3",
        "\u2605", " star-symbol ",
        "\u2600", " sun-symbol ",
        "\u2550", "=",
        "\u0305", "-" };

    return convertStringArrayPatternsToTupleSet(p);
  }


  /**
   * A conservative version of the PTB patterns, meant to (hopefully) be
   * portable across formal/informal Western (?) languages.
   *
   * @return the compiled rules, in table order
   */
  static Set<PatternStringTuple> getBasicPatterns() {
    // cut-n-paste, then modified from getPTBPatterns
    String[] v = {
        // double quotes
        "([\"\u201c\u201d\u201e\u201f\u275d\u275e])", " $1 ",

        // Ellipsis
        "\\.\\.\\.", " ... ",

        "([,;:@#$%&\\*])", " $1 ",

        // HTML escaped (stop gap): the punctuation rule above has already
        // padded '&' and ';' with spaces, so the entities are matched in
        // their split-apart form.
        "& ([gl])t ;", "&$1t;",
        "& nbsp ;", "   ",
        "& hearts ;", " ♥ ",

        // vandurme: carefully with final .
        "([^\\.])(\\.)(\\s|$)", "$1 $2$3",


        // however, we may as well split ALL question marks and exclamation
        // points, since they shouldn't have the abbrev.-marker ambiguity
        // problem.
        //"([\\?!])", " $1 ",
        // vandurme> adding unicode characters
        // \u00a1 : ! inverted
        // \u00bf : ? inverted
        "([\\?!\u00a1\u00bf])", " $1 ",

        // parentheses, brackets, etc.
        "([\\]\\[\\(\\){}<>])", " $1 ",

        "--", " -- "
    };

    return convertStringArrayPatternsToTupleSet(v);
  }

  /**
   * Based on inspection of:
   *
   * http://www.cis.upenn.edu/~treebank/tokenizer.sed
   *
   * The header of which identifies the author as:
   * "Robert MacIntyre, University of Pennsylvania, late 1995".
   *
   * @return the compiled rules, in table order
   */
  public static Set<PatternStringTuple> getPTBPatterns() {
    // The following is a port of patterns and comments from tokenizer.sed
    String[] v = {
        // attempt to get correct forward directional quotes, close quotes
        // handled at end
        "^\"", "`` ",
        "([ \\(\\[{<])\"", "$1 `` ",

        // Set the ellipsis off as its own token (previously this rule
        // replaced "..." with itself and therefore did nothing).
        "\\.\\.\\.", " ... ",
        "([,;:@#$%&])", " $1 ",

        // Assume sentence tokenization has been done first, so split FINAL
        // periods only. (vandurme: WARNING this is often not true for us)
        "([^\\.])([\\.])([\\]\\)}>\"']*) *$", "$1 $2$3 ",

        // however, we may as well split ALL question marks and exclamation
        // points, since they shouldn't have the abbrev.-marker ambiguity
        // problem
        "([\\?!])", " $1 ",

        // parentheses, brackets, etc.
        "([\\]\\[\\(\\){}<>])", " $1 ",

        "--", " -- ",

        // NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great,
        // since you might someday want to know how the words originally fit
        // together -- but it's too late to make a better system now, given
        // the millions of words we've already done "wrong".

        // First off, add a space to the beginning and end of each line, to reduce
        // necessary number of regexps.
        "$", " ",
        "^", " ",

        // (vandurme: this is the closing quotation MacIntyre refers to earlier)
        "\"", " '' ",

        // possessive or close-single-quote
        "([^'])' ", "$1 ' ",

        // as in it's, I'm, we'd
        // NOTE(review): unlike the contraction rules below, this one does not
        // require a trailing space, so it also fires inside a word that
        // follows an apostrophe (e.g. 'maybe). Confirm this is intended.
        "'([sSmMdD])", " '$1 ",

        "'ll ", " 'll ",
        "'re ", " 're ",
        "'ve ", " 've ",
        "n't ", " n't ",
        "'LL ", " 'LL ",
        "'RE ", " 'RE ",
        "'VE ", " 'VE ",
        "N'T ", " N'T ",

        // Each pattern below consumes the space after the word, so each
        // replacement must put it back; otherwise the following word is
        // glued on ("I cannot go" -> "I can notgo").
        " ([Cc])annot ", " $1an not ",
        " ([Dd])'ye ", " $1' ye ",
        " ([Gg])imme ", " $1im me ",
        " ([Gg])onna ", " $1on na ",
        " ([Gg])otta ", " $1ot ta ",
        " ([Ll])emme ", " $1em me ",
        " ([Mm])ore'n ", " $1ore 'n ",
        " ('[Tt])is ", " $1 is ",
        " ('[Tt])was ", " $1 was ",
        " ([Ww])anna ", " $1an na ",
        //" ([Ww])haddya ", " $1ha dd ya ",
        //" ([Ww]hatcha ", " $1ha t cha ",

        // clean out extra spaces
        " +", " ",
        "^ +", ""
    };

    return convertStringArrayPatternsToTupleSet(v);
  }

  /**
   * Compiles a flat {@code {regex, replacement, regex, replacement, ...}}
   * array into rule tuples. Every regex is compiled with
   * {@link Pattern#MULTILINE}.
   *
   * <p>The rules are order-sensitive; this relies on the set built by
   * {@link ImmutableSet.Builder} iterating in insertion order.
   * NOTE(review): a set also collapses elements that are equal according to
   * {@code PatternStringTuple.equals}, which is not visible here.
   *
   * @param patternArray alternating regular expressions and replacements
   * @return the compiled rules
   * @throws IllegalArgumentException if {@code patternArray} has an odd
   *         number of elements, i.e. a regex without a replacement
   */
  private static Set<PatternStringTuple> convertStringArrayPatternsToTupleSet(String[] patternArray) {
    if (patternArray.length % 2 != 0) {
      // Fail fast instead of silently dropping the dangling last element.
      throw new IllegalArgumentException(
          "Expected (regex, replacement) pairs but got " + patternArray.length + " elements");
    }

    ImmutableSet.Builder<PatternStringTuple> patterns = new ImmutableSet.Builder<>();
    for (int i = 0; i < patternArray.length; i += 2) {
      Pattern regex = Pattern.compile(patternArray[i], Pattern.MULTILINE);
      patterns.add(new PatternStringTuple(regex, patternArray[i + 1]));
    }

    return patterns.build();
  }

  /**
   * Applies every rule of {@code patterns} to {@code text}, feeding the
   * output of one rule into the next, then trims the result.
   *
   * @param text the text to rewrite; must not be {@code null}
   * @param patterns the rules to apply, in iteration order
   * @return the rewritten, trimmed text
   */
  private static String rewrite(String text, Set<PatternStringTuple> patterns) {
    String rewritten = text;
    for (PatternStringTuple rule : patterns) {
      rewritten = rule.getPattern().matcher(rewritten).replaceAll(rule.getEntry());
    }

    return rewritten.trim();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy