// edu.jhu.hlt.tift.Tokenizer (Maven / Gradle / Ivy)

/*
 * Copyright 2012-2016 Johns Hopkins University HLTCOE. All rights reserved.
 * This software is released under the 2-clause BSD license.
 * See LICENSE in the project root directory.
 */
package edu.jhu.hlt.tift;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;

import com.google.common.collect.ImmutableList;

import edu.jhu.hlt.concrete.AnnotationMetadata;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.Sentence;
import edu.jhu.hlt.concrete.TheoryDependencies;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.UUID;
import edu.jhu.hlt.concrete.section.SingleSectionSegmenter;
import edu.jhu.hlt.concrete.util.ConcreteException;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
import edu.jhu.hlt.tift.concrete.ConcreteTokenization;

/**
 * Enumeration of supported tokenizations.
 */
public enum Tokenizer {

  /**
   * Tokens are produced by applying {@code Rewriter.PTB} to the text and
   * splitting the rewritten text on runs of whitespace.
   */
  PTB {
    @Override
    public Tokenization tokenizeToConcrete(String text, int textStartPosition) {
      return generateConcreteTokenization(text, textStartPosition);
    }

    @Override
    public List<String> tokenize(String text) {
      final String rewritten = Rewriter.PTB.rewrite(text);
      return ImmutableList.copyOf(rewritten.split("\\s+"));
    }
  },
  /**
   * Tokens are the maximal runs of non-whitespace characters in the text.
   */
  WHITESPACE {
    @Override
    public Tokenization tokenizeToConcrete(String text, int textStartPosition) {
      return generateConcreteTokenization(text, textStartPosition);
    }

    /**
     * Splits {@code text} on runs of whitespace.
     *
     * @param text a {@link String} to tokenize
     * @return the non-empty tokens; an empty list when {@code text} is empty
     *         or consists only of whitespace
     */
    @Override
    public List<String> tokenize(String text) {
      final ImmutableList<String> pieces = ImmutableList.copyOf(text.split("\\s+"));
      // String.split keeps one leading empty string when the input is empty or
      // starts with whitespace; that is not a token, so drop it.
      final boolean leadingEmpty = !pieces.isEmpty() && pieces.get(0).isEmpty();
      return leadingEmpty ? pieces.subList(1, pieces.size()) : pieces;
    }
  },
  /**
   * Tokens are produced by {@link #tokenizeTweetPetrovic(String)}.
   */
  TWITTER_PETROVIC {
    @Override
    public Tokenization tokenizeToConcrete(String text, int textStartPosition) {
      return generateConcreteTokenization(text, textStartPosition);
    }

    @Override
    public List<String> tokenize(String text) {
      return tokenizeTweetPetrovic(text);
    }
  },
  /**
   * Tokens (and token tags) are produced by {@code TwitterTokenizer}.
   */
  TWITTER {
    /**
     * Tokenizes with {@code TwitterTokenizer} and stamps the resulting
     * {@link Tokenization} metadata with this tool's name and version. When a
     * token tagging is present, the first one is labelled with the same tool
     * name followed by {@code " Tweet Tags"}.
     *
     * NOTE(review): {@code textStartPosition} is not used by this
     * implementation, unlike the other constants - confirm whether the
     * offsets should be shifted by it.
     */
    @Override
    public Tokenization tokenizeToConcrete(String text, int textStartPosition) {
      final TaggedTokenizationOutput tto = TwitterTokenizer.tokenize(text);
      final Tokenization tkz = ConcreteTokenization.generateConcreteTokenization(tto);
      final String tool = "Tift TwitterTokenizer " + ProjectConstants.VERSION;
      tkz.getMetadata().setTool(tool);
      // A set-but-empty tagging list has no first element to label.
      if (tkz.isSetTokenTaggingList() && tkz.getTokenTaggingListIterator().hasNext()) {
        tkz.getTokenTaggingListIterator().next().getMetadata().setTool(tool + " Tweet Tags");
      }
      return tkz;
    }

    @Override
    public List<String> tokenize(String text) {
      return ImmutableList.copyOf(TwitterTokenizer.tokenize(text).getTokens());
    }
  },
  /**
   * Tokens are produced by applying {@code Rewriter.BASIC} to the text and
   * splitting the rewritten text on runs of whitespace.
   */
  BASIC {
    @Override
    public Tokenization tokenizeToConcrete(String text, int textStartPosition) {
      return generateConcreteTokenization(text, textStartPosition);
    }

    @Override
    public List<String> tokenize(String text) {
      final String rewritten = Rewriter.BASIC.rewrite(text);
      return ImmutableList.copyOf(rewritten.split("\\s+"));
    }
  };

  //////////////////////////////////////////////////
  // Contract methods.
  //////////////////////////////////////////////////
  /**
   * Tokenize a {@link String}, given a character offset.
   *
   * @param text a {@link String} to tokenize
   * @param textStartPosition used to denote offsets with respect to the entire document.
   * For example, if you wish to tokenize the second sentence from the following text:
   * <pre>He left. He returned later.</pre>
   * call this method with parameters {@code "He returned later."} and {@code 9}.
   * @return a {@link Tokenization} corresponding to this {@link Tokenizer} instance
   *
   * @see #tokenizeToConcrete(String)
   */
  public abstract Tokenization tokenizeToConcrete(String text, int textStartPosition);

  /**
   * Split a {@link String} into tokens according to this {@link Tokenizer}.
   *
   * @param text a {@link String} to tokenize
   * @return the tokens, in the order this tokenizer produced them
   */
  public abstract List<String> tokenize(String text);

  /**
   * Tokenize a string, treating it as if it began at character offset 0.
   * <p>
   * To keep character offsets relative to a larger document, use
   * {@link #tokenizeToConcrete(String, int)} instead.
   *
   * @param text a {@link String} to tokenize
   * @return a {@link Tokenization} corresponding to this {@link Tokenizer} instance
   *
   * @see #tokenizeToConcrete(String, int)
   */
  public final Tokenization tokenizeToConcrete(String text) {
    final int startOfText = 0;
    return tokenizeToConcrete(text, startOfText);
  }

  /**
   * Adds a single {@link Section}, holding a single {@link Sentence} with a
   * {@link Tokenization} of the whole text, to the given {@link Communication}.
   * The communication is modified in place and is assumed to have its text set.
   * <p>
   * The section is created with kind {@code "content"}; the sentence reuses the
   * section's text span. The tokenization's metadata records a dependency on
   * the UUID assigned to the sentence.
   * <p>
   * If the communication's section list is already set, this method does nothing.
   *
   * @param comm a {@link Communication} with no {@link Section}s
   * @throws ConcreteException declared by the contract of this method
   */
  public final void addSectionSentenceTokenizationInPlace(Communication comm) throws ConcreteException {
    if (comm.isSetSectionList()) {
      return;
    }

    // UUIDs are drawn in a fixed order: section, sentence, tokenization.
    final AnalyticUUIDGenerator uuids = new AnalyticUUIDGeneratorFactory(comm).create();

    final Section section = SingleSectionSegmenter.createSingleSection(comm, "content");
    section.setUuid(uuids.next());

    final UUID sentenceUuid = new UUID(uuids.next());
    final Sentence sentence = new Sentence();
    sentence.setTextSpan(section.getTextSpan());
    sentence.setUuid(sentenceUuid);
    section.addToSentenceList(sentence);

    final Tokenization tokenization = this.tokenizeToConcrete(comm.getText(), 0);
    tokenization.setUuid(uuids.next());

    final TheoryDependencies dependencies = new TheoryDependencies();
    dependencies.addToSentenceTheoryList(sentenceUuid);
    final AnnotationMetadata metadata = tokenization.getMetadata();
    metadata.setDependencies(dependencies);
    sentence.setTokenization(tokenization);

    comm.addToSectionList(section);
  }

  //
  // Static methods.
  //
  /**
   * Locate each token in {@code text}, scanning left to right.
   * <p>
   * Every token is searched for starting just past the end of the previously
   * located token. A token that cannot be found from that point on keeps the
   * offset {@code 0} and does not advance the search position. Once the search
   * position reaches the end of the text, all remaining offsets stay {@code 0}.
   *
   * @param text
   *          - text to be used
   * @param tokens
   *          - tokens to locate, in the order they are expected to occur
   * @return an integer array of offsets, one per token
   */
  static int[] getOffsets(String text, String[] tokens) {
    final int[] offsets = new int[tokens.length];
    int cursor = 0;
    for (int k = 0; k < tokens.length && cursor < text.length(); k++) {
      final String tok = tokens[k];
      final int hit = text.indexOf(tok, cursor);
      if (hit >= 0) {
        offsets[k] = hit;
        cursor = hit + tok.length();
      }
    }
    return offsets;
  }

  /**
   * Sasa Petrovic's tokenization scheme.
   * <p>
   * The text is consumed in one left-to-right pass by a small state machine:
   * <ul>
   * <li>state 0 (start): characters are skipped until one begins a token;</li>
   * <li>state 1 (normal): a run of lower-case letters, upper-case letters and
   * decimal digits;</li>
   * <li>state 2 ({@code @reply}): {@code '@'} followed by the state-1
   * characters or underscores;</li>
   * <li>state 3 ({@code #topic}): {@code '#'} followed by the state-1
   * characters;</li>
   * <li>state 4 (link): entered on an {@code 'h'} whose fourth and fifth
   * following characters are {@code ':'} and {@code '/'} with at least one
   * more character after them; runs until a space separator or {@code '['}.</li>
   * </ul>
   * The character that terminates a token is consumed along with it, so it
   * never begins the next token.
   *
   * @param text
   *          - text to tokenize
   * @return a list of Strings that represent tokens.
   */
  static List<String> tokenizeTweetPetrovic(String text) {
    final int length = text.length();
    final ImmutableList.Builder<String> content = new ImmutableList.Builder<>();
    // Reused buffer for the token being built; avoids quadratic String concatenation.
    final StringBuilder token = new StringBuilder();
    int state = 0;
    boolean update = false;

    // My (vandurme) one change was to add UPPERCASE_LETTER as another
    // option alongside LOWER_CASE_LETTER
    for (int i = 0; i < length; i++) {
      final char c = text.charAt(i);
      final int cType = Character.getType(c);

      switch (state) {
      case 0: // Start state
        token.setLength(0);
        if (cType == Character.SPACE_SEPARATOR) {
          // nothing to start on a space
        } else if ((c == 'h') && (i + 6 < length) && (text.charAt(i + 4) == ':') && (text.charAt(i + 5) == '/')) {
          // link
          // Characters matched out of order to fail
          // early when not a link.
          token.append(c);
          state = 4;
        } else if (isPetrovicTokenCharType(cType)) {
          // normal
          token.append(c);
          state = 1;
        } else if (c == '@') {
          // @reply
          token.append(c);
          state = 2;
        } else if (c == '#') {
          // #topic
          token.append(c);
          state = 3;
        }
        break;
      case 1: // Normal
        if (isPetrovicTokenCharType(cType)) {
          token.append(c);
        } else {
          update = true;
          state = 0;
        }
        break;
      case 2: // @reply
        // Author names may have underscores,
        // which we don't want to split on here
        if (isPetrovicTokenCharType(cType) || (c == '_')) {
          token.append(c);
        } else {
          update = true;
          state = 0;
        }
        break;
      case 3: // #topic
        // This could just be state 1, with special care
        // taken in state 0 when the topic is first
        // recognized, but I'm staying aligned to Sasa's
        // code
        if (isPetrovicTokenCharType(cType)) {
          token.append(c);
        } else {
          update = true;
          state = 0;
        }
        break;
      case 4: // link
        if ((cType == Character.SPACE_SEPARATOR) || (c == '[')) {
          update = true;
          state = 0;
        } else {
          token.append(c);
        }
        break;
      default:
        // nothing
        break;
      }

      // Emit when a token was just terminated, or when the text ends while
      // a token is still being built.
      if (update || ((i == (length - 1)) && (token.length() > 0))) {
        content.add(token.toString());
        update = false;
      }
    }

    return content.build();
  }

  /**
   * Tells whether a {@link Character#getType(char)} value belongs to the
   * characters allowed inside a normal token: lower-case letter, upper-case
   * letter or decimal digit.
   */
  private static boolean isPetrovicTokenCharType(int cType) {
    return (cType == Character.LOWERCASE_LETTER)
        || (cType == Character.UPPERCASE_LETTER)
        || (cType == Character.DECIMAL_DIGIT_NUMBER);
  }

  /**
   * Wrapper around getOffsets that takes a {@link List} of Strings instead of an array.
   * <p>
   * The list must be typed as {@code List<String>}: with a raw {@link List},
   * {@code toArray(new String[0])} is typed as {@code Object[]}, which matches
   * neither {@code getOffsets} overload.
   *
   * @see #getOffsets(String, String[])
   *
   * @param text
   *          text that was tokenized
   * @param tokenList
   *          a {@link List} of tokenized text
   * @return an array of integers that represent offsets
   */
  static int[] getOffsets(String text, List<String> tokenList) {
    return getOffsets(text, tokenList.toArray(new String[0]));
  }

  /**
   * Build a {@link Tokenization} from the tokens of this {@link Tokenizer}.
   * <p>
   * The text is tokenized with {@link #tokenize(String)}, each token is
   * located in {@code text} with {@code getOffsets}, and the tokens, their
   * offsets and {@code startPosition} are handed to
   * {@link ConcreteTokenization}.
   *
   * @param text the text to tokenize
   * @param startPosition passed through to {@link ConcreteTokenization}
   *          together with the offsets, which are relative to {@code text}
   * @return the {@link Tokenization} created by {@link ConcreteTokenization}
   */
  Tokenization generateConcreteTokenization(String text, int startPosition) {
    final List<String> tokenList = this.tokenize(text);
    final int[] offsets = getOffsets(text, tokenList);
    return ConcreteTokenization.generateConcreteTokenization(tokenList, offsets, startPosition);
  }

  /**
   * Command-line entry point: tokenizes a UTF-8 text file line by line and
   * prints each line's tokens to standard output, separated by single spaces.
   *
   * @param args two arguments: the name of a {@link Tokenizer} constant
   *          (matched case-insensitively) and the path of the file to read
   * @throws Exception if the tokenizer name is unknown or the file cannot be read
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("expects 2 arguments: tokenizer-type filename");
      System.exit(1);
    }

    // Locale.ROOT keeps the upper-casing independent of the default locale
    // (e.g. a Turkish locale would otherwise turn "i" into a dotted capital I).
    final Tokenizer t = Tokenizer.valueOf(args[0].toUpperCase(Locale.ROOT));
    try (BufferedReader b = new BufferedReader(
        new InputStreamReader(new FileInputStream(args[1]), StandardCharsets.UTF_8))) {
      String line;
      while ((line = b.readLine()) != null) {
        final List<String> toks = t.tokenize(line);
        System.out.println(String.join(" ", toks));
      }
    }
  }

}