All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.knowitall.tool.tokenize.PTBTokenizer.scala Maven / Gradle / Ivy

package edu.knowitall
package tool
package tokenize

import scala.collection.JavaConversions._
import breeze.text.tokenize.{ PTBTokenizer => BreezePTBTokenizer }

class PTBTokenizer extends Tokenizer {
  val tokenizer = BreezePTBTokenizer

  def tokenize(sentence: String): Seq[Token] = {
    val strings = tokenizer(sentence) map {
      case "-LRB-" => "("
      case "-RRB-" => ")"
      case "-LSB-" => "["
      case "-RSB-" => "]"
      case "-LCB-" => "{"
      case "-RCB-" => "}"
      case "``" => "\""
      case "''" => "\""
      case s => s
    }
    Tokenizer.computeOffsets(strings, sentence)
  }
}

object PTBTokenizerMain extends TokenizerMain {
  val tokenizer = new PTBTokenizer()
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy