org.allenai.nlpstack.tokenize.PennTokenizer.scala
package org.allenai.nlpstack.tokenize

import org.allenai.nlpstack.core.Tokenizer

import java.util.regex._
/* The PennTokenizer was used to tokenize the Penn Treebank.
 * The following is a translation from a sed file. This algorithm
 * is entirely deterministic. It is composed of regular expression
 * replacements.
 *
 * @author Michael Schmitz
 */
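// Each sed substitution becomes a (pattern, replacement) pair below; an
// illustrative correspondence (sketched, not quoted from the original
// sed script) is:
//
//   s/\.\.\./ ... /g   ~>   ("""\.\.\.""", " ... ")
//
// The patterns are compiled once, when the object is initialized, and
// applied to each input in list order.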
object PennTokenizer extends Tokenizer {
  val replacements = List(
    // attempt to get correct directional quotes
    ("^\"", "`` "),
    //("""([ (\[{<])""", "$1 `` "),
    ("""\.\.\.""", " ... "),
    // pad most punctuation with spaces
    ("[,;:@#$%&]", " $0 "),
    // split only the sentence-final period (abbreviation periods stay
    // attached), allowing trailing brackets or quotes after it
    ("""([^.])([.])([\])}>"']*)[ ]*$""", "$1 $2$3 "), // scalastyle:ignore
    ("[?!]", " $0 "),
    // parentheses, brackets, etc.
    ("""[\](){}<>]""", " $0 "),
    ("--", " $0 "),
    // pad the line with spaces so the quote rules below can rely on
    // surrounding whitespace
    ("$|^", " "),
    // remaining double quotes are treated as closing quotes
    ("\"", " '' "),
    // possessive or close-single-quote
    ("""([^'])' """, "$1 ' "),
    // contractions such as it's, I'm, we'd
    ("""'([sSmMdD]) """, " '$1 "),
    ("'(ll|re|ve|LL|RE|VE) ", " '$1 "),
    ("(n't|N'T) ", " $1 ")
  ).map {
    case (a, b) =>
      (Pattern.compile(a), b)
  }
  def tokenize(sentence: String) = {
    // apply the replacements in list order (left fold): earlier rules,
    // such as the space padding, set up the context later rules rely on
    val split = replacements.foldLeft(sentence) {
      case (s, (t, r)) =>
        t.matcher(s).replaceAll(r)
    }.trim.split("\\s+")

    Tokenizer.computeOffsets(split, sentence)
  }
}
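// A minimal usage sketch (the sample sentence and expected token strings
// are illustrative; `tokenize` returns the tokens together with their
// character offsets in the original sentence):
//
//   val tokens = PennTokenizer.tokenize("I can't go.")
//   // tokens.map(_.string) == Seq("I", "ca", "n't", "go", ".")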