All downloads are free. The search and download functionality uses the official Maven repository.

edu.cmu.sv.spoken_language_understanding.Tokenizer Maven / Gradle / Ivy

Go to download

A library that allows rapid prototyping of dialog systems (language understanding, discourse modelling, dialog management, language generation).

There is a newer version: 0.7.0
Show newest version
package edu.cmu.sv.spoken_language_understanding;

import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Rule-based tokenizer that turns a raw utterance into a list of lower-case tokens.
 *
 * <p>Created by David Cohen on 12/29/14.
 */
public class Tokenizer {
    /** Markup-style tags such as {@code <s>} or {@code </s>}; reluctant so each tag is matched on its own. */
    private static final Pattern TAG = Pattern.compile("<.*?>");

    /**
     * The apostrophe of a plural possessive ({@code dogs'}): an {@code s'} followed by whitespace
     * or the end of the input. The follower is only looked at, not consumed, so the separator
     * between the possessive and the next word survives the rewrite.
     */
    private static final Pattern PLURAL_POSSESSIVE = Pattern.compile("s'(?=\\s|$)");

    /**
     * Token boundaries: runs of whitespace, plus the zero-width positions between an ASCII letter
     * and an ASCII digit (in either order).
     */
    private static final Pattern TOKEN_BOUNDARY = Pattern.compile(
            "\\s+|(?<=\\p{Alpha})(?=\\p{Digit})|(?<=\\p{Digit})(?=\\p{Alpha})");

    /**
     * Tokenizes an utterance.
     *
     * <p>Processing steps, in order:
     * <ol>
     *   <li>lower-case the input (locale-independent);</li>
     *   <li>remove anything of the form {@code <...>};</li>
     *   <li>expand or detach contractions: {@code n't} becomes {@code  not}, {@code 're} becomes
     *       {@code  are}, {@code 'll} becomes {@code  will}, {@code i'm} becomes {@code i am};
     *       {@code 's} and {@code 'd} are split off as their own tokens, and a plural possessive
     *       {@code s'} becomes {@code s 's};</li>
     *   <li>split on whitespace and between letters and digits.</li>
     * </ol>
     *
     * <p>The contraction rules are plain substring rules, so for example {@code can't} yields
     * the tokens {@code ca} and {@code not}.
     *
     * @param inputString the text to tokenize
     * @return the tokens in order of appearance, as a fixed-size list; empty when the input
     *         contains no tokens (empty, whitespace-only, or tags only)
     * @throws NullPointerException if {@code inputString} is {@code null}
     */
    public static List<String> tokenize(String inputString) {
        // Locale.ROOT keeps case folding independent of the JVM default locale
        // (under a Turkish locale, "I" would otherwise lower-case to a dotless i).
        String text = inputString.toLowerCase(Locale.ROOT);

        // get rid of markup tags such as <s>, </s>, etc.
        text = TAG.matcher(text).replaceAll("");

        // split up contractions; these are literal substrings, so no regex is needed
        text = text.replace("n't", " not");
        text = text.replace("'re", " are");
        text = text.replace("'ll", " will");
        text = text.replace("i'm", "i am");
        text = text.replace("'s", " 's");
        text = PLURAL_POSSESSIVE.matcher(text).replaceAll("s 's");
        text = text.replace("'d", " 'd");

        text = text.trim();
        if (text.isEmpty()) {
            // Splitting an empty string would produce a single empty token.
            return Arrays.asList(new String[0]);
        }
        // split at white space, between alpha and num, between num and alpha
        return Arrays.asList(TOKEN_BOUNDARY.split(text));
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy