com.github.chen0040.data.text.BasicTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-data-text Show documentation
Java implementation of text processing such as stemmers
There is a newer version: 1.0.3
package com.github.chen0040.data.text;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Created by xschen on 9/10/15.
 */
public class BasicTokenizer implements Tokenizer, Serializable {



    /** A regular expression for letters and numbers. */
    private static final String regexLetterNumber = "[a-zA-Z0-9]";

    /** A regular expression for non-letters and non-numbers. */
    private static final String regexNotLetterNumber = "[^a-zA-Z0-9]";

    /** A regular expression for separators. */
    private static final String regexSeparator = "[\\?!()\";/\\|`]";

    /** A regular expression for separators. */
    private static final String regexClitics =
            "'|:|-|'S|'D|'M|'LL|'RE|'VE|N'T|'s|'d|'m|'ll|'re|'ve|n't";

    /** Abbreviations. */
    private static final List abbrList =
            Arrays.asList("Co.", "Corp.", "vs.", "e.g.", "etc.", "ex.", "cf.",
                    "eg.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.",
                    "Sept.", "Oct.", "Nov.", "Dec.", "jan.", "feb.", "mar.",
                    "apr.", "jun.", "jul.", "aug.", "sept.", "oct.", "nov.",
                    "dec.", "ed.", "eds.", "repr.", "trans.", "vol.", "vols.",
                    "rev.", "est.", "b.", "m.", "bur.", "d.", "r.", "M.", "Dept.",
                    "MM.", "U.", "Mr.", "Jr.", "Ms.", "Mme.", "Mrs.", "Dr.",
                    "Ph.D.");
    private static final long serialVersionUID = -999803747111655623L;


    public BasicTokenizer() {

    }

    /**
     * Tokenizes a string using the algorithms by Grefenstette (1999) and
     * Palmer (2000).
     */
    public List tokenize(String str) {

        List tokenList = new ArrayList<>();

        // Changes tabs into spaces.
        str = str.replaceAll("\\t", " ");

        // Puts blanks around unambiguous separators.
        str = str.replaceAll("(" + regexSeparator + ")", " $1 ");

        // Puts blanks around commas
        str = str.replaceAll("([^\\s]),", "$1 ,");
        str = str.replaceAll(",([^\\s])", " , $1");

        // Distinguishes single quotes from apstrophes by segmenting off
        // single quotes not preceded by letters.
        str = str.replaceAll("^(')", "$1 ");
        str = str.replaceAll("(" + regexNotLetterNumber + ")'", "$1 '");

        // Segments off unambiguous word-final clitics and punctuations.
        str = str.replaceAll("(" + regexClitics + ")$", " $1");
        str = str.replaceAll(
                "(" + regexClitics + ")(" + regexNotLetterNumber + ")",
                " $1 $2");

        // Deals with periods.
        String[] words = str.trim().split("\\s+");
        Pattern p1 = Pattern.compile(".*" + regexLetterNumber + "\\.");
        Pattern p2 = Pattern.compile(
                "^([A-Za-z]\\.([A-Za-z]\\.)+|[A-Z][bcdfghj-nptvxz]+\\.)$");
        for (String word : words) {
            Matcher m1 = p1.matcher(word);
            Matcher m2 = p2.matcher(word);
            if (m1.matches() && !abbrList.contains(word) && !m2.matches()) {
                // Segments off the period.
                tokenList.add(word.substring(0, word.length() - 1));
                tokenList.add(word.substring(word.length() - 1));
            } else {
                tokenList.add(word);
            }
        }

        return tokenList;
    }

    private static BasicTokenizer tokenizer;

    private static BasicTokenizer getTokenizer(){
        if(tokenizer==null){
            tokenizer = new BasicTokenizer();
        }
        return tokenizer;
    }

    public static List doTokenize(String text){
        return getTokenizer().tokenize(text);
    }

    public static List doTokenize(List text){
        List result = new ArrayList<>();
        for(int i=0; i < text.size(); ++i){
            result.addAll(doTokenize(text.get(i)));
        }
        return result;
    }
}