All downloads are free. The search and download functionality uses the official Maven repository.

eu.fbk.twm.utils.analysis.StandardTokenizer Maven / Gradle / Ivy

The newest version!
package eu.fbk.twm.utils.analysis;

import org.apache.log4j.Logger;

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;

/**
 * Created with IntelliJ IDEA.
 * User: giuliano
 * Date: 1/15/13
 * Time: 2:02 PM
 * To change this template use File | Settings | File Templates.
 */
public class StandardTokenizer extends AbstractTokenizer implements Tokenizer {
	/**
	 * Class-wide log4j logger, obtained under the name returned by
	 * {@code StandardTokenizer.class.getName()}.
	 */
	static Logger logger = Logger.getLogger(StandardTokenizer.class.getName());

	// Shared instance handed out by getInstance(); stays null until the first call creates it.
	private static StandardTokenizer ourInstance = null;

	/**
	 * Returns the shared {@code StandardTokenizer}, creating it on the first call.
	 * The method is {@code synchronized}, so the check and the assignment of the
	 * shared field happen under the class lock.
	 *
	 * @return the single shared tokenizer instance
	 */
	public static synchronized StandardTokenizer getInstance() {
		StandardTokenizer instance = ourInstance;
		if (instance != null) {
			return instance;
		}
		// First call: build the tokenizer and publish it for later callers.
		instance = new StandardTokenizer();
		ourInstance = instance;
		return instance;
	}

	/**
	 * Splits the text into word-level segments and returns their surface forms.
	 * <p>
	 * Segment boundaries come from {@link BreakIterator#getWordInstance()}. A segment is
	 * kept unless {@code isSeparatorChar} reports its first character as a separator.
	 *
	 * @param text the text to tokenize
	 * @return the kept segments in order of appearance; an empty array if none is kept
	 */
	public String[] stringArray(String text) {
		// The list must be typed: on a raw List, toArray(T[]) erases to Object[],
		// which cannot be returned as String[].
		List<String> forms = new ArrayList<String>();
		BreakIterator boundary = BreakIterator.getWordInstance();
		boundary.setText(text);
		int start = boundary.first();
		for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
			String form = text.substring(start, end);
			// Only the first character decides whether the whole segment is dropped.
			if (!isSeparatorChar(form.charAt(0))) {
				forms.add(form);
			}
		}
		return forms.toArray(new String[forms.size()]);
	}

	/**
	 * Splits the text into word-level segments and returns them as {@code Token}s.
	 * <p>
	 * Segment boundaries come from {@link BreakIterator#getWordInstance()}. A segment is
	 * kept unless {@code isSeparatorChar} reports its first character as a separator.
	 * Each kept segment is wrapped as {@code new Token(start, end, form)}, where
	 * {@code start} and {@code end} are the boundary offsets passed to
	 * {@code text.substring(start, end)}.
	 *
	 * @param text the text to tokenize
	 * @return the kept tokens in order of appearance; an empty array if none is kept
	 */
	public Token[] tokenArray(String text) {
		// The list must be typed: on a raw List, toArray(T[]) erases to Object[],
		// which cannot be returned as Token[].
		List<Token> tokens = new ArrayList<Token>();
		BreakIterator boundary = BreakIterator.getWordInstance();
		boundary.setText(text);
		int start = boundary.first();
		for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
			String form = text.substring(start, end);
			// Only the first character decides whether the whole segment is dropped.
			if (!isSeparatorChar(form.charAt(0))) {
				tokens.add(new Token(start, end, form));
			}
		}
		return tokens.toArray(new Token[tokens.size()]);
	}


	/*int indexOfApostrophe(String form)
	{
		char ch;
		for (int i=0;i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy