weka.core.tokenizers.NGramTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-stable Show documentation
Show all versions of weka-stable Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This is the stable version. Apart from bugfixes, this version
does not receive any other updates.
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* NGramTokenizer.java
* Copyright (C) 2007 University of Waikato
*/
package weka.core.tokenizers;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.NoSuchElementException;
import java.util.Vector;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.Utils;
/**
 * Splits a string into n-grams whose size ranges from a configurable
 * minimum to a configurable maximum number of grams.
 * <p>
 * Valid options are:
 *
 * <pre>
 * -delimiters &lt;value&gt;
 *  The delimiters to use
 *  (default ' \r\n\t.,;:'"()?!').
 * </pre>
 *
 * <pre>
 * -max &lt;int&gt;
 *  The max size of the Ngram (default = 3).
 * </pre>
 *
 * <pre>
 * -min &lt;int&gt;
 *  The min size of the Ngram (default = 1).
 * </pre>
 *
 * @author Sebastian Germesin ([email protected])
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.4 $
 */
public class NGramTokenizer
  extends CharacterDelimitedTokenizer {

  /** for serialization */
  private static final long serialVersionUID = -2181896254171647219L;

  /** the maximum number of N */
  protected int m_NMax = 3;

  /** the minimum number of N */
  protected int m_NMin = 1;

  /** the current length of the N-grams */
  protected int m_N;

  /** the number of strings available */
  protected int m_MaxPosition;

  /** the current position for returning elements */
  protected int m_CurrentPosition;

  /** all the available grams */
  protected String[] m_SplitString;

  /**
   * Returns a string describing the tokenizer.
   *
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  @Override
  public String globalInfo() {
    return "Splits a string into an n-gram with min and max grams.";
  }

  /**
   * Returns an enumeration of all the available options: the options reported
   * by the superclass followed by <code>-max</code> and <code>-min</code>.
   *
   * @return an enumeration of all available options.
   */
  @Override
  public Enumeration listOptions() {
    // The superclass's elements are passed through untouched, so the
    // container is deliberately typed as Object rather than Option.
    Vector<Object> result = new Vector<Object>();

    Enumeration<?> inherited = super.listOptions();
    while (inherited.hasMoreElements()) {
      result.addElement(inherited.nextElement());
    }

    result.addElement(new Option(
      "\tThe max size of the Ngram (default = 3).",
      "max", 1, "-max <int>"));

    result.addElement(new Option(
      "\tThe min size of the Ngram (default = 1).",
      "min", 1, "-min <int>"));

    return result.elements();
  }

  /**
   * Gets the current option settings for the OptionHandler: the superclass's
   * options followed by the current <code>-max</code> and <code>-min</code>
   * values.
   *
   * @return the list of current option settings as an array of strings
   */
  @Override
  public String[] getOptions() {
    Vector<String> result = new Vector<String>();

    for (String option : super.getOptions()) {
      result.add(option);
    }

    result.add("-max");
    result.add("" + getNGramMaxSize());

    result.add("-min");
    result.add("" + getNGramMinSize());

    return result.toArray(new String[result.size()]);
  }

  /**
   * Parses a given list of options. The superclass parses its own options
   * first; afterwards <code>-max</code> and <code>-min</code> are read. An
   * option that is absent resets the corresponding size to its default.
   * <p>
   * Valid options are:
   *
   * <pre>
   * -delimiters &lt;value&gt;
   *  The delimiters to use
   *  (default ' \r\n\t.,;:'"()?!').
   * </pre>
   *
   * <pre>
   * -max &lt;int&gt;
   *  The max size of the Ngram (default = 3).
   * </pre>
   *
   * <pre>
   * -min &lt;int&gt;
   *  The min size of the Ngram (default = 1).
   * </pre>
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported, or if the value given
   *           for <code>-max</code>/<code>-min</code> is not an integer
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    String value;

    super.setOptions(options);

    value = Utils.getOption("max", options);
    if (value.length() != 0) {
      setNGramMaxSize(Integer.parseInt(value));
    } else {
      setNGramMaxSize(3);
    }

    value = Utils.getOption("min", options);
    if (value.length() != 0) {
      setNGramMinSize(Integer.parseInt(value));
    } else {
      setNGramMinSize(1);
    }
  }

  /**
   * Gets the max N of the NGram.
   *
   * @return the size (N) of the NGram.
   */
  public int getNGramMaxSize() {
    return m_NMax;
  }

  /**
   * Sets the max size of the Ngram. Values below 1 are silently replaced
   * by 1.
   *
   * @param value the size of the NGram.
   */
  public void setNGramMaxSize(int value) {
    m_NMax = Math.max(1, value);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String NGramMaxSizeTipText() {
    return "The max N of the NGram.";
  }

  /**
   * Sets the min size of the Ngram. Values below 1 are silently replaced
   * by 1.
   *
   * @param value the size of the NGram.
   */
  public void setNGramMinSize(int value) {
    m_NMin = Math.max(1, value);
  }

  /**
   * Gets the min N of the NGram.
   *
   * @return the size (N) of the NGram.
   */
  public int getNGramMinSize() {
    return m_NMin;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String NGramMinSizeTipText() {
    return "The min N of the NGram.";
  }

  /**
   * Returns true if there are more elements available.
   *
   * @return true if there are more elements available
   */
  @Override
  public boolean hasMoreElements() {
    // nextElement() decrements m_N each time all grams of the current size
    // have been returned, so iteration is finished once m_N drops below the
    // configured minimum.
    return (m_N >= m_NMin);
  }

  /**
   * Returns N-grams and also (N-1)-grams and .... and min-grams: all grams
   * of the current size are returned from left to right, then the size is
   * reduced by one and the position restarts at the beginning.
   *
   * @return the next element, its words joined by single blanks
   * @throws NoSuchElementException if no more elements are available
   */
  @Override
  public Object nextElement() {
    if (!hasMoreElements()) {
      throw new NoSuchElementException("No more n-grams available.");
    }

    StringBuilder gram = new StringBuilder();
    for (int i = 0; i < m_N; i++) {
      if (i > 0) {
        gram.append(' ');
      }
      gram.append(m_SplitString[m_CurrentPosition + i]);
    }

    m_CurrentPosition++;

    // The window of m_N words would now run past the last word: start over
    // at the beginning with grams that are one word shorter.
    if (m_CurrentPosition + m_N - 1 == m_MaxPosition) {
      m_CurrentPosition = 0;
      m_N--;
    }

    return gram.toString().trim();
  }

  /**
   * Filters out empty strings in m_SplitString and replaces m_SplitString with
   * the cleaned version.
   *
   * @see #m_SplitString
   */
  protected void filterOutEmptyStrings() {
    LinkedList<String> clean = new LinkedList<String>();

    for (String token : m_SplitString) {
      if (!token.equals("")) {
        clean.add(token);
      }
    }

    m_SplitString = clean.toArray(new String[clean.size()]);
  }

  /**
   * Sets the string to tokenize. Tokenization happens immediately.
   *
   * @param s the string to tokenize
   */
  @Override
  public void tokenize(String s) {
    // NOTE(review): the delimiters are placed verbatim inside a regex
    // character class, so characters such as ']' or '\' in the delimiter
    // set would alter the pattern - confirm what getDelimiters() can return.
    m_SplitString = s.split("[" + getDelimiters() + "]");

    filterOutEmptyStrings();

    m_CurrentPosition = 0;
    m_MaxPosition = m_SplitString.length;

    // Start with the largest gram size, capped by the number of words found.
    m_N = Math.min(m_NMax, m_MaxPosition);
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.4 $");
  }

  /**
   * Runs the tokenizer with the given options and strings to tokenize. The
   * tokens are printed to stdout.
   *
   * @param args the commandline options and strings to tokenize
   */
  public static void main(String[] args) {
    runTokenizer(new NGramTokenizer(), args);
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy