All downloads are free. The search and download functionality uses the official Maven repository.

weka.core.tokenizers.NGramTokenizer Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.

There is a newer version: 3.8.6
Show newest version
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * NGramTokenizer.java
 * Copyright (C) 2007 University of Waikato
 */

package weka.core.tokenizers;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Vector;

import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.Utils;

/**
 *  Splits a string into an n-gram with min and max
 * grams.
 * 

* * * Valid options are: *

* *

 * -delimiters <value>
 *  The delimiters to use
 *  (default ' \r\n\t.,;:'"()?!').
 * 
* *
 * -max <int>
 *  The max size of the Ngram (default = 3).
 * 
* *
 * -min <int>
 *  The min size of the Ngram (default = 1).
 * 
* * * * @author Sebastian Germesin ([email protected]) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 1.4 $ */ public class NGramTokenizer extends CharacterDelimitedTokenizer { /** for serialization */ private static final long serialVersionUID = -2181896254171647219L; /** the maximum number of N */ protected int m_NMax = 3; /** the minimum number of N */ protected int m_NMin = 1; /** the current length of the N-grams */ protected int m_N; /** the number of strings available */ protected int m_MaxPosition; /** the current position for returning elements */ protected int m_CurrentPosition; /** all the available grams */ protected String[] m_SplitString; /** * Returns a string describing the stemmer * * @return a description suitable for displaying in the explorer/experimenter * gui */ @Override public String globalInfo() { return "Splits a string into an n-gram with min and max grams."; } /** * Returns an enumeration of all the available options.. * * @return an enumeration of all available options. */ @Override public Enumeration listOptions() { Vector result; Enumeration enm; result = new Vector(); enm = super.listOptions(); while (enm.hasMoreElements()) { result.addElement(enm.nextElement()); } result.addElement(new Option( "\tThe max size of the Ngram (default = 3).", "max", 1, "-max ")); result.addElement(new Option( "\tThe min size of the Ngram (default = 1).", "min", 1, "-min ")); return result.elements(); } /** * Gets the current option settings for the OptionHandler. 
* * @return the list of current option settings as an array of strings */ @Override public String[] getOptions() { Vector result; String[] options; int i; result = new Vector(); options = super.getOptions(); for (i = 0; i < options.length; i++) { result.add(options[i]); } result.add("-max"); result.add("" + getNGramMaxSize()); result.add("-min"); result.add("" + getNGramMinSize()); return result.toArray(new String[result.size()]); } /** * Parses a given list of options. *

* * Valid options are: *

* *

   * -delimiters <value>
   *  The delimiters to use
   *  (default ' \r\n\t.,;:'"()?!').
   * 
* *
   * -max <int>
   *  The max size of the Ngram (default = 3).
   * 
* *
   * -min <int>
   *  The min size of the Ngram (default = 1).
   * 
* * * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String value; super.setOptions(options); value = Utils.getOption("max", options); if (value.length() != 0) { setNGramMaxSize(Integer.parseInt(value)); } else { setNGramMaxSize(3); } value = Utils.getOption("min", options); if (value.length() != 0) { setNGramMinSize(Integer.parseInt(value)); } else { setNGramMinSize(1); } } /** * Gets the max N of the NGram. * * @return the size (N) of the NGram. */ public int getNGramMaxSize() { return m_NMax; } /** * Sets the max size of the Ngram. * * @param value the size of the NGram. */ public void setNGramMaxSize(int value) { if (value < 1) { m_NMax = 1; } else { m_NMax = value; } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String NGramMaxSizeTipText() { return "The max N of the NGram."; } /** * Sets the min size of the Ngram. * * @param value the size of the NGram. */ public void setNGramMinSize(int value) { if (value < 1) { m_NMin = 1; } else { m_NMin = value; } } /** * Gets the min N of the NGram. * * @return the size (N) of the NGram. */ public int getNGramMinSize() { return m_NMin; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String NGramMinSizeTipText() { return "The min N of the NGram."; } /** * returns true if there's more elements available * * @return true if there are more elements available */ @Override public boolean hasMoreElements() { // return (m_CurrentPosition < m_MaxPosition && // m_N - 1 + m_CurrentPosition < m_MaxPosition && // m_N >= m_NMin); return (m_N >= m_NMin); } /** * Returns N-grams and also (N-1)-grams and .... and 1-grams. 
* * @return the next element */ @Override public Object nextElement() { String retValue = ""; // for (int i = 0; i < m_N && i + m_CurrentPosition < m_MaxPosition; i++) // retValue += " " + m_SplitString[m_CurrentPosition + i]; // for (int i = 0; i < m_N; i++) { retValue += " " + m_SplitString[m_CurrentPosition + i]; } m_CurrentPosition++; if (m_CurrentPosition + m_N - 1 == m_MaxPosition) { m_CurrentPosition = 0; m_N--; } return retValue.trim(); } /** * filters out empty strings in m_SplitString and replaces m_SplitString with * the cleaned version. * * @see #m_SplitString */ protected void filterOutEmptyStrings() { String[] newSplit; LinkedList clean = new LinkedList(); for (int i = 0; i < m_SplitString.length; i++) { if (!m_SplitString[i].equals("")) { clean.add(m_SplitString[i]); } } newSplit = new String[clean.size()]; for (int i = 0; i < clean.size(); i++) { newSplit[i] = clean.get(i); } m_SplitString = newSplit; } /** * Sets the string to tokenize. Tokenization happens immediately. * * @param s the string to tokenize */ @Override public void tokenize(String s) { m_N = m_NMax; m_SplitString = s.split("[" + getDelimiters() + "]"); filterOutEmptyStrings(); m_CurrentPosition = 0; m_MaxPosition = m_SplitString.length; if (m_SplitString.length < m_NMax) { m_N = m_SplitString.length; } } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 1.4 $"); } /** * Runs the tokenizer with the given options and strings to tokenize. The * tokens are printed to stdout. * * @param args the commandline options and strings to tokenize */ public static void main(String[] args) { runTokenizer(new NGramTokenizer(), args); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy