All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.tokenizers.NGramTokenizer Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * NGramTokenizer.java
 * Copyright (C) 2007-2012 University of Waikato
 */

package weka.core.tokenizers;

import java.util.Collections;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.Vector;

import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.Utils;

/**
 * Splits a string into an n-gram with min and max grams.
 * <p>
 * Valid options are:
 *
 * <pre>
 * -delimiters &lt;value&gt;
 *  The delimiters to use
 *  (default ' \r\n\t.,;:'"()?!').
 * </pre>
 *
 * <pre>
 * -max &lt;int&gt;
 *  The max size of the Ngram (default = 3).
 * </pre>
 *
 * <pre>
 * -min &lt;int&gt;
 *  The min size of the Ngram (default = 1).
 * </pre>
 *
* * * * @author Sebastian Germesin ([email protected]) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 10971 $ */ public class NGramTokenizer extends CharacterDelimitedTokenizer { /** for serialization */ private static final long serialVersionUID = -2181896254171647219L; /** the maximum number of N */ protected int m_NMax = 3; /** the minimum number of N */ protected int m_NMin = 1; /** the current length of the N-grams */ protected int m_N; /** the number of strings available */ protected int m_MaxPosition; /** the current position for returning elements */ protected int m_CurrentPosition; /** all the available grams */ protected String[] m_SplitString; /** * Returns a string describing the stemmer * * @return a description suitable for displaying in the explorer/experimenter * gui */ @Override public String globalInfo() { return "Splits a string into an n-gram with min and max grams."; } /** * Returns an enumeration of all the available options.. * * @return an enumeration of all available options. */ @Override public Enumeration




© 2015 - 2025 Weber Informatics LLC | Privacy Policy