com.aliasi.tokenizer.NGramTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
Show all versions of aliasi-lingpipe Show documentation
This is the original Lingpipe:
http://alias-i.com/lingpipe/web/download.html
No changes were made to the source code.
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.tokenizer;
/**
* An <code>NGramTokenizer</code> returns character subsequences
* of specified lengths as tokens. The constructor allows
* the minimum and maximum length of n-gram to be set.
*
* @author Bob Carpenter
* @version 3.8.1
* @since LingPipe1.0
*/
class NGramTokenizer extends Tokenizer {

    /**
     * Character array from which n-grams are drawn.
     */
    private final char[] mChars;

    /**
     * First character to consider in character array.
     */
    private final int mOffset;

    /**
     * Number of characters to consider in character array.
     */
    private final int mLength;

    /**
     * Maximum length n-gram generated.  Never larger than the slice
     * length, because no longer n-gram can be drawn from the slice.
     */
    private final int mMaxNGram;

    /**
     * The size of the next n-gram to generate.
     */
    private int mCurrentSize;

    /**
     * The next starting point in the character array from which to
     * generate an n-gram.
     */
    private int mNextStart;

    /**
     * Start of the last token returned, relative to the slice offset,
     * or <code>-1</code> if no token has been returned yet.
     */
    private int mLastTokenStartPosition = -1;

    /**
     * End of the last token returned (start plus n-gram length),
     * relative to the slice offset, or <code>-1</code> if no token
     * has been returned yet.
     */
    private int mLastTokenEndPosition = -1;

    /**
     * Construct an n-gram tokenizer based on the specified character
     * array slice, returning n-gram tokens between the specified
     * minimum and maximum lengths, inclusive.  All n-grams of the
     * minimum length are returned first, in order of start position,
     * followed by the n-grams of each successively larger length.
     *
     * @param ch Character array from which to derive tokens.
     * @param offset First character to consider.
     * @param length Number of characters to consider.
     * @param minNGram Minimum length n-gram to return.
     * @param maxNGram Maximum length n-gram to return.
     * @throws IllegalArgumentException If <code>minNGram &gt; maxNGram</code>.
     */
    public NGramTokenizer(char[] ch, int offset, int length,
                          int minNGram, int maxNGram) {
        if (minNGram > maxNGram) {
            String msg = "Require min n-gram to be less than or equal to max n-gram."
                + " Found min=" + minNGram
                + " found max=" + maxNGram;
            throw new IllegalArgumentException(msg);
        }
        mChars = ch;
        mOffset = offset;
        mLength = length;
        // An n-gram longer than the slice can never be produced, so cap
        // the maximum at the slice length.  This keeps nextToken() from
        // stepping through (and eventually overflowing on) sizes that
        // cannot yield a token when a very large maximum is supplied.
        mMaxNGram = Math.min(maxNGram, length);
        mCurrentSize = minNGram; // don't need to store min n-gram with this
        mNextStart = mOffset;
    }

    /**
     * Returns the start position in the underlying character slice
     * of the last token returned.  For an n-gram tokenizer, this
     * is the index of the first character in the n-gram, counted
     * from the start of the slice rather than the start of the array.
     *
     * @return The index of the character starting the last token
     * returned, or <code>-1</code> if no token has been returned.
     */
    @Override
    public int lastTokenStartPosition() {
        return mLastTokenStartPosition;
    }

    /**
     * Returns the end position in the underlying character slice
     * of the last token returned.  For an n-gram tokenizer, this
     * is the start position plus the length of the n-gram, i.e. one
     * past the index of the last character in the n-gram.
     *
     * @return The index one past the last character of the last token
     * returned, or <code>-1</code> if no token has been returned.
     */
    @Override
    public int lastTokenEndPosition() {
        return mLastTokenEndPosition;
    }

    /**
     * Returns the next n-gram token for this tokenizer, or null
     * if there are no more tokens.
     *
     * @return Next n-gram token for this tokenizer.
     */
    @Override
    public String nextToken() {
        // When an n-gram of the current size no longer fits in the slice
        // at the next start position, move on to the next size and
        // restart from the beginning of the slice.
        while (mCurrentSize <= mMaxNGram
               && mNextStart + mCurrentSize > mOffset + mLength) {
            ++mCurrentSize;
            mNextStart = mOffset;
        }
        if (mCurrentSize > mMaxNGram) {
            return null;
        }
        // Positions are reported relative to the slice, not the array.
        mLastTokenStartPosition = mNextStart - mOffset;
        mLastTokenEndPosition = mLastTokenStartPosition + mCurrentSize;
        return new String(mChars, mNextStart++, mCurrentSize);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy