All Downloads are FREE. The search and download functionality uses the official Maven repository.

com.aliasi.tokenizer.Tokenization Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */
package com.aliasi.tokenizer;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Strings;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A {@code Tokenization} represents the result of tokenizing a
 * string.  Tokenizations are constructed from a character sequence
 * and a tokenizer factory.  A tokenization contains the underlying
 * text, tokens, and token start/end positions in the text.
 *
 * 

Equality and Hash Codes

* * Two tokenizations are equal if they have the same text, tokens, * whitespaces, and start/end positions for the tokens. * *

Hash codes are consistent with equality. They only depend on * the text and number of tokens. * *

Serialization

* * A tokenization may be serialized. Deserialization should * produce an identical tokenization. * *

Thread Safety

* * After safely published, objects are completely thread safe. * The text and tokenizer factory should not be modified concurrently * with construction. * * @author Bob Carpenter * @version 3.9 * @since LingPipe3.9 */ public class Tokenization implements Serializable { static final long serialVersionUID = 3806073293589459401L; private final String mText; private final List mTokens; private final List mWhitespaces; private final int[] mTokenStarts; private final int[] mTokenEnds; /** * Construct a tokenization from the specified text and tokenizer * factory. The text is converted to a string so that subsequent * changes to the text will not affect this class. (Note that * the text should not be changed concurrently with constructing * a tokenization.) * * @param cs Underlying character array. * @param start Index of first character in slice. * @param length Length of slice. * @param factory Tokenizer factory to use for tokenization. * @throws IndexOutOfBoundsException If the start and length * indices are outside of bounds of the array. */ public Tokenization(char[] cs, int start, int length, TokenizerFactory factory) { this(new String(cs,start,length), factory.tokenizer(cs,start,length)); } /** * Construct a tokenization from the specified text and tokenizer * factory. * * @param text Underlying text for tokenization. * @param factory Tokenizer factory to perform tokenization. */ public Tokenization(String text, TokenizerFactory factory) { this(text,factory.tokenizer(text.toCharArray(),0,text.length())); } /** * Construct a tokenization from the specified components. The * arrays and lists are copied so that modifications to them * will not affect the constructed object after construction. * * @param text Underlying text. * @param tokens List of tokens. * @param whitespaces List of whitespaces. * @param tokenStarts Offset of first character in tokens. * @param tokenEnds Offset of last character plus one in tokens. 
* @throws IllegalArgumentException If the number of whitespaces is not * equal to the number of tokens plus one, a tokens start occurs after * a token end, or a token start or end is out of bounds for the text. */ public Tokenization(String text, List tokens, List whitespaces, int[] tokenStarts, int[] tokenEnds) { this(text, new ArrayList(tokens), new ArrayList(whitespaces), tokenStarts.clone(), tokenEnds.clone(), false); if (tokens.size() != whitespaces.size() - 1) { String msg = "Require one more whitespace than token." + " Found tokens.size()=" + tokens.size() + " whitespaces.size()=" + whitespaces.size(); throw new IllegalArgumentException(msg); } if (tokenStarts.length != tokens.size()) { String msg = "Require token starts to be same length as tokens" + " Found tokenStarts.length=" + tokenStarts.length + " tokenEnds.length=" + tokenEnds.length; throw new IllegalArgumentException(msg); } if (tokenEnds.length != tokens.size()) { String msg = "Require token starts to be same length as tokens" + " Found tokenEnds.length=" + tokenEnds.length + " tokenEnds.length=" + tokenEnds.length; throw new IllegalArgumentException(msg); } for (int i = 0; i < tokenStarts.length; ++i) { if (tokenStarts[i] < 0) { String msg = "Token starts must be non-negative." + " Found tokenStarts[" + i + "]=" + tokenStarts[i]; throw new IllegalArgumentException(msg); } if (tokenEnds[i] > text.length()) { String msg = "Token ends must be less than or equal to text length." + " Found tokenEnds[" + i + "]=" + tokenEnds[i] + " text.length()=" + text.length(); throw new IllegalArgumentException(msg); } if (tokenStarts[i] > tokenEnds[i]) { String msg = "Token starts must be less than or equal to ends." 
+ " Found tokenStarts[" + i + "]=" + tokenStarts[i] + " tokenEnds[" + i + "]=" + tokenEnds[i]; throw new IllegalArgumentException(msg); } } } Tokenization(String text, List tokens, List whitespaces, int[] tokenStarts, int[] tokenEnds, boolean ignore) { // dummy var to distinguish constructor mText = text; mTokens = tokens; mWhitespaces = whitespaces; mTokenStarts = tokenStarts; mTokenEnds = tokenEnds; } Tokenization(String text, Tokenizer tokenizer) { mText = text; List tokens = new ArrayList(); List whitespaces = new ArrayList(); List starts = new ArrayList(); List ends = new ArrayList(); String token; whitespaces.add(tokenizer.nextWhitespace()); while ((token = tokenizer.nextToken()) != null) { tokens.add(token); whitespaces.add(tokenizer.nextWhitespace()); starts.add(tokenizer.lastTokenStartPosition()); ends.add(tokenizer.lastTokenEndPosition()); } mTokens = tokens; mWhitespaces = whitespaces; mTokenStarts = new int[starts.size()]; mTokenEnds = new int[starts.size()]; for (int i = 0; i < starts.size(); ++i) { mTokenStarts[i] = starts.get(i); mTokenEnds[i] = ends.get(i); } } /** * Return the underlying text for this tokenization. * * @return Text for tokenization. */ public String text() { return mText; } /** * Return the number of tokens in this tokenization. * * @return The number of tokens. */ public int numTokens() { return mTokens.size(); } /** * Return the token at the specified input position. * * @param n Position of token. * @return Token at specified position. * @throws IndexOutOfBoundsException If the position is less than 0 or * greater than or equal to the number of tokens. */ public String token(int n) { return mTokens.get(n); } /** * Return the whitespace before the token at the specified * input position, or the last whitespace if the specified * position is the number of tokens. * * @param n Position of token. * @return Whitespace before the token in the specified position. 
* @throws IndexOutOfBoundsException If the position is less than 0 * or greater than the number of tokens. */ public String whitespace(int n) { return mWhitespaces.get(n); } /** * Return the position of the first character in the specified * input position. * * @param n Position of token. * @return The index of the first character in the specified * token. * @throws IndexOutOfBoundsException If the position is less than 0 or * greater than or equal to the number of tokens. */ public int tokenStart(int n) { return mTokenStarts[n]; } /** * Return the position of one past the last character in the * specified input position. * * @param n Position of token. * @return The index of the last character plus one for the * specified token. * @throws IndexOutOfBoundsException If the position is less than 0 or * greater than or equal to the number of tokens. */ public int tokenEnd(int n) { return mTokenEnds[n]; } /** * Returns the array of tokens underlying this tokenization. This * array's length is the number of tokens and it is indexed by * token position. * *

The array is copied from the underlying list of tokens, so * modifying it will not affect this tokenization. * * @return Array of tokens for this tokenization. */ public String[] tokens() { return mTokens.toArray(Strings.EMPTY_STRING_ARRAY); } /** * Return the array of whitespaces for this tokenization. * The array's length is one greater than the number of tokens, and it * is indexed by following token position. * *

The array is copied from the underlying list of tokens, so * modifying it will not affect this tokenization. * * @return Array of whitespaces for this tokenization. */ public String[] whitespaces() { return mWhitespaces.toArray(Strings.EMPTY_STRING_ARRAY); } /** * Returns an unmodifiable view of the list of tokens * for this tokenization. * * @return List of tokens for this tokenization. */ public List tokenList() { return Collections.unmodifiableList(mTokens); } /** * Returns an unmodifiable view of the list of whitespaces * for this tokenization. * * @return List of whitespaces for this tokenization. */ public List whitespaceList() { return Collections.unmodifiableList(mWhitespaces); } /** * Returns {@code true} if the specified object is a tokenization * that is equal to this one. Equality is defined as having the * same text, tokens, whitespaces, and token start and end positions. */ @Override public boolean equals(Object that) { if (this == that) return true; if (!(that instanceof Tokenization)) return false; Tokenization thatT = (Tokenization) that; if (!text().equals(thatT.text())) return false; if (numTokens() != thatT.numTokens()) return false; for (int n = 0; n < numTokens(); ++n) { if (!token(n).equals(thatT.token(n))) return false; if (!whitespace(n).equals(thatT.whitespace(n))) return false; if (tokenStart(n) != thatT.tokenStart(n)) return false; if (tokenEnd(n) != thatT.tokenEnd(n)) return false; } if (!whitespace(numTokens()).equals(thatT.whitespace(numTokens()))) return false; return true; } /** * Returns the hash code for this tokenization. The hash code is * consistent with equality, but only considers the text and * number of tokens. * * @return The hash code for this tokenization. 
*/ @Override public int hashCode() { return 31 * mText.hashCode() + mTokens.size(); } Object writeReplace() { return new Serializer(this); } static class Serializer extends AbstractExternalizable { static final long serialVersionUID = 5248361056143805108L; Tokenization mToks; public Serializer() { this(null); } public Serializer(Tokenization toks) { mToks = toks; } public void writeExternal(ObjectOutput out) throws IOException { out.writeUTF(mToks.mText); out.writeInt(mToks.mTokens.size()); for (String token : mToks.mTokens) out.writeUTF(token); for (String whitespace : mToks.mWhitespaces) out.writeUTF(whitespace); writeInts(mToks.mTokenStarts,out); writeInts(mToks.mTokenEnds,out); } public Object read(ObjectInput in) throws IOException { String text = in.readUTF(); int len = in.readInt(); List tokens = new ArrayList(len); for (int i = 0; i < len; ++i) tokens.add(in.readUTF()); List whitespaces = new ArrayList(len+1); for (int i = 0; i <= len; ++i) whitespaces.add(in.readUTF()); int[] tokenStarts = readInts(in); int[] tokenEnds = readInts(in); boolean ignoreMe = true; return new Tokenization(text,tokens,whitespaces,tokenStarts,tokenEnds,ignoreMe); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy