All downloads are free. The search and download functionality uses the official Maven repository.

com.aliasi.lm.TrieIntSeqCounter Maven / Gradle / Ivy

Go to download

This is the original LingPipe (http://alias-i.com/lingpipe/web/download.html). No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lm;

import com.aliasi.corpus.ObjectHandler;

import com.aliasi.util.ObjectToCounterMap;

/**
 * A <code>TrieIntSeqCounter</code> implements an integer sequence
 * counter with a trie structure of counts.
 *
 * <p><i>Implementation Note:</i> This trie-based integer sequence
 * counter is not as tight in memory as the character tries, but is
 * much more efficient for nodes with many daughters.  It unfolds
 * 1-daughter and 2-daughter nodes, and beyond that uses
 * balanced binary trees (via <code>java.util.TreeMap</code>).
 *
 * @author  Bob Carpenter
 * @version 3.9
 * @since   LingPipe2.0
 */
public class TrieIntSeqCounter implements IntSeqCounter {

    private final int mMaxLength;
    final IntNode mRootNode;

    /**
     * Construct an integer sequence counter for subsequences
     * up to the specified maximum length.
     *
     * @param maxLength Maximum length of subsequences counted.
     * @throws IllegalArgumentException If the maximum length is
     * less than zero.
     */
    public TrieIntSeqCounter(int maxLength) {
        if (maxLength < 0) {
            String msg = "Max length must be >= 0."
                + " Found maxLength=" + maxLength;
            throw new IllegalArgumentException(msg);
        }
        mMaxLength = maxLength;
        mRootNode = new IntNode();
    }

    /**
     * Removes all counts for sequences that are less than the minimum
     * count.  This operation is safe in that it will never remove the
     * root node.  Pruning is idempotent in that pruning twice with
     * the same count has no effect.
     *
     * @param minCount Minimum count to maintain a node.
     */
    public void prune(int minCount) {
        mRootNode.prune(minCount);
    }

    /**
     * Rescales all counts by multiplying them by the specified
     * factor.  Counts are rounded down by casting back to
     * <code>int</code> after being multiplied by the scaling factor:
     *
     * <blockquote><pre>
     * count = (int) (count * countMultiplier)</pre></blockquote>
     *
     * Unlike pruning, scaling has a cumulative effect and is not
     * idempotent.  For instance, a count of four scaled by half once
     * will be two, and scaled by half twice will be one.  Because of
     * rounding, it's not even guaranteed that rescaling twice,
     * <code>rescale(0.5); rescale(0.5);</code>, returns the same
     * result as rescaling with the combined factor,
     * <code>rescale(0.25);</code>.
     *
     * <p>Also unlike pruning, scaling, because of the integer
     * rounding, may change the ratios between surviving counts.
     * For instance, under scaling by 0.5, both 3 and 2 rescale
     * to 1.
     *
     * @param countMultiplier Amount by which counts are scaled.
     */
    public void rescale(double countMultiplier) {
        mRootNode.rescale(countMultiplier);
    }

    /**
     * Returns the maximum length of subsequence of integers being
     * counted.
     *
     * @return The maximum length of subsequence of integers being
     * counted.
     */
    public int maxLength() {
        return mMaxLength;
    }

    /**
     * Increments the count for all subsequences of the specified
     * integer sequence up to the specified maximum length.  For
     * instance, calling
     * <code>incrementSubsequences({1,3,17,8,122},1,4)</code> with a
     * maximum length of 2 increments the bigram sequence
     * counts <code>{3,17}</code>, <code>{17,8}</code> and the unigram
     * sequence counts <code>{3}</code>, <code>{17}</code> and
     * <code>{8}</code>.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     */
    public void incrementSubsequences(int[] is, int start, int end) {
        checkBoundaries(is, start, end);
        for (int i = start; i < end; ++i) {
            mRootNode.increment(is, i, subsequenceEnd(i, end));
        }
    }

    /**
     * Increments the count for all subsequences of the specified
     * integer sequence up to the specified maximum length with the
     * specified count.  Calling
     * <code>incrementSubsequences(is,start,end,n)</code> is
     * equivalent to calling
     * <code>incrementSubsequences(is,start,end)</code> a total of
     * <code>n</code> times.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @param count Amount to add to each subsequence count; a count
     * of zero leaves the counter untouched.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     * @throws IllegalArgumentException If the count is less than zero.
     */
    public void incrementSubsequences(int[] is, int start, int end,
                                      int count) {
        checkBoundaries(is, start, end);
        checkCount(count);
        if (count == 0) {
            return;
        }
        for (int i = start; i < end; ++i) {
            mRootNode.increment(is, i, subsequenceEnd(i, end), count);
        }
    }

    /**
     * Returns the exclusive end index of the longest counted
     * subsequence that begins at the specified position, namely the
     * smaller of <code>from + maxLength()</code> and <code>end</code>.
     *
     * <p>The comparison is made on <code>end - from</code> rather
     * than on <code>from + maxLength()</code> because the sum wraps
     * around to a negative value when the maximum length is close to
     * <code>Integer.MAX_VALUE</code>, whereas the difference cannot
     * overflow for <code>0 &lt;= from &lt;= end</code>.
     *
     * @param from Index of the first integer of the subsequence.
     * @param end Index of one past the last integer in the slice.
     * @return One past the last index of the subsequence to count.
     */
    private int subsequenceEnd(int from, int end) {
        int maxLength = maxLength();
        return (end - from > maxLength) ? from + maxLength : end;
    }

    /**
     * Throws an exception if the specified count is negative.
     *
     * @param count Count to validate.
     * @throws IllegalArgumentException If the count is less than zero.
     */
    static void checkCount(int count) {
        if (count >= 0) {
            return;
        }
        String msg = "Counts must be non-negative."
            + " Found count=" + count;
        throw new IllegalArgumentException(msg);
    }

    /**
     * Increments the count for the specified slice by the specified
     * amount.  For instance, calling
     * <code>incrementSequence({1,2,3,4},1,3,15)</code> results in the
     * sequence <code>2,3</code> having its count incremented by 15.
     *
     * <p>If the sequence provided is longer than the maximum sequence
     * counted, only its final counts are used.  For example, if the
     * maximum length is 3, then calling
     * <code>incrementSequence({1,2,3,4,5},0,5,12)</code> is
     * equivalent to calling
     * <code>incrementSequence({3,4,5},0,3,12)</code>.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @param count Amount to add to the sequence count; a count of
     * zero leaves the counter untouched.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     * @throws IllegalArgumentException If the count is less than zero.
     */
    public void incrementSequence(int[] is, int start, int end,
                                  int count) {
        checkBoundaries(is, start, end);
        checkCount(count);
        if (count == 0) {
            return;
        }
        // Keep only the last maxLength() integers of the slice.  The
        // subtraction cannot overflow because both operands are
        // non-negative here.
        int truncatedStart = Math.max(start, end - maxLength());
        mRootNode.incrementSequence(is, truncatedStart, end, count);
    }

    /**
     * Returns a histogram of counts for n-grams of integers of
     * the specified length, with a count of at least the specified
     * minimum count.  The resulting counter will be empty
     * if there are no n-grams in this counter of the specified length
     * at or above the specified threshold.  Note that one case of
     * this is if the specified n-gram length is greater than the
     * maximum n-gram length for this counter.
     *
     * @param nGram Length of n-gram whose histogram is returned.
     * @param minCount Minimum count of element in histogram.
     * @return Histogram of counts of n-grams of the specified length
     * with counts at or above the specified minimum.
     * @throws IllegalArgumentException If the n-gram length is less
     * than 1.
     */
    public ObjectToCounterMap<int[]> nGramCounts(int nGram,
                                                 int minCount) {
        checkNGram(nGram);
        ObjectToCounterMap<int[]> result
            = new ObjectToCounterMap<int[]>();
        int[] nGramBuffer = new int[nGram];
        addNGramCounts(minCount, 0, nGram, nGramBuffer, result);
        return result;
    }

    /**
     * Returns the size of this graph, measured in number of nodes
     * in the trie structure.  This is equal to the number of
     * sequences of integers for which this counter stores counts.
     *
     * @return The size of this counter.
     */
    public int trieSize() {
        return mRootNode.trieSize();
    }

    /**
     * Supplies each n-gram of the specified length and with greater
     * than or equal to the specified minimum count to the specified
     * handler.
     *
     * <p>The same array instance is passed to the handler on every
     * call and is overwritten between calls, so a handler that needs
     * to retain an n-gram must copy it.
     *
     * @param nGram Length of n-grams to visit.
     * @param minCount Minimum count of visited n-gram.
     * @param handler Handler for visited n-grams.
     * @throws IllegalArgumentException If the n-gram length is less
     * than 1.
     */
    public void handleNGrams(int nGram, int minCount,
                             ObjectHandler<? super int[]> handler) {
        checkNGram(nGram);
        int[] nGramBuffer = new int[nGram];
        handleNGrams(minCount, 0, nGram, nGramBuffer, handler);
    }

    /**
     * Throws an exception if the specified n-gram length is not
     * positive.
     *
     * @param nGram N-gram length to validate.
     * @throws IllegalArgumentException If the n-gram length is less
     * than 1.
     */
    private static void checkNGram(int nGram) {
        if (nGram >= 1) {
            return;
        }
        String msg = "Ngrams must be positive."
            + " Found n-gram=" + nGram;
        throw new IllegalArgumentException(msg);
    }

    /**
     * Returns the count of the trie node reached by following the
     * specified slice from the root, or zero if there is no such node.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @return Count for the slice, or zero if it has no node.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     */
    public int count(int[] is, int start, int end) {
        checkBoundaries(is, start, end);
        IntNode dtr = mRootNode.getDtr(is, start, end);
        return dtr == null ? 0 : dtr.count();
    }

    /**
     * Returns the extension count reported by the trie node reached
     * by following the specified slice from the root, or zero if
     * there is no such node.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @return Extension count for the slice, or zero if it has no
     * node.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     */
    public long extensionCount(int[] is, int start, int end) {
        checkBoundaries(is, start, end);
        IntNode dtr = mRootNode.getDtr(is, start, end);
        return dtr == null ? 0L : dtr.extensionCount();
    }

    /**
     * Returns the number of extensions reported by the trie node
     * reached by following the specified slice from the root, or
     * zero if there is no such node.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @return Number of extensions for the slice, or zero if it has
     * no node.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     */
    public int numExtensions(int[] is, int start, int end) {
        checkBoundaries(is, start, end);
        IntNode dtr = mRootNode.getDtr(is, start, end);
        return dtr == null ? 0 : dtr.numExtensions();
    }

    /**
     * Returns the observed integers as reported by the root node of
     * the trie.
     *
     * @return Array of integers observed by this counter.
     */
    public int[] observedIntegers() {
        return mRootNode.observedIntegers();
    }

    /**
     * Returns the integers following the specified slice, as reported
     * by the root node of the trie.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @return Array of integers following the slice.
     * @throws IndexOutOfBoundsException If the start and end minus one
     * indices do not fall within the range of the integer array.
     */
    public int[] integersFollowing(int[] is, int start, int end) {
        // Validate up front, consistently with count(),
        // extensionCount() and numExtensions().
        checkBoundaries(is, start, end);
        return mRootNode.integersFollowing(is, start, end);
    }

    /**
     * Return a string-based representation of this integer sequence
     * counter.
     *
     * @return A string-based representation of this integer sequence
     * counter.
     */
    @Override
    public String toString() {
        return mRootNode.toString(null);
    }

    /**
     * Delegates to the root node to decrement the specified symbol.
     *
     * @param symbol Symbol to decrement.
     */
    void decrementUnigram(int symbol) {
        mRootNode.decrement(symbol);
    }

    /**
     * Delegates to the root node to decrement the specified symbol
     * by the specified count.
     *
     * @param symbol Symbol to decrement.
     * @param count Amount passed to the root node's decrement.
     */
    void decrementUnigram(int symbol, int count) {
        mRootNode.decrement(symbol, count);
    }

    /**
     * Recursively enumerates the n-grams that extend the prefix held
     * in <code>buf[0..pos)</code>, passing each complete n-gram whose
     * count is at least the minimum to the handler.  The buffer itself
     * is handed to the handler and is overwritten as the enumeration
     * proceeds.
     *
     * @param minCount Minimum count of visited n-gram.
     * @param pos Number of buffer positions already filled.
     * @param nGram Length of n-grams to visit.
     * @param buf Working buffer of length <code>nGram</code>.
     * @param handler Handler for visited n-grams.
     */
    void handleNGrams(int minCount, int pos, int nGram, int[] buf,
                      ObjectHandler<? super int[]> handler) {
        if (pos == nGram) {
            if (count(buf, 0, nGram) >= minCount) {
                handler.handle(buf);
            }
            return;
        }
        // Looked up only for incomplete prefixes; a complete n-gram
        // never needs its followers.
        int[] followers = integersFollowing(buf, 0, pos);
        for (int i = 0; i < followers.length; ++i) {
            buf[pos] = followers[i];
            handleNGrams(minCount, pos + 1, nGram, buf, handler);
        }
    }

    /**
     * Recursively enumerates the n-grams that extend the prefix held
     * in <code>buf[0..pos)</code>, storing a copy of each complete
     * n-gram whose count is at least the minimum, together with that
     * count, in the specified counter map.
     *
     * @param minCount Minimum count of stored n-gram.
     * @param pos Number of buffer positions already filled.
     * @param nGram Length of n-grams to collect.
     * @param buf Working buffer of length <code>nGram</code>.
     * @param counter Map receiving the n-grams and their counts.
     */
    void addNGramCounts(int minCount, int pos, int nGram, int[] buf,
                        ObjectToCounterMap<int[]> counter) {
        if (pos == nGram) {
            int count = count(buf, 0, nGram);
            if (count >= minCount) {
                // Clone because the buffer is reused by the caller.
                counter.set(buf.clone(), count);
            }
            return;
        }
        // Looked up only for incomplete prefixes; a complete n-gram
        // never needs its followers.
        int[] followers = integersFollowing(buf, 0, pos);
        for (int i = 0; i < followers.length; ++i) {
            buf[pos] = followers[i];
            addNGramCounts(minCount, pos + 1, nGram, buf, counter);
        }
    }

    /**
     * Throws an exception unless
     * <code>0 &lt;= start &lt;= end &lt;= is.length</code>.
     *
     * @param is Underlying array of integers.
     * @param start Index of first integer in the slice.
     * @param end Index of one past the last integer in the slice.
     * @throws IndexOutOfBoundsException If the start is negative, the
     * end is past the end of the array, or the end is before the
     * start.
     */
    static void checkBoundaries(int[] is, int start, int end) {
        if (start < 0) {
            String msg = "Start must be in array range."
                + " Found start=" + start;
            throw new IndexOutOfBoundsException(msg);
        }
        if (end > is.length) {
            String msg = "End must be in array range."
                + " Found end=" + end
                + " Length=" + is.length;
            throw new IndexOutOfBoundsException(msg);
        }
        if (end < start) {
            String msg = "End must be at or after start."
                + " Found start=" + start
                + " Found end=" + end;
            throw new IndexOutOfBoundsException(msg);
        }
    }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy