All downloads are free. The search and download functions use the official Maven repository.

de.julielab.jpos.pipes.NGramGenerator Maven / Gradle / Ivy

There is a newer version: 2.3.3
Show newest version
/**
 * NGramGenerator.java
 *
 * Copyright (c) 2015, JULIE Lab.
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the GNU Lesser General Public License (LGPL) v3.0
 *
 * Author: tomanek
 *
 * Current version: 2.3
 * Since version:   2.2
 *
 * Creation date: Feb 27, 2008
 *
 * generates different kinds of ngrams
 **/

package de.julielab.jpos.pipes;

import java.util.ArrayList;

public class NGramGenerator {

	/**
	 * Generates the token n-grams of every size listed in {@code ngramSizes}
	 * that contain the token at {@code currPos}.
	 * <p>
	 * The result is the concatenation of the single-size results, in the
	 * order in which the sizes appear in {@code ngramSizes}.
	 *
	 * @param tokens
	 *            tokens of the sentence
	 * @param currPos
	 *            the current position relative to which the n-grams are to
	 *            be built
	 * @param ngramSizes
	 *            the sizes of the n-grams to generate
	 * @return all generated n-grams, each with its tokens joined by a single
	 *         space; never {@code null}, and empty when {@code currPos} lies
	 *         beyond the last token
	 */
	public ArrayList<String> generateTokenNGrams(final String[] tokens,
			final int currPos, final int[] ngramSizes) {
		final ArrayList<String> allNGrams = new ArrayList<String>();
		for (final int ngramSize : ngramSizes) {
			final ArrayList<String> ngrams = generateTokenNGrams(tokens,
					currPos, ngramSize);
			// The single-size overload reports an out-of-range position with
			// null; passing that to addAll would throw a NullPointerException.
			if (ngrams != null) {
				allNGrams.addAll(ngrams);
			}
		}
		return allNGrams;
	}

	/**
	 * Generates all n-grams of size {@code ngramSize} that contain the token
	 * at {@code currPos} and fit completely inside {@code tokens}.
	 *
	 * @param tokens
	 *            tokens of the sentence
	 * @param currPos
	 *            the current position relative to which the n-grams are to
	 *            be built
	 * @param ngramSize
	 *            the size of the n-grams
	 * @return the n-grams ordered by start position, each with its tokens
	 *         joined by a single space and trimmed; {@code null} if
	 *         {@code currPos} lies beyond the last token
	 */
	public ArrayList<String> generateTokenNGrams(final String[] tokens,
			final int currPos, final int ngramSize) {

		if (currPos > (tokens.length - 1)) {
			return null;
		}

		// An n-gram covering currPos starts no earlier than
		// currPos - ngramSize + 1 and no later than currPos itself.
		final int minStart = Math.max(0, (currPos - ngramSize) + 1);
		final int maxStart = Math.min(currPos, tokens.length - 1);

		final ArrayList<String> ngrams = new ArrayList<String>();

		for (int i = minStart; i <= maxStart; i++) {
			// Skip start positions whose n-gram would run past the last token.
			if ((i + ngramSize) > tokens.length) {
				continue;
			}
			final StringBuilder ngram = new StringBuilder();
			for (int j = 0; j < ngramSize; j++) {
				if (j > 0) {
					ngram.append(' ');
				}
				ngram.append(tokens[i + j]);
			}
			// trim() is kept so that whitespace at the outer edges of the
			// first and last token is still removed.
			ngrams.add(ngram.toString().trim());
		}
		return ngrams;
	}

	/**
	 * Small demo: prints the 2-, 3- and 4-grams around position 2 of a
	 * six-token sentence.
	 *
	 * @param args
	 *            ignored
	 */
	public static void main(final String[] args) {
		final String[] tokens = new String[] { "0", "1", "2", "3", "4", "5" };
		System.out.println((new NGramGenerator()).generateTokenNGrams(tokens,
				2, new int[] { 2, 3, 4 }));
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy