All downloads are free. The search and download functionality uses the official Maven repository.

stream.parser.NGrams Maven / Gradle / Ivy

/*
 *  streams library
 *
 *  Copyright (C) 2011-2012 by Christian Bockermann, Hendrik Blom
 * 
 *  streams is a library, API and runtime environment for processing high
 *  volume data streams. It is composed of three submodules "stream-api",
 *  "stream-core" and "stream-runtime".
 *
 *  The streams library (and its submodules) is free software: you can 
 *  redistribute it and/or modify it under the terms of the 
 *  GNU Affero General Public License as published by the Free Software 
 *  Foundation, either version 3 of the License, or (at your option) any 
 *  later version.
 *
 *  The stream.ai library (and its submodules) is distributed in the hope
 *  that it will be useful, but WITHOUT ANY WARRANTY; without even the implied 
 *  warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package stream.parser;

import java.io.Serializable;
import java.util.LinkedHashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import stream.AbstractProcessor;
import stream.annotations.Description;
import stream.annotations.Parameter;
import stream.data.Data;

/**
 * Processor that splits the string value of one attribute into overlapping
 * character n-grams and adds the number of occurrences of each n-gram to the
 * data item.
 * <p>
 * The value stored under {@link #getKey() key} is converted with
 * <code>toString()</code>. A window of {@link #getN() n} characters (Java
 * <code>char</code> units) is moved over that string one position at a time.
 * For every distinct n-gram an attribute named <code>prefix + ngram</code> is
 * put into the item, its value being the occurrence count as a
 * <code>Double</code>.
 * <p>
 * Items are returned unchanged if no key is configured, if <code>n</code> is
 * missing or smaller than 1, or if the item has no value for the key.
 * 
 * @author chris
 * 
 */
@Description(group = "Data Stream.Processing.Transformations.Data", text = "Creates n-grams from a specified string attribute and adds the frequencies of the resulting n-grams to the data item as (key,value) pairs.")
public class NGrams extends AbstractProcessor {
	static final Logger log = LoggerFactory.getLogger(NGrams.class);
	String key = null;
	String prefix = "";
	Integer n = 3;

	/**
	 * Adds the n-gram counts of the configured attribute to the given item.
	 * 
	 * @param data
	 *            the item to process; it is modified in place
	 * @return the same item that was passed in
	 */
	@Override
	public Data process(Data data) {

		log.debug("Processing key '{}' of item {}", key, data);

		// An n-gram needs at least one character, so n < 1 is rejected
		// (this matches the wording of the warning below).
		if (key == null || n == null || n < 1) {
			log.warn("No key defined or parameter 'N' missing or smaller than 1!");
			return data;
		}

		Serializable val = data.get(key);
		if (val == null) {
			return data;
		}

		Map<String, Double> counts = countNGrams(val.toString(), n);

		// A null prefix would otherwise be concatenated as the text "null".
		String namePrefix = (prefix == null) ? "" : prefix;
		for (Map.Entry<String, Double> entry : counts.entrySet()) {
			data.put(namePrefix + entry.getKey(), entry.getValue());
		}

		log.debug("Added {} {}-grams to item", counts.size(), n);
		return data;
	}

	/**
	 * Counts all overlapping substrings of the given length.
	 * 
	 * @param str
	 *            the string to split
	 * @param length
	 *            the length of each n-gram, expected to be at least 1
	 * @return a map from n-gram to its number of occurrences, iterated in
	 *         order of first occurrence; empty if the string is shorter than
	 *         <code>length</code>
	 */
	private static Map<String, Double> countNGrams(String str, int length) {
		Map<String, Double> counts = new LinkedHashMap<String, Double>();

		// The last window starts at str.length() - length, so that index
		// has to be included.
		int lastStart = str.length() - length;
		for (int i = 0; i <= lastStart; i++) {
			String ngram = str.substring(i, i + length);
			Double freq = counts.get(ngram);
			counts.put(ngram, freq == null ? 1.0d : freq + 1.0d);
		}
		return counts;
	}

	/**
	 * @return the name of the attribute that is split into n-grams
	 */
	public String getKey() {
		return key;
	}

	/**
	 * @param key
	 *            the name of the attribute that is split into n-grams; if
	 *            <code>null</code>, items are passed through unchanged
	 */
	@Parameter(required = true, description = "The attribute which is to be split into n-grams")
	public void setKey(String key) {
		this.key = key;
	}

	/**
	 * @return the length of the n-grams
	 */
	public Integer getN() {
		return n;
	}

	/**
	 * @param n
	 *            the length of the n-grams; if <code>null</code> or smaller
	 *            than 1, items are passed through unchanged
	 */
	@Parameter(required = true, defaultValue = "3", description = "The length of the n-grams that are to be created")
	public void setN(Integer n) {
		this.n = n;
	}

	/**
	 * @return the prefix that is prepended to each n-gram to form the
	 *         attribute name
	 */
	public String getPrefix() {
		return prefix;
	}

	/**
	 * @param prefix
	 *            the prefix that is prepended to each n-gram to form the
	 *            attribute name; <code>null</code> is treated like an empty
	 *            prefix
	 */
	@Parameter(required = false, defaultValue = "", description = "An optional prefix that is to be prepended for all n-gram names before these are added to the data item")
	public void setPrefix(String prefix) {
		this.prefix = prefix;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy