All downloads are free. The search and download functionality uses the official Maven repository.

net.jkernelmachines.projection.StringNGramProjection Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2016, David Picard.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation and/or
 * other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
/**
 * 
 */
package net.jkernelmachines.projection;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Utility class that computes histograms of n-grams (contiguous substrings
 * of length {@code n}) from strings, and builds n-gram "alphabets" that map
 * each n-gram to an index in a fixed-size output vector.
 * 
 * All methods are static and keep no state.
 * 
 * @author picard
 *
 */
public class StringNGramProjection {

	/**
	 * Computes the histogram of n-grams.
	 * 
	 * Every window of {@code n} consecutive characters of the input is counted
	 * once; overlapping windows are all counted. An input shorter than
	 * {@code n} yields an empty map.
	 * 
	 * @param input
	 *            input string
	 * @param n
	 *            length of considered sub-sequences (expected to be positive)
	 * @return map with n-grams as keys and number of occurrences as values
	 */
	public static Map<String, Double> computeNGram(String input, int n) {
		Map<String, Double> histogram = new HashMap<>();
		for (int i = 0; i < input.length() - n + 1; i++) {
			String gram = input.substring(i, i + n);
			Double count = histogram.get(gram);
			histogram.put(gram, count == null ? 1.0 : count + 1.0);
		}
		return histogram;
	}

	/**
	 * Computes the histogram of n-grams, normalized so that its values sum to 1.
	 * 
	 * An input shorter than {@code n} yields an empty map.
	 * 
	 * @param input
	 *            input string
	 * @param n
	 *            length of considered sub-sequences (expected to be positive)
	 * @return map with n-grams as keys and normalized number of occurrences as
	 *         values
	 */
	public static Map<String, Double> computeNormalizedNGram(String input, int n) {
		Map<String, Double> histogram = computeNGram(input, n);

		// Total number of windows; counts are whole numbers so the sum is exact.
		double total = 0;
		for (double count : histogram.values()) {
			total += count;
		}

		// setValue updates in place without structurally modifying the map.
		for (Map.Entry<String, Double> entry : histogram.entrySet()) {
			entry.setValue(entry.getValue() / total);
		}

		return histogram;
	}
	
	/**
	 * Generates the set of all possible n-grams composed of characters taken
	 * from the input string.
	 * 
	 * Characters that appear several times in {@code alp} are considered only
	 * once, so the returned indexes always form the contiguous range
	 * {@code 0 .. size-1} expected by
	 * {@link #projectNGramAlphabet(String, Map)}.
	 * 
	 * @param alp the original alphabet
	 * @param n dimension of the tuples; values below 1 are treated as 1
	 * @return map of possible n-grams and corresponding indexes
	 */
	public static Map<String, Integer> generateNGramAlphabet(String alp, int n) {
		return buildNGramAlphabet(distinctCharacters(alp), n);
	}

	/**
	 * Returns the characters of {@code alp} with duplicates removed, keeping
	 * the order of first occurrence.
	 */
	private static String distinctCharacters(String alp) {
		StringBuilder unique = new StringBuilder();
		for (int i = 0; i < alp.length(); i++) {
			char c = alp.charAt(i);
			if (unique.indexOf(String.valueOf(c)) < 0) {
				unique.append(c);
			}
		}
		return unique.toString();
	}

	/**
	 * Recursively builds the n-gram alphabet over an alphabet that contains no
	 * duplicate characters.
	 */
	private static Map<String, Integer> buildNGramAlphabet(String alp, int n) {
		Map<String, Integer> set = new HashMap<>();

		if (n <= 1) {
			// Base case: each character is indexed by its position.
			for (int i = 0; i < alp.length(); i++) {
				set.put(String.valueOf(alp.charAt(i)), i);
			}
			return set;
		}

		// Prefix every (n-1)-gram with every character of the alphabet.
		int index = 0;
		Map<String, Integer> prev = buildNGramAlphabet(alp, n - 1);
		for (int i = 0; i < alp.length(); i++) {
			String head = String.valueOf(alp.charAt(i));
			for (String tail : prev.keySet()) {
				set.put(head + tail, index++);
			}
		}
		return set;
	}
	
	/**
	 * Computes the histogram of occurrences of the n-grams given in the
	 * alphabet.
	 * 
	 * All n-grams of the alphabet are assumed to have the same length, and the
	 * indexes are assumed to lie in {@code 0 .. m.size()-1}. N-grams of the
	 * string that are not in the alphabet are ignored.
	 * 
	 * @param s string to compute the histogram of
	 * @param m alphabet of n-grams and corresponding indexes in the output vector
	 * @return histogram, of length {@code m.size()} (empty if the alphabet is empty)
	 */
	public static double[] projectNGramAlphabet(String s, Map<String, Integer> m) {
		double[] res = new double[m.size()];
		if (m.isEmpty()) {
			// Nothing to project on; also avoids calling next() on an empty iterator.
			return res;
		}
		
		// assume all n-grams have the same size
		int n = m.keySet().iterator().next().length();
		
		// Inclusive bound so that the last n-gram of the string is counted too.
		for (int i = 0; i + n <= s.length(); i++) {
			Integer index = m.get(s.substring(i, i + n));
			if (index != null) {
				res[index]++;
			}
		}
		return res;
	}

	/**
	 * Sums the n-gram histograms of all strings of the list.
	 * 
	 * @return map with n-grams as keys and total number of occurrences as
	 *         values; empty if the list is empty
	 */
	private static Map<String, Double> accumulateNGrams(List<String> l, int n) {
		if (l.isEmpty()) {
			return new HashMap<String, Double>();
		}

		Map<String, Double> occ = computeNGram(l.get(0), n);
		for (int i = 1; i < l.size(); i++) {
			for (Map.Entry<String, Double> entry : computeNGram(l.get(i), n).entrySet()) {
				Double seen = occ.get(entry.getKey());
				occ.put(entry.getKey(), seen == null ? entry.getValue() : seen + entry.getValue());
			}
		}
		return occ;
	}
	
	/**
	 * Generates the alphabet of n-grams that have a minimal number of
	 * occurrences in a list of strings.
	 * 
	 * @param l the list of strings
	 * @param n n-gram
	 * @param thresh minimum number of occurrences of the n-grams in the list
	 * @return map of the n-grams occurring at least {@code thresh} times and
	 *         corresponding indexes (contiguous from 0); empty if the list is
	 *         empty
	 */
	public static Map<String, Integer> generateMinimumNGramAlphabet(List<String> l, int n, int thresh) {
		Map<String, Double> occ = accumulateNGrams(l, n);
		
		int index = 0;
		Map<String, Integer> map = new HashMap<>();
		for (Map.Entry<String, Double> entry : occ.entrySet()) {
			if (entry.getValue() >= thresh) {
				map.put(entry.getKey(), index++);
			}
		}
		
		return map;
	}
	
	/**
	 * Generates the alphabet of the most frequent n-grams in a list of strings.
	 * 
	 * Exactly {@code min(nb, number of distinct n-grams)} entries are returned.
	 * When several n-grams are tied at the cut-off count, which of them are
	 * kept follows the iteration order of the underlying map.
	 * 
	 * @param l the list
	 * @param n n-gram
	 * @param nb number of most frequent n-grams
	 * @return map of the {@code nb} most frequent n-grams and corresponding
	 *         indexes (contiguous from 0); empty if the list is empty or
	 *         {@code nb} is not positive
	 */
	public static Map<String, Integer> generateMostFrequentNGramAlphabet(List<String> l , int n, int nb) {
		Map<String, Integer> map = new HashMap<>();
		if (nb <= 0) {
			return map;
		}

		Map<String, Double> occ = accumulateNGrams(l, n);
		
		// With at most nb distinct n-grams everything is kept (all counts are > 0).
		double cutoff = 0;
		int tiesAllowed = 0;
		if (occ.size() > nb) {
			List<Double> sorted = new ArrayList<>(occ.values());
			Collections.sort(sorted);
		
			// Count of the nb-th most frequent n-gram.
			cutoff = sorted.get(sorted.size() - nb);

			// N-grams strictly above the cut-off are always kept; the remaining
			// slots are filled with n-grams tied at the cut-off.
			int above = 0;
			for (int i = sorted.size() - 1; i >= 0 && sorted.get(i) > cutoff; i--) {
				above++;
			}
			tiesAllowed = nb - above;
		}
	
		int index = 0;
		for (Map.Entry<String, Double> entry : occ.entrySet()) {
			double count = entry.getValue();
			if (count > cutoff) {
				map.put(entry.getKey(), index++);
			}
			else if (count == cutoff && tiesAllowed > 0) {
				map.put(entry.getKey(), index++);
				tiesAllowed--;
			}
		}
		
		return map;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy