// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only 1 $.
// You can buy this project and download/modify it as often as you want.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf.generic;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Collections;
import java.util.Iterator;
import java.util.Comparator;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* A generic, re-usable n-gram estimation class that supports partial aggregations.
* The algorithm is based on the heuristic from the following paper:
* Yael Ben-Haim and Elad Tom-Tov, "A streaming parallel decision tree algorithm",
* J. Machine Learning Research 11 (2010), pp. 849--872.
*
* In particular, it is guaranteed that frequencies will be under-counted. With large
* data and a reasonable precision factor, this undercounting appears to be on the order
* of 5%.
*/
public class NGramEstimator {
/* Class private variables */
// Number of top n-grams to report (the 'k' in "top-k"); 0 means not yet initialized.
private int k;
// Precision factor; initialize() raises it when k * pf would fall below 1000.
private int pf;
// The 'n' for n-grams; 0 until set.
private int n;
// Maps each n-gram to a Double value (presumably its estimated frequency).
// NOTE(review): the generic type arguments were lost in this copy of the file; the key
// type ArrayList<String> is reconstructed -- confirm against the rest of the class.
private HashMap<ArrayList<String>, Double> ngrams;
/**
 * Creates a new n-gram estimator object. The 'n' for n-grams is computed dynamically
 * when data is fed to the object.
 *
 * All numeric parameters start at 0 and the n-gram map starts empty, so
 * isInitialized() returns false for a freshly constructed object.
 */
public NGramEstimator() {
  k = 0;
  pf = 0;
  n = 0;
  ngrams = new HashMap<ArrayList<String>, Double>();
}
/**
 * Reports whether this estimator has been initialized.
 *
 * Only 'k' is inspected: it is 0 after construction and after reset(), so any
 * non-zero value is taken to mean the parameters have been set.
 *
 * @return true if 'k' is non-zero, false otherwise
 */
public boolean isInitialized() {
  final boolean configured = k != 0;
  return configured;
}
/**
 * Sets the 'k', 'pf' and 'n' parameters.
 *
 * If the product k * pf is below 1000, the precision factor is raised to 1000 / k
 * (integer division) so that a minimum precision is always enforced.
 *
 * @param pk  value for 'k' (the "top-k" count); must be positive
 * @param ppf value for the precision factor 'pf'; must be positive
 * @param pn  value for the 'n' of n-grams; must be positive
 * @throws HiveException if any of the arguments is zero or negative
 */
public void initialize(int pk, int ppf, int pn) throws HiveException {
  // Validate explicitly instead of using 'assert': assertions are skipped unless the
  // JVM is started with -ea, and a zero k would otherwise hit a division by zero below.
  if (pk <= 0 || ppf <= 0 || pn <= 0) {
    throw new HiveException(getClass().getSimpleName()
        + ": parameters must be positive (k=" + pk + ", pf=" + ppf + ", n=" + pn + ")");
  }
  k = pk;
  pf = ppf;
  n = pn;
  // enforce a minimum precision factor; the product is widened to long so that large
  // k and pf values cannot overflow int and be mistaken for a small product
  if ((long) k * pf < 1000) {
    pf = 1000 / k;
  }
}
/**
 * Returns this estimator to the state it had right after construction: the
 * 'k', 'pf' and 'n' parameters are zeroed and the n-gram map is emptied.
 */
public void reset() {
  // zero every parameter; k == 0 makes isInitialized() report false again
  k = 0;
  pf = 0;
  n = 0;
  // discard all entries held in the n-gram map
  ngrams.clear();
}
/**
* Returns the final top-k n-grams in a format suitable for returning to Hive.
*/
public ArrayList