// com.datastax.insight.ml.spark.mllib.feature.TFIDF (Maven / Gradle / Ivy)
package com.datastax.insight.ml.spark.mllib.feature;
import com.datastax.insight.spec.RDDOperator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.feature.IDF;
import org.apache.spark.mllib.linalg.Vector;
import java.util.List;
public class TFIDF implements RDDOperator {
public static JavaRDD transform(JavaRDD> data, int numFeatures, int minDocFreq){
HashingTF hashingTF=null;
if(numFeatures>0){
hashingTF=new HashingTF(numFeatures);
}else {
hashingTF=new HashingTF();
}
JavaRDD vData= hashingTF.transform(data);
IDF idf=new IDF(minDocFreq);
return idf.fit(vData).transform(vData);
}
}