All downloads are free. The search and download functionality uses the official Maven repository.

com.datastax.insight.ml.spark.mllib.feature.TFIDF Maven / Gradle / Ivy

package com.datastax.insight.ml.spark.mllib.feature;

import com.datastax.insight.spec.RDDOperator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.feature.IDF;
import org.apache.spark.mllib.linalg.Vector;

import java.util.List;

public class TFIDF implements RDDOperator {
    public static JavaRDD transform(JavaRDD> data, int numFeatures, int minDocFreq){
        HashingTF hashingTF=null;
        if(numFeatures>0){
            hashingTF=new HashingTF(numFeatures);
        }else {
            hashingTF=new HashingTF();
        }
        JavaRDD vData= hashingTF.transform(data);

        IDF idf=new IDF(minDocFreq);
        return idf.fit(vData).transform(vData);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy