com.github.chen0040.sparkml.recommender.ItemCorrelationRecommender Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-ml-recommender Show documentation
Show all versions of spark-ml-recommender Show documentation
Recommender algorithms implemented in Java and for Spark
package com.github.chen0040.sparkml.recommender;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;
import scala.Tuple3;
import scala.Tuple7;
import java.util.ArrayList;
import java.util.List;
/**
* Created by xschen on 5/6/2017.
*/
public class ItemCorrelationRecommender {
public JavaRDD fitAndTransform(JavaRDD cells){
JavaPairRDD> rdd2 = cells.mapToPair(cell -> {
String user = cell.getUser();
String movie = cell.getItem();
double rating = cell.getValue();
return new Tuple2<>(user, new Tuple2<>(movie, rating));
});
JavaPairRDD>> rdd3 = rdd2.groupByKey();
JavaPairRDD>> rdd4 = rdd3.mapValues(s -> {
List> values = new ArrayList<>();
for(Tuple2 entry : s) {
values.add(entry);
}
int count = values.size(); // number of ratings given by a user
List> result = new ArrayList<>();
for(Tuple2 entry : values) {
result.add(new Tuple3<>(entry._1(), entry._2(), count));
}
return result;
});
JavaPairRDD, Tuple7> rdd5 = rdd4.flatMapToPair(s -> {
String user = s._1();
Iterable> user_ratings = s._2();
List> values = new ArrayList<>();
for(Tuple3 entry : user_ratings) {
values.add(entry);
}
List, Tuple7>> result = new ArrayList<>();
for(int i=0; i < values.size()-1; ++i) {
Tuple3 user_rating_i = values.get(i);
String movie1 = user_rating_i._1();
double rating1 = user_rating_i._2();
int numRater1 = user_rating_i._3();
for(int j=i+1; j < values.size(); ++j) {
Tuple3 user_rating_j = values.get(j);
String movie2 = user_rating_j._1();
double rating2 = user_rating_j._2();
int numRater2 = user_rating_j._3();
double ratingProd = rating1 * rating2;
double ratingSqr1 = rating1 * rating1;
double ratingSqr2 = rating2 * rating2;
result.add(new Tuple2<>(new Tuple2<>(movie1, movie2),
new Tuple7<>(rating1, rating2, numRater1, numRater2, ratingProd, ratingSqr1, ratingSqr2)));
}
}
return result;
});
JavaPairRDD, Iterable>> rdd6 = rdd5.groupByKey();
JavaPairRDD, Iterable>> rdd7 = rdd6.filter(s -> {
Tuple2 moviePair = s._1();
return moviePair._1().compareTo(moviePair._2()) < 0;
});
return rdd7.map(t -> {
Iterable> s = t._2();
Tuple2 movie_pair = t._1();
double sumRating1 = 0;
double sumRating2 = 0;
double sumSqrRating1 = 0;
double sumSqrRating2 = 0;
double sumRatingProd = 0;
int N = 0;
int maxNumRaters1 = 0;
int maxNumRaters2 = 0;
for(Tuple7 entry : s) {
sumRating1 += entry._1();
sumRating2 += entry._2();
maxNumRaters1 = Math.max(maxNumRaters1, entry._3());
maxNumRaters2 = Math.max(maxNumRaters2, entry._4());
sumRatingProd += entry._5();
sumSqrRating1 += entry._6();
sumSqrRating2 += entry._7();
N++;
}
double pearson = (N * sumRatingProd - sumRating1 * sumRating2) / (Math.sqrt(N * sumSqrRating1 - sumRating1 * sumRating1) * Math.sqrt(N * sumSqrRating2 - sumRating2 * sumRating2));
double jaccard = (double)N / (maxNumRaters1 + maxNumRaters2 - N);
double cosine = sumRatingProd / (Math.sqrt(sumSqrRating1) * Math.sqrt(sumSqrRating2));
ItemCorrelation result = new ItemCorrelation();
result.setCosine(cosine);
result.setPearson(pearson);
result.setJaccard(jaccard);
result.setItem1(movie_pair._1());
result.setItem2(movie_pair._2());
return result;
});
}
}