scripts.algorithms.ALS_topk_predict.dml Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#  
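# THIS SCRIPT COMPUTES, FOR EACH USER-ID IN A GIVEN LIST, THE TOP-K ITEMS WITH THE HIGHEST PREDICTED RATINGS USING 2 FACTOR MATRICES L AND R.
# ITEMS THAT THE USER HAS ALREADY RATED IN V ARE EXCLUDED FROM THE RECOMMENDATIONS.
# WE ASSUME THAT ALL USERS HAVE RATED AT LEAST ONCE AND ALL ITEMS HAVE BEEN RATED AT LEAST ONCE.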
# INPUT   PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME    TYPE     DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# X       String   ---      Location to read the input user-ids list
# Y	 	  String   ---	    Location to write the output of top-K prediction: 
#							 - top-K item-ids will be stored at Y
#							 - the corresponding top-K ratings will be stored at Y+".ratings" 
# L       String   ---      Location of factor matrix L: user-id x feature-id 
# R       String   ---      Location of factor matrix R: feature-id x item-id
# V	  	  String   ---      Location of original matrix V: user-id x item-id
# K	  	  Int      5	    The number of top-K items	
# fmt     String   "text"   The output format of the top-K item-id and rating matrices, e.g. "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT: 
# 1- A matrix containing the top-K item-ids with highest predicted ratings for the users specified in the input matrix X  
# 2- A matrix containing the top-K predicted ratings for the users specified in the input matrix X  
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar system-ml.jar -f ALS_topk_predict.dml -nvargs X=INPUT_DIR/X L=INPUT_DIR/L R=INPUT_DIR/R V=INPUT_DIR/V.mtx 
#													Y=OUTPUT_DIR/Y K=5 fmt=csv

# ---- script arguments ------------------------------------------------------
# $X, $Y, $L, $R and $V are file locations without defaults; $K and $fmt fall
# back to 5 and "text" when they are not supplied.
fileX      = $X;
fileY 	   = $Y;
fileL	   = $L;
fileR      = $R;
fileV	   = $V;
K	  	   = ifdef ($K, 5);
fmtO       = ifdef ($fmt, "text");    # $fmt="text";

# ---- inputs ----------------------------------------------------------------
X = read (fileX);   # requested user-ids (only the first column is used)
L = read (fileL);   # factor matrix L: user-id x feature-id
R = read (fileR);   # factor matrix R: feature-id x item-id
V = read (fileV);   # original matrix V: user-id x item-id

Vrows = nrow(V);
Vcols = ncol(V);

# Items whose column in R is entirely zero cannot be ranked, so K is capped at
# the number of items that have at least one non-zero factor.
zero_cols_ind = (colSums (R != 0)) == 0;
K = min (Vcols - sum (zero_cols_ind), K);

n = nrow(X);

Lrows = nrow(L);
Rcols = ncol(R);

X_user_max = max(X[,1]);
X_user_min = min(X[,1]);

# ---- validation ------------------------------------------------------------
# User-ids are used as row positions of L and V, so they must lie in 1..Vrows.
if (X_user_max > Vrows) {
	stop ("Predictions cannot be provided. Maximum user-id exceeds the number of rows of V.");
}
if (X_user_min < 1) {
	stop ("Predictions cannot be provided. Minimum user-id is smaller than 1.");
}
if (Lrows != Vrows | Rcols !=  Vcols) {
	stop ("Predictions cannot be provided. Number of rows of L (columns of R) does not match the number of rows (column) of V.");
}
# K sizes the result matrices and bounds the selection loop further down, so a
# value below 1 (requested explicitly, or caused by an all-zero R) is rejected here.
if (K < 1) {
	stop ("Predictions cannot be provided. The number of top-K items is smaller than 1 (K < 1 was requested or R has no non-zero columns).");
}


# Build a 0/1 selection matrix of size n x Lrows: row r carries a single 1 in
# the column given by the r-th requested user-id.
request_ids = seq (1, n);
user_selector = table (request_ids, X[,1], n, Lrows);

# Factor rows of the requested users.
selected_factors = user_selector %*% L;

# Predicted scores of every item for the requested users.
predicted_scores = selected_factors %*% R;

# Rows of the original matrix V for the requested users; an entry equal to 0
# marks an item the user has no rating for.
selected_ratings = user_selector %*% V;
unrated_mask = (selected_ratings == 0);

# Zero out the scores of items that already have a rating in V.
V_filter = unrated_mask * predicted_scores;


# Result holders: one row per requested user, one column per rank 1..K.
V_top_values = matrix (0, rows = nrow (V_filter), cols = K);
V_top_indices = matrix (0, rows = nrow (V_filter), cols = K);

# Offset that is strictly larger than the spread of all scores; subtracting it
# from an entry pushes that entry below every score that has not been picked yet.
mask_offset = max (V_filter) - min (V_filter) + 1;

# Row positions 1..nrow(V_filter), reused in every pass to address the picked cells.
row_ids = seq (1, nrow (V_filter), 1);

# K passes: record the current per-row maximum, then mask it out so the next
# pass finds the next-best item.
for (i in 1:K) {
	best_item = rowIndexMax (V_filter);
	best_score = rowMaxs (V_filter);
	V_top_values[,i] = best_score;
	V_top_indices[,i] = best_item;
	picked_cells = table (row_ids, best_item, nrow (V_filter), ncol (V_filter));
	V_filter = V_filter - mask_offset * picked_cells;
}

# Item-ids whose recorded score is not strictly positive are reported as 0.
V_top_indices = V_top_indices * (V_top_values > 0);

# Prefix both result matrices with the requested user-ids as their first column.
user_ids = X[,1];
V_top_indices = append (user_ids, V_top_indices);
V_top_values = append (user_ids, V_top_values);

# Persist the results: item-ids go to Y, the matching ratings to Y + ".ratings".
write (V_top_indices, fileY, format = fmtO);
write (V_top_values, fileY + ".ratings", format = fmtO);