All downloads are FREE. The search and download features use the official Maven repository.

scripts.utils.dataprep.dml Maven / Gradle / Ivy

There is a newer version: 1.2.0
Show newest version
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Utility functions for common data preparation tasks

/*
 * convertToRatingsMatrix: builds an n x m ratings matrix from a frame of
 * (user, product, rating) triples, where n is the number of unique users
 * and m is the number of unique products. User and product names may be
 * alphanumeric; ratings are numeric.
 *
 * Inputs:
 *  - R: ratings in triple representation (user, product, rating).
 *  - thrU: minimum number of ratings per user (<=0 for unconstrained).
 *  - thrP: minimum number of ratings per product (<=0 for unconstrained).
 *
 * Outputs:
 *  - X: ratings matrix, after the optional user/product filtering.
 *  - M: encoding meta data (for user and product name reconstruction),
 *       returned exactly as produced by transformencode; it is not
 *       adjusted for rows/columns dropped by the thresholds.
 *
 * Both thresholds are evaluated against the unfiltered ratings matrix, so
 * removing users does not change the per-product counts used for thrP.
 *
 * Example: amazon product ratings (http://jmcauley.ucsd.edu/data/amazon/)
 *   F = read("tmp/ratings_books.csv", data_type="frame", format="csv");
 *   [X, M] = convertToRatingsMatrix(F[, 1:3], 5, 5);
 */
convertToRatingsMatrix = function(frame[string] R, int thrU, int thrP)
  return (matrix[double] X, frame[string] M)
{
  # map user names (column 1) and product names (column 2) to numeric ids
  recodeSpec = "{ids:true, recode:[1,2]}";
  [triples, M] = transformencode(target=R, spec=recodeSpec);

  # lay the triples out as a matrix: rows = user ids, columns = product ids,
  # cell values taken from the rating column
  userIds = triples[,1];
  productIds = triples[,2];
  ratings = triples[,3];
  allRatings = table(userIds, productIds, ratings);

  # start from the full matrix and drop sparse users/products on request;
  # the counts always come from the full matrix, not the partially filtered X
  X = allRatings;
  if( thrU > 0 ) {
    perUser = rowSums(allRatings != 0);
    X = removeEmpty(target=X, margin="rows", select=(perUser >= thrU));
  }
  if( thrP > 0 ) {
    perProduct = colSums(allRatings != 0);
    X = removeEmpty(target=X, margin="cols", select=(perProduct >= thrP));
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy