#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# K-means scoring script.  Given a data matrix X and cluster centroids C, it
# assigns each record (row) of X to its nearest centroid, producing the
# "predicted" clustering prY, and reports sum-of-squares statistics.  Given a
# "specified" clustering spY, it compares spY against prY by counting the
# matching and nonmatching same-cluster/different-cluster pairs of rows.
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------
# NAME  TYPE   DEFAULT  MEANING
# ---------------------------------------------------------------------------
# spY   String  " "     Location to read a column-vector with the "specified"
#                       assignment of records (rows) to categories (clusters)
# prY   String  " "     Location to read (or write, if X and C are present) a
#                       column-vector with the "predicted" assignment of rows
#                       to clusters.  NOTE: The same category may be labeled
#                       differently in each of the two vectors, spY and prY.
# fmt   String "text"   Matrix output format for prY, usually "text" or "csv"
# X     String  " "     Location to read matrix X with the input data records
# C     String  " "     Location to read matrix C with the cluster centroids
#                       NOTE: If X and C are present, prY is an output file.
# O     String  " "     Location to write the printed output statistics
# ---------------------------------------------------------------------------
#
# The "O" file provides the output statistics in CSV format, one statistic per
# line, as: NAME, [CID], VALUE.  Note:
#   - The 1st group statistics are given if X input is available;
#   - The 2nd group statistics are given if X and C inputs are available;
#   - The 3rd and 4th group statistics are given if spY input is available;
#   - Only the 4th group statistics contain a nonempty CID value;
#   - When present, CID contains either the specified category label or the
#     predicted cluster label.
#
# NAME            CID   MEANING
# ---------------------------------------------------------------------------
# TSS                   Total Sum of Squares (from the total mean)
# WCSS_M                Within-Cluster  Sum of Squares (means as centers)
# WCSS_M_PC             Within-Cluster  Sum of Squares (means), in % of TSS
# BCSS_M                Between-Cluster Sum of Squares (means as centers)
# BCSS_M_PC             Between-Cluster Sum of Squares (means), in % of TSS
#
# WCSS_C                Within-Cluster  Sum of Squares (centroids as centers)
# WCSS_C_PC             Within-Cluster  Sum of Squares (centroids), % of TSS
# BCSS_C                Between-Cluster Sum of Squares (centroids as centers)
# BCSS_C_PC             Between-Cluster Sum of Squares (centroids), % of TSS
# 
# TRUE_SAME_CT          Same-category pairs predicted as Same-cluster, count
# TRUE_SAME_PC          Same-category pairs predicted as Same-cluster, %
# TRUE_DIFF_CT          Diff-category pairs predicted as Diff-cluster, count
# TRUE_DIFF_PC          Diff-category pairs predicted as Diff-cluster, %
# FALSE_SAME_CT         Diff-category pairs predicted as Same-cluster, count
# FALSE_SAME_PC         Diff-category pairs predicted as Same-cluster, %
# FALSE_DIFF_CT         Same-category pairs predicted as Diff-cluster, count
# FALSE_DIFF_PC         Same-category pairs predicted as Diff-cluster, %
# 
# SPEC_TO_PRED     +    For specified category, the best predicted cluster id
# SPEC_FULL_CT     +    For specified category, its full count
# SPEC_MATCH_CT    +    For specified category, best-cluster matching count
# SPEC_MATCH_PC    +    For specified category, % of matching to full count
# PRED_TO_SPEC     +    For predicted cluster, the best specified category id
# PRED_FULL_CT     +    For predicted cluster, its full count
# PRED_MATCH_CT    +    For predicted cluster, best-category matching count
# PRED_MATCH_PC    +    For predicted cluster, % of matching to full count
# ---------------------------------------------------------------------------
#
# Examples:
# 1. To predict Y given X and C:
#     hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
#         C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats
# 2. To compare "actual" labels spY with "predicted" labels given X and C:
#     hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
#         C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats
# 3. To compare "actual" labels spY with given "predicted" labels prY:
#     hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y
#         prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats


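# Read the named script arguments; ifdef substitutes the documented default
# when an argument is not provided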
fmt_prY = ifdef ($fmt, "text");
filePrY = ifdef ($prY, " ");
fileSpY = ifdef ($spY, " ");
fileX   = ifdef ($X, " ");
fileC   = ifdef ($C, " ");
fileO   = ifdef ($O, " ");

is_str_empty = TRUE;
str = " ";

print ("BEGIN K-MEANS SCORING SCRIPT");

if (fileX != " ") {
    print ("Reading X...");
    X = read (fileX);
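    # Grand mean over all records and the total sum of squares (TSS) around it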
    total_mean = colSums (X) / nrow (X);
    total_ss = sum( (X - total_mean)^2 );
}

if ((fileC != " ") & (fileX == " ")) {
    print ("ERROR: Cannot provide C without providing X.");
} else {


if (fileC != " ") {
    print ("Reading C...");
    C = read (fileC);
    num_clusters = nrow (C);
    ones_C = matrix (1, rows = num_clusters, cols = 1);
    print ("Computing the predicted Y...");
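    # Squared distance from each record to each centroid, up to the per-record
    # constant rowSums(X^2) that does not affect the argmin; rowIndexMin then
    # assigns every record to its nearest centroid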
    D =  -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
    prY = rowIndexMin (D);
    if (filePrY != " ") {
        print ("Writing the predicted Y...");
        write (prY, filePrY, format=fmt_prY);
    }
} else {
    print ("Reading the predicted Y...");
    prY = read (filePrY);
    num_clusters = max (prY);
    ones_C = matrix (1, rows = num_clusters, cols = 1);
}

if (fileX != " ") {
    print ("Computing the WCSS...");
    # Compute projection matrix from clusters to records
    P = matrix (0, rows = nrow (X), cols = num_clusters);
    P [, 1 : max (prY)] = table (seq (1, nrow (X), 1), prY);
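    # P[i, j] = 1 iff record i is assigned to cluster j; columns for cluster ids
    # above max(prY) remain all-zero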
    # Compute the means, as opposed to the centroids
    cluster_sizes = t(colSums (P));
    record_of_ones = matrix (1, rows = 1, cols = ncol (X));
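    # Per-cluster means; the (cluster_sizes == 0) term prevents division by zero
    # for empty clusters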
    M = (t(P) %*% X) / ((cluster_sizes + (cluster_sizes == 0)) %*% record_of_ones);
    # Compute the WCSS for the means
    wcss_means = sum ((X - P %*% M) ^ 2);
    wcss_means_pc = 100.0 * wcss_means / total_ss;
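    # Between-cluster sum of squares for the means: the sum over clusters of
    # cluster size times the squared distance from the cluster mean to the grand mean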
    bcss_means = sum (cluster_sizes * rowSums ((M - ones_C %*% total_mean) ^ 2));
    bcss_means_pc = 100.0 * bcss_means / total_ss;
    # Output results
    print ("Total Sum of Squares (TSS) = " + total_ss);
    print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means);
    print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means);
    str = "TSS,," + total_ss;
    str = append (str, "WCSS_M,," + wcss_means);
    str = append (str, "WCSS_M_PC,," + wcss_means_pc);
    str = append (str, "BCSS_M,," + bcss_means);
    str = append (str, "BCSS_M_PC,," + bcss_means_pc);
    is_str_empty = FALSE;
}

if (fileC != " ") {        
    # Compute the WCSS for the centroids
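    # The given centroids need not coincide with the cluster means, so WCSS_C is
    # at least WCSS_M, and the centroid-based sums need not add up to TSS exactly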
    wcss_centroids = sum ((X - P %*% C) ^ 2);
    wcss_centroids_pc = 100.0 * wcss_centroids / total_ss;
    bcss_centroids = sum (cluster_sizes * rowSums ((C - ones_C %*% total_mean) ^ 2));
    bcss_centroids_pc = 100.0 * bcss_centroids / total_ss;
    # Output results
    print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids);
    print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids);
    str = append (str, "WCSS_C,," + wcss_centroids);
    str = append (str, "WCSS_C_PC,," + wcss_centroids_pc);
    str = append (str, "BCSS_C,," + bcss_centroids);
    str = append (str, "BCSS_C_PC,," + bcss_centroids_pc);
}



if (fileSpY != " ") {

print ("Reading the specified Y...");
spY = read (fileSpY);
num_records = nrow (spY);
    
if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) {
    print ("ERROR: spY and/or prY size mismatch");
    print ("nrow (spY) = " + nrow (spY) + ";  ncol (spY) = " + ncol (spY)
      + ";  nrow (prY) = " + nrow (prY) + ";  ncol (prY) = " + ncol (prY));
} else {

    print ("Computing the pairs counts...");

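    # Shift both label vectors so that the smallest label becomes 1 before
    # building the contingency table; the original offsets are restored further
    # below when reporting the category and cluster ids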
    orig_min_spY = min (spY);
    orig_min_prY = min (prY);
    spY = spY + (1 - orig_min_spY);
    prY = prY + (1 - orig_min_prY);
    
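    # Contingency table: spYprY_row_counts[i, j] = number of records with
    # (shifted) specified label i and predicted label j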
    spYprY_row_counts = table (spY, prY);
    spY_row_counts = rowSums (spYprY_row_counts);
    prY_row_counts = t(colSums (spYprY_row_counts));

    # Count all pairs of rows having the same (spY, prY)-values
    spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2;

    # Count all pairs of rows having the same spY-values
    spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2;
    # Count all pairs of rows having the same prY-values
    prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2;

    num_pairs = num_records * (num_records - 1.0) / 2.0;

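    # Pair-level confusion counts (the ingredients of the Rand index):
    #   TP: same category, same cluster;       FP: different category, same cluster;
    #   FN: same category, different cluster;  TN: different category, different cluster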
    num_TP_pairs = sum (spYprY_pair_counts);
    num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs;
    num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs;
    num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs;
    
    pct_TP_pairs = num_TP_pairs / num_pairs * 100.0;
    pct_TN_pairs = num_TN_pairs / num_pairs * 100.0;
    pct_FP_pairs = num_FP_pairs / num_pairs * 100.0;
    pct_FN_pairs = num_FN_pairs / num_pairs * 100.0;
    
    if (is_str_empty) {
        str = "TRUE_SAME_CT,," + num_TP_pairs;
        is_str_empty = FALSE;
    } else {
        str = append (str, "TRUE_SAME_CT,," + num_TP_pairs);
    } 
    str = append (str, "TRUE_SAME_PC,,"  + pct_TP_pairs);
    str = append (str, "TRUE_DIFF_CT,,"  + num_TN_pairs);
    str = append (str, "TRUE_DIFF_PC,,"  + pct_TN_pairs);
    str = append (str, "FALSE_SAME_CT,," + num_FP_pairs);
    str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs);
    str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs);
    str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs);
    
    pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0;
    pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0;
    pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0;
    pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0;
    
    space_TP = "";  if (pct_TP_pairs < 100) {space_TP = " ";}  if (pct_TP_pairs < 10) {space_TP = "  ";}
    space_TN = "";  if (pct_TN_pairs < 100) {space_TN = " ";}  if (pct_TN_pairs < 10) {space_TN = "  ";}
    space_FP = "";  if (pct_FP_pairs < 100) {space_FP = " ";}  if (pct_FP_pairs < 10) {space_FP = "  ";}
    space_FN = "";  if (pct_FN_pairs < 100) {space_FN = " ";}  if (pct_FN_pairs < 10) {space_FN = "  ";}

    print ("Same-category pairs predicted as Same-cluster ( True Pos): " + space_TP
        + pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")");
    print ("Diff-category pairs predicted as Diff-cluster ( True Neg): " + space_TN
        + pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")");
    print ("Diff-category pairs predicted as Same-cluster (False Pos): " + space_FP
        + pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")");
    print ("Same-category pairs predicted as Diff-cluster (False Neg): " + space_FN
        + pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")");
        
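    # For each specified category, find the predicted cluster that covers the
    # largest share of its records (and, below, vice versa for each cluster)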
    [spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] =
        get_best_assignments (spYprY_row_counts);
        
    print (" ");
    print ("Specified Categories versus Predicted Clusters:");
    
    spY_cids = spY_cids + orig_min_spY - 1;
    prY_cids = prY_cids + orig_min_prY - 1;
    
    for (i in 1 : nrow (spY_cids))
    {
        cid = as.integer (as.scalar (spY_cids [i, 1]));
        pct = as.scalar (rounded_percentages [i, 1]);
        space_pct = "";  if (pct < 100) {space_pct = " ";}  if (pct < 10) {space_pct = "  ";}
        print ("Category " + cid + 
            ":  best pred. cluster is " + as.integer (as.scalar (prY_cids [i, 1])) + 
            ";  full count = " + as.integer (as.scalar (full_counts [i, 1])) + 
            ",  matching count = " + space_pct + pct + "% (" +
            as.integer (as.scalar (matching_counts [i, 1])) + ")");
            
        str = append (str, "SPEC_TO_PRED,"  + cid + "," + as.scalar (prY_cids [i, 1]));
        str = append (str, "SPEC_FULL_CT,"  + cid + "," + as.scalar (full_counts [i, 1]));
        str = append (str, "SPEC_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1]));
        str = append (str, "SPEC_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1]));
    }

    [prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] =
        get_best_assignments (t(spYprY_row_counts));
        
    print (" ");
    print ("Predicted Clusters versus Specified Categories:");
    
    prY_cids = prY_cids + orig_min_prY - 1;
    spY_cids = spY_cids + orig_min_spY - 1;
    
    for (i in 1 : nrow (prY_cids))
    {
        cid = as.integer (as.scalar (prY_cids [i, 1]));
        pct = as.scalar (rounded_percentages [i, 1]);
        space_pct = "";  if (pct < 100) {space_pct = " ";}  if (pct < 10) {space_pct = "  ";}
        print ("Cluster " + cid + 
            ":  best spec. categ. is " + as.integer (as.scalar (spY_cids [i, 1])) + 
            ";  full count = " + as.integer (as.scalar (full_counts [i, 1])) + 
            ",  matching count = " + space_pct + pct + "% (" +
            as.integer (as.scalar (matching_counts [i, 1])) + ")");

        str = append (str, "PRED_TO_SPEC,"  + cid + "," + as.scalar (spY_cids [i, 1]));
        str = append (str, "PRED_FULL_CT,"  + cid + "," + as.scalar (full_counts [i, 1]));
        str = append (str, "PRED_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1]));
        str = append (str, "PRED_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1]));
    }

    print (" ");
}   # end else: spY and prY dimensions are consistent
}   # end if (fileSpY != " ")
}   # end else: C is only used together with X

if ((fileO != " ") & (! is_str_empty)) {
    write (str, fileO);
}

print ("DONE: K-MEANS SCORING SCRIPT");



# Given a contingency table of counts, for every row with a positive margin
# find the column with the largest count (ties are broken in favor of the
# smallest column index) and return the row ids, the best column ids, the row
# margins ("full counts"), the maximum counts, and the matching percentages
# rounded to 4 decimal places.
get_best_assignments = function (Matrix[double] counts)
return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins, 
        Matrix[double] max_counts, Matrix[double] rounded_percentages)
{
    all_margins = rowSums (counts);
    # Keep only the rows that actually occur, i.e. have a positive margin
    select_positive = diag (all_margins > 0);
    select_positive = removeEmpty (target = select_positive, margin = "rows");
    row_ids = select_positive %*% seq (1, nrow (all_margins), 1);
    pos_counts = select_positive %*% counts;
    # Return the margins of the selected rows so that they align with row_ids
    margins = select_positive %*% all_margins;
    max_counts = rowMaxs (pos_counts);
    one_per_column = matrix (1, rows = 1, cols = ncol (pos_counts));
    is_max_count = (pos_counts == (max_counts %*% one_per_column));
    # For each row, the index of the first column that attains the maximum count
    aggr_is_max_count = t(cumsum (t(is_max_count)));
    col_ids = rowSums (aggr_is_max_count == 0) + 1;
    rounded_percentages = round (1000000.0 * max_counts / margins) / 10000.0;
}




