# scripts.algorithms.Kmeans-predict.dml -- systemml: Declarative Machine Learning
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Compares two categorical data vectors (presumed to be clusterings) and
# counts matching/nonmatching same-cluster/different-cluster pairs of rows
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------
# NAME  TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------
# spY   String  " "      Location to read a column-vector with the "specified"
#                        assignment of records (rows) to categories (clusters)
# prY   String  " "      Location to read (or write, if X and C are present) a
#                        column-vector with the "predicted" assignment of rows
#                        to clusters.  NOTE: The same category may be labeled
#                        differently in each of the two vectors, spY and prY.
# fmt   String  "text"   Matrix output format for prY, usually "text" or "csv"
# X     String  " "      Location to read matrix X with the input data records
# C     String  " "      Location to read matrix C with the cluster centroids
#                        NOTE: If X and C are present, prY is an output file.
# O     String  " "      Location to write the printed output statistics
# ---------------------------------------------------------------------------
#
# The "O" file provides the output statistics in CSV format, one per line, in
# the following format: NAME, [CID], VALUE. Note:
# - The 1st group statistics are given if X input is available;
# - The 2nd group statistics are given if X and C inputs are available;
# - The 3rd and 4th group statistics are given if spY input is available;
# - Only the 4th group statistics contain a nonempty CID value;
# - When present, CID contains either the specified category label or the
# predicted cluster label.
#
# NAME            CID   MEANING
# ---------------------------------------------------------------------------
# TSS                   Total Sum of Squares (from the total mean)
# WCSS_M                Within-Cluster Sum of Squares (means as centers)
# WCSS_M_PC             Within-Cluster Sum of Squares (means), in % of TSS
# BCSS_M                Between-Cluster Sum of Squares (means as centers)
# BCSS_M_PC             Between-Cluster Sum of Squares (means), in % of TSS
#
# WCSS_C                Within-Cluster Sum of Squares (centroids as centers)
# WCSS_C_PC             Within-Cluster Sum of Squares (centroids), % of TSS
# BCSS_C                Between-Cluster Sum of Squares (centroids as centers)
# BCSS_C_PC             Between-Cluster Sum of Squares (centroids), % of TSS
#
# TRUE_SAME_CT          Same-category pairs predicted as Same-cluster, count
# TRUE_SAME_PC          Same-category pairs predicted as Same-cluster, %
# TRUE_DIFF_CT          Diff-category pairs predicted as Diff-cluster, count
# TRUE_DIFF_PC          Diff-category pairs predicted as Diff-cluster, %
# FALSE_SAME_CT         Diff-category pairs predicted as Same-cluster, count
# FALSE_SAME_PC         Diff-category pairs predicted as Same-cluster, %
# FALSE_DIFF_CT         Same-category pairs predicted as Diff-cluster, count
# FALSE_DIFF_PC         Same-category pairs predicted as Diff-cluster, %
#
# SPEC_TO_PRED     +    For specified category, the best predicted cluster id
# SPEC_FULL_CT     +    For specified category, its full count
# SPEC_MATCH_CT    +    For specified category, best-cluster matching count
# SPEC_MATCH_PC    +    For specified category, % of matching to full count
# PRED_TO_SPEC     +    For predicted cluster, the best specified category id
# PRED_FULL_CT     +    For predicted cluster, its full count
# PRED_MATCH_CT    +    For predicted cluster, best-category matching count
# PRED_MATCH_PC    +    For predicted cluster, % of matching to full count
# ---------------------------------------------------------------------------
#
# Examples:
# 1. To predict Y given X and C:
#     hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
#         C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats
# 2. To compare "actual" labels spY with "predicted" labels given X and C:
#     hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
#         C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats
# 3. To compare "actual" labels spY with given "predicted" labels prY:
#     hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y
#         prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats
# Accumulator for the CSV statistics that are written to the "O" file at the
# end of the script; is_str_empty records whether anything was stored in it.
str = " ";
is_str_empty = TRUE;

# Named command-line arguments.  A single blank (" ") is the placeholder that
# the rest of the script tests for to detect "argument not supplied".
fileX   = ifdef ($X, " ");
fileC   = ifdef ($C, " ");
fileSpY = ifdef ($spY, " ");
filePrY = ifdef ($prY, " ");
fileO   = ifdef ($O, " ");
fmt_prY = ifdef ($fmt, "text");

print ("BEGIN K-MEANS SCORING SCRIPT");
# When the data records are supplied, load them and compute the total sum of
# squares (TSS) of the records around their overall column-wise mean.
if (fileX != " ")
{
    print ("Reading X...");
    X = read (fileX);
    n_rows_X = nrow (X);
    total_mean = colSums (X) / n_rows_X;
    total_ss = sum ((X - total_mean) ^ 2);
}
# ---------------------------------------------------------------------------
# Main scoring section.
#   1. Obtain the predicted labels prY: computed from X and C when C is given,
#      otherwise read from the prY location.
#   2. If X is available: WCSS/BCSS using the per-cluster means as centers.
#   3. If C is available: WCSS/BCSS using the given centroids as centers.
#   4. If spY is available: pair-counting agreement between spY and prY, and
#      the best-match tables in both directions.
# Statistics are appended to "str" in the format NAME,[CID],VALUE.
# ---------------------------------------------------------------------------
if ((fileC != " ") & (fileX == " ")) {
    print ("ERROR: Cannot provide C without providing X.");
} else { if ((fileC == " ") & (filePrY == " ")) {
    # Without C there is nothing to predict from, and without prY there is
    # nothing to read: report it instead of trying to read the placeholder path.
    print ("ERROR: Cannot compute the statistics without providing either C or prY.");
} else {

    if (fileC != " ") {
        print ("Reading C...");
        C = read (fileC);
        num_clusters = nrow (C);
        ones_C = matrix (1, rows = num_clusters, cols = 1);
        print ("Computing the predicted Y...");
        # D[i,j] = -2 * <x_i, c_j> + ||c_j||^2, i.e. the squared Euclidean
        # distance from record i to centroid j minus ||x_i||^2.  The omitted
        # term is constant within a row, so the row-wise argmin is unaffected.
        D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
        prY = rowIndexMin (D);
        if (filePrY != " ") {
            print ("Writing the predicted Y...");
            write (prY, filePrY, format=fmt_prY);
        }
    } else {
        print ("Reading the predicted Y...");
        prY = read (filePrY);
        # With no centroids, the largest label determines the cluster count.
        num_clusters = max (prY);
        ones_C = matrix (1, rows = num_clusters, cols = 1);
    }

    if (fileX != " ") {
        print ("Computing the WCSS...");
        # Compute projection matrix from clusters to records:
        # P[i,j] = 1 iff record i is assigned to cluster j.  P is allocated with
        # num_clusters columns so that clusters beyond max(prY) stay all-zero.
        P = matrix (0, rows = nrow (X), cols = num_clusters);
        P [, 1 : max (prY)] = table (seq (1, nrow (X), 1), prY);
        # Compute the means, as opposed to the centroids
        cluster_sizes = t(colSums (P));
        record_of_ones = matrix (1, rows = 1, cols = ncol (X));
        # (cluster_sizes == 0) adds 1 to the divisor of empty clusters only,
        # which avoids 0/0 and leaves their mean rows at zero.
        M = (t(P) %*% X) / ((cluster_sizes + (cluster_sizes == 0)) %*% record_of_ones);
        # Compute the WCSS for the means
        wcss_means = sum ((X - P %*% M) ^ 2);
        wcss_means_pc = 100.0 * wcss_means / total_ss;
        bcss_means = sum (cluster_sizes * rowSums ((M - ones_C %*% total_mean) ^ 2));
        bcss_means_pc = 100.0 * bcss_means / total_ss;
        # Output results (printed percentages are rounded to 4 decimals, the
        # values stored in "str" are not)
        print ("Total Sum of Squares (TSS) = " + total_ss);
        print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means);
        print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means);
        str = "TSS,," + total_ss;
        str = append (str, "WCSS_M,," + wcss_means);
        str = append (str, "WCSS_M_PC,," + wcss_means_pc);
        str = append (str, "BCSS_M,," + bcss_means);
        str = append (str, "BCSS_M_PC,," + bcss_means_pc);
        is_str_empty = FALSE;
    }

    # C is only accepted together with X (checked above), so P, cluster_sizes
    # and total_ss from the previous section are available here.
    if (fileC != " ") {
        # Compute the WCSS for the centroids
        wcss_centroids = sum ((X - P %*% C) ^ 2);
        wcss_centroids_pc = 100.0 * wcss_centroids / total_ss;
        bcss_centroids = sum (cluster_sizes * rowSums ((C - ones_C %*% total_mean) ^ 2));
        bcss_centroids_pc = 100.0 * bcss_centroids / total_ss;
        # Output results
        print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids);
        print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids);
        str = append (str, "WCSS_C,," + wcss_centroids);
        str = append (str, "WCSS_C_PC,," + wcss_centroids_pc);
        str = append (str, "BCSS_C,," + bcss_centroids);
        str = append (str, "BCSS_C_PC,," + bcss_centroids_pc);
    }

    if (fileSpY != " ") {
        print ("Reading the specified Y...");
        spY = read (fileSpY);
        num_records = nrow (spY);
        if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) {
            print ("ERROR: spY and/or prY size mismatch");
            print ("nrow (spY) = " + nrow (spY) + "; ncol (spY) = " + ncol (spY)
                + "; nrow (prY) = " + nrow (prY) + "; ncol (prY) = " + ncol (prY));
        } else {
            print ("Computing the pairs counts...");

            # Shift both label vectors so that their smallest label becomes 1;
            # the original minima are kept to report the original labels later.
            orig_min_spY = min (spY);
            orig_min_prY = min (prY);
            spY = spY + (1 - orig_min_spY);
            prY = prY + (1 - orig_min_prY);

            # Contingency table: rows = specified categories, cols = predicted clusters
            spYprY_row_counts = table (spY, prY);
            spY_row_counts = rowSums (spYprY_row_counts);
            prY_row_counts = t(colSums (spYprY_row_counts));

            # Count all pairs of rows having the same (spY, prY)-values
            spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2;
            # Count all pairs of rows having the same spY-values
            spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2;
            # Count all pairs of rows having the same prY-values
            prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2;

            num_pairs = num_records * (num_records - 1.0) / 2.0;

            # TP: same category and same cluster;  FP: same cluster only;
            # FN: same category only;  TN: all remaining pairs.
            num_TP_pairs = sum (spYprY_pair_counts);
            num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs;
            num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs;
            num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs;

            pct_TP_pairs = num_TP_pairs / num_pairs * 100.0;
            pct_TN_pairs = num_TN_pairs / num_pairs * 100.0;
            pct_FP_pairs = num_FP_pairs / num_pairs * 100.0;
            pct_FN_pairs = num_FN_pairs / num_pairs * 100.0;

            # "str" still holds its placeholder if no X-based statistics were stored
            if (is_str_empty) {
                str = "TRUE_SAME_CT,," + num_TP_pairs;
                is_str_empty = FALSE;
            } else {
                str = append (str, "TRUE_SAME_CT,," + num_TP_pairs);
            }
            str = append (str, "TRUE_SAME_PC,," + pct_TP_pairs);
            str = append (str, "TRUE_DIFF_CT,," + num_TN_pairs);
            str = append (str, "TRUE_DIFF_PC,," + pct_TN_pairs);
            str = append (str, "FALSE_SAME_CT,," + num_FP_pairs);
            str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs);
            str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs);
            str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs);

            # Round to 4 decimals for printing only (unrounded values are in "str")
            pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0;
            pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0;
            pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0;
            pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0;

            # Left-pad so the printed percentages line up: no padding for 100,
            # one blank for two-digit values, two blanks for one-digit values.
            space_TP = "";
            if (pct_TP_pairs < 100) { space_TP = " "; }
            if (pct_TP_pairs < 10)  { space_TP = "  "; }
            space_TN = "";
            if (pct_TN_pairs < 100) { space_TN = " "; }
            if (pct_TN_pairs < 10)  { space_TN = "  "; }
            space_FP = "";
            if (pct_FP_pairs < 100) { space_FP = " "; }
            if (pct_FP_pairs < 10)  { space_FP = "  "; }
            space_FN = "";
            if (pct_FN_pairs < 100) { space_FN = " "; }
            if (pct_FN_pairs < 10)  { space_FN = "  "; }

            print ("Same-cluster pairs predicted as Same-cluster ( True Pos): " + space_TP
                + pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")");
            print ("Diff-cluster pairs predicted as Diff-cluster ( True Neg): " + space_TN
                + pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")");
            print ("Diff-cluster pairs predicted as Same-cluster (False Pos): " + space_FP
                + pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")");
            print ("Same-cluster pairs predicted as Diff-cluster (False Neg): " + space_FN
                + pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")");

            # For every specified category, find its best-matching predicted cluster
            [spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] =
                get_best_assignments (spYprY_row_counts);

            print (" ");
            print ("Specified Categories versus Predicted Clusters:");

            # Undo the label shift so that the original labels are reported
            spY_cids = spY_cids + orig_min_spY - 1;
            prY_cids = prY_cids + orig_min_prY - 1;

            for (i in 1 : nrow (spY_cids))
            {
                cid = as.integer (as.scalar (spY_cids [i, 1]));
                pct = as.scalar (rounded_percentages [i, 1]);
                space_pct = "";
                if (pct < 100) { space_pct = " "; }
                if (pct < 10)  { space_pct = "  "; }
                print ("Category " + cid +
                    ": best pred. cluster is " + as.integer (as.scalar (prY_cids [i, 1])) +
                    "; full count = " + as.integer (as.scalar (full_counts [i, 1])) +
                    ", matching count = " + space_pct + pct + "% (" +
                    as.integer (as.scalar (matching_counts [i, 1])) + ")");

                str = append (str, "SPEC_TO_PRED," + cid + "," + as.scalar (prY_cids [i, 1]));
                str = append (str, "SPEC_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1]));
                str = append (str, "SPEC_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1]));
                str = append (str, "SPEC_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1]));
            }

            # Same analysis in the other direction (transposed contingency table):
            # for every predicted cluster, find its best-matching specified category
            [prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] =
                get_best_assignments (t(spYprY_row_counts));

            print (" ");
            print ("Predicted Clusters versus Specified Categories:");

            prY_cids = prY_cids + orig_min_prY - 1;
            spY_cids = spY_cids + orig_min_spY - 1;

            for (i in 1 : nrow (prY_cids))
            {
                cid = as.integer (as.scalar (prY_cids [i, 1]));
                pct = as.scalar (rounded_percentages [i, 1]);
                space_pct = "";
                if (pct < 100) { space_pct = " "; }
                if (pct < 10)  { space_pct = "  "; }
                print ("Cluster " + cid +
                    ": best spec. categ. is " + as.integer (as.scalar (spY_cids [i, 1])) +
                    "; full count = " + as.integer (as.scalar (full_counts [i, 1])) +
                    ", matching count = " + space_pct + pct + "% (" +
                    as.integer (as.scalar (matching_counts [i, 1])) + ")");

                str = append (str, "PRED_TO_SPEC," + cid + "," + as.scalar (spY_cids [i, 1]));
                str = append (str, "PRED_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1]));
                str = append (str, "PRED_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1]));
                str = append (str, "PRED_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1]));
            }

            print (" ");
        }
    }
}}
# Persist the collected statistics, but only when an output location was given
# and at least one statistic has actually been stored in "str".
if (fileO != " ")
{
    if (! is_str_empty)
    {
        write (str, fileO);
    }
}

print ("DONE: K-MEANS SCORING SCRIPT");
# ---------------------------------------------------------------------------
# For every row of a contingency table that has a positive total, finds the
# column holding the largest count in that row.
#
# INPUT:
#   counts               Matrix of nonnegative counts (rows x columns)
#
# OUTPUT (all column-vectors with one entry per row of "counts" whose row sum
# is positive, in the original row order; entry i of every vector refers to
# the same row):
#   row_ids              Index of the row in "counts"
#   col_ids              Index of the first column attaining the row maximum
#   margins              Row sum (full count) of that row
#   max_counts           The row maximum itself
#   rounded_percentages  100 * max_counts / margins, rounded to 4 decimals
# ---------------------------------------------------------------------------
get_best_assignments = function (Matrix[double] counts)
    return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins,
            Matrix[double] max_counts, Matrix[double] rounded_percentages)
{
    all_margins = rowSums (counts);

    # Row-selection matrix: the identity restricted to the rows of "counts"
    # with a positive sum (rows for all-zero margins are removed).
    select_positive = diag (all_margins > 0);
    select_positive = removeEmpty (target = select_positive, margin = "rows");

    row_ids = select_positive %*% seq (1, nrow (all_margins), 1);
    pos_counts = select_positive %*% counts;

    # Return the margins of the SELECTED rows only, so that "margins" stays
    # aligned entry-by-entry with row_ids, col_ids, max_counts and
    # rounded_percentages even when some rows of "counts" are empty.
    margins = select_positive %*% all_margins;

    max_counts = rowMaxs (pos_counts);
    one_per_column = matrix (1, rows = 1, cols = ncol (pos_counts));
    max_counts_ppred = max_counts %*% one_per_column;
    is_max_count = (pos_counts == max_counts_ppred);

    # Running count of maxima along each row; the number of leading columns
    # where it is still 0 is the number of columns before the first maximum.
    aggr_is_max_count = t(cumsum (t(is_max_count)));
    col_ids = rowSums (aggr_is_max_count == 0) + 1;

    rounded_percentages = round (1000000.0 * max_counts / margins) / 10000.0;
}