scripts.algorithms.Kmeans.dml Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Implements the k-Means clustering algorithm
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# X String --- Location to read matrix X with the input data records
# k Int --- Number of centroids
# runs Int 10 Number of runs (with different initial centroids)
# maxi Int 1000 Maximum number of iterations per run
# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio
# samp Int 50 Average number of records per centroid in data samples
# C String "C.mtx" Location to store the output matrix with the centroids
# isY Boolean FALSE do not write Y
# Y String "Y.mtx" Location to store the mapping of records to centroids
# fmt String "text" Matrix output format, usually "text" or "csv"
# verb Boolean FALSE do not print per-iteration stats
# ----------------------------------------------------------------------------
#
# Example:
# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx
# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=TRUE Y=clusters.mtx verb=TRUE
fileX = $X;
fileY = ifdef ($Y, "Y.mtx");
fileC = ifdef ($C, "C.mtx");
num_centroids = $k;
num_runs = ifdef ($runs, 10); # $runs=10;
max_iter = ifdef ($maxi, 1000); # $maxi=1000;
eps = ifdef ($tol, 0.000001); # $tol=0.000001;
is_write_Y = ifdef ($isY, FALSE); # $isY=FALSE;
is_verbose = ifdef ($verb, FALSE); # $verb=FALSE;
fmtCY = ifdef ($fmt, "text"); # $fmt="text";
avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50;
print ("BEGIN K-MEANS SCRIPT");
print ("Reading X...");
# X : matrix of data points as rows
X = read (fileX);
num_records = nrow (X);
num_features = ncol (X);
sumXsq = sum (X ^ 2);
# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B))
# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES:
print ("Taking data samples for initialization...");
[sample_maps, samples_vs_runs_map, sample_block_size] =
get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
is_row_in_samples = rowSums (sample_maps);
X_samples = sample_maps %*% X;
X_samples_sq_norms = rowSums (X_samples ^ 2);
print ("Initializing the centroids for all runs...");
All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features);
# We select centroids according to the k-Means++ heuristic applied to a sample of X
# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids,
# with the out-of-range X_sample positions in min_distances set to 0.0
min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random
for (i in 1 : num_centroids)
{
# "Matricize" and prefix-sum to compute the cumulative distribution function:
min_distances_matrix_form =
matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE);
cdf_min_distances = cumsum (min_distances_matrix_form);
# Select the i-th centroid in each sample as a random sample row id with
# probability ~ min_distances:
random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0);
threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1;
# Place the selected centroids together, one per run, into a matrix:
centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs));
centroid_placer_raw =
table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids);
centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw;
centroids = centroid_placer %*% X_samples;
# Place the selected centroids into their appropriate slots in All_Centroids:
centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs);
centroid_placer_raw =
table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1));
centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw;
All_Centroids = All_Centroids + centroid_placer %*% centroids;
# Update min_distances to preserve the loop invariant:
distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2)
- 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids));
if (i == 1) {
min_distances = is_row_in_samples * distances;
} else {
min_distances = min (min_distances, distances);
} }
# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS:
termination_code = matrix (0, rows = num_runs, cols = 1);
final_wcss = matrix (0, rows = num_runs, cols = 1);
num_iterations = matrix (0, rows = num_runs, cols = 1);
print ("Performing k-means iterations for all runs...");
parfor (run_index in 1 : num_runs, check = 0)
{
C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ];
C_old = C;
iter_count = 0;
term_code = 0;
wcss = 0;
while (term_code == 0)
{
# Compute Euclidean squared distances from records (X rows) to centroids (C rows)
# without the C-independent term, then take the minimum for each record
D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
minD = rowMins (D);
# Compute the current centroid-based within-cluster sum of squares (WCSS)
wcss_old = wcss;
wcss = sumXsq + sum (minD);
if (is_verbose == TRUE) {
if (iter_count == 0) {
print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss);
} else {
print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss
+ "; Centroid change (avg.sq.dist.) = " + (sum ((C - C_old) ^ 2) / num_centroids));
} }
# Check if convergence or maximum iteration has been reached
if (wcss_old - wcss < eps * wcss & iter_count > 0) {
term_code = 1; # Convergence is reached
} else {
if (iter_count >= max_iter) {
term_code = 2; # Maximum iteration is reached
} else {
iter_count = iter_count + 1;
# Find the closest centroid for each record
P = (D <= minD);
# If some records belong to multiple centroids, share them equally
P = P / rowSums (P);
# Compute the column normalization factor for P
P_denom = colSums (P);
if (sum (P_denom <= 0) > 0) {
term_code = 3; # There is a "runaway" centroid with 0.0 denominator
} else {
C_old = C;
# Compute new centroids as weighted averages over the records
C = (t(P) %*% X) / t(P_denom);
} } } }
print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss);
All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C;
final_wcss [run_index, 1] = wcss;
termination_code [run_index, 1] = term_code;
num_iterations [run_index, 1] = iter_count;
}
# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS:
termination_bitmap = matrix (0, rows = num_runs, cols = 3);
termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code);
termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw;
termination_stats = colSums (termination_bitmap);
print ("Number of successful runs = " + as.integer (as.scalar (termination_stats [1, 1])));
print ("Number of incomplete runs = " + as.integer (as.scalar (termination_stats [1, 2])));
print ("Number of failed runs (with lost centroids) = " + as.integer (as.scalar (termination_stats [1, 3])));
num_successful_runs = as.scalar (termination_stats [1, 1]);
if (num_successful_runs > 0) {
final_wcss_successful = final_wcss * termination_bitmap [, 1];
worst_wcss = max (final_wcss_successful);
best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1]));
avg_wcss = sum (final_wcss_successful) / num_successful_runs;
best_index_vector = (final_wcss_successful == best_wcss);
aggr_best_index_vector = cumsum (best_index_vector);
best_index = as.integer (sum (aggr_best_index_vector == 0) + 1);
print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss
+ "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss);
C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
print ("Writing out the best-WCSS centroids...");
write (C, fileC, format=fmtCY);
if (is_write_Y == TRUE) {
print ("Writing out the best-WCSS cluster labels...");
D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
P = (D <= rowMins (D));
aggr_P = t(cumsum (t(P)));
Y = rowSums (aggr_P == 0) + 1
write (Y, fileY, format=fmtCY);
}
print ("DONE.");
} else {
stop ("No output is produced. Try increasing the number of iterations and/or runs.");
}
get_sample_maps = function (int num_records, int num_samples, int approx_sample_size)
return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size)
{
if (approx_sample_size < num_records) {
# Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's
# to get the sample block size (to allocate space):
sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size)));
num_rows = sample_block_size * num_samples;
# Generate all samples in parallel by converting uniform random values into random
# integer skip-ahead intervals and prefix-summing them:
sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0);
sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5);
# Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one]
sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one)
# Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1":
is_sample_rec_id_within_range = (sample_rec_ids <= num_records);
sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range
+ (num_records + 1) * (1 - is_sample_rec_id_within_range);
# Rearrange all samples (and their out-of-range indicators) into one column-vector:
sample_rec_ids =
matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE);
is_row_in_samples =
matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE);
# Use contingency table to create the "sample_maps" matrix that is a vertical concatenation
# of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere:
sample_maps_raw = table (seq (1, num_rows), sample_rec_ids);
max_rec_id = ncol (sample_maps_raw);
if (max_rec_id >= num_records) {
sample_maps = sample_maps_raw [, 1 : num_records];
} else {
sample_maps = matrix (0, rows = num_rows, cols = num_records);
sample_maps [, 1 : max_rec_id] = sample_maps_raw;
}
# Create a 0-1-matrix that maps each sample column ID into all row positions of the
# corresponding sample; map out-of-sample-range positions to row id = num_rows + 1:
sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1);
# Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c:
col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size);
sample_col_map = table (sample_positions, col_positions);
# Remove the out-of-sample-range positions by cutting off the last row:
sample_col_map = sample_col_map [1 : (num_rows), ];
} else {
one_per_record = matrix (1, rows = num_records, cols = 1);
sample_block_size = num_records;
sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records);
sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples);
for (i in 1:num_samples) {
sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record);
sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record;
} } }