org.neo4j.gds.kmeans.KmeansPlusPlusSampler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of algo Show documentation
Neo4j Graph Data Science :: Algorithms
The newest version!
/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [http://neo4j.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see .
 */
package org.neo4j.gds.kmeans;


import com.carrotsearch.hppc.BitSet;
import org.neo4j.gds.collections.ha.HugeDoubleArray;
import org.neo4j.gds.core.concurrency.Concurrency;
import org.neo4j.gds.core.concurrency.RunWithConcurrency;
import org.neo4j.gds.core.utils.progress.tasks.ProgressTracker;

import java.util.List;
import java.util.SplittableRandom;
import java.util.concurrent.ExecutorService;

public class KmeansPlusPlusSampler extends KmeansSampler {

    private final List tasks;
    private final Concurrency concurrency;
    private final ProgressTracker progressTracker;
    private final HugeDoubleArray distanceFromClosestCentroid;
    private final ExecutorService executorService;


    public KmeansPlusPlusSampler(
        SplittableRandom random,
        ClusterManager clusterManager,
        long nodeCount,
        int k,
        HugeDoubleArray distanceFromClosestCentroid,
        Concurrency concurrency,
        ExecutorService executorService,
        List tasks,
        ProgressTracker progressTracker
    ) {
        super(random, clusterManager, nodeCount, k);
        this.distanceFromClosestCentroid = distanceFromClosestCentroid;
        this.executorService = executorService;
        this.tasks = tasks;
        this.concurrency = concurrency;
        this.progressTracker = progressTracker;
    }


    @Override
    public void performInitialSampling() {
        long firstId = random.nextLong(nodeCount);

        BitSet bitSet = new BitSet(nodeCount);
        assignToCluster(bitSet, 0, firstId);
        progressTracker.logProgress(1);
        for (int selectionClusterId = 1; selectionClusterId < k; ++selectionClusterId) {

            RunWithConcurrency.builder()
                .concurrency(concurrency)
                .tasks(tasks)
                .executor(executorService)
                .run();

            double squaredDistance = 0;
            for (KmeansTask task : tasks) {
                squaredDistance += task.getSquaredDistance();
            }
            long nextNode = -1;

            //This is fail-case in case of overflow
            if (!(Double.isInfinite(squaredDistance) || squaredDistance <= 0)) {

                double x = random.nextDouble() * squaredDistance;
                double curr = 0;

                for (long nodeId = 0; nodeId < nodeCount; nodeId++) {
                    double distanceFromCentroid = distanceFromClosestCentroid.get(nodeId);
                    if (distanceFromCentroid <= -1) {
                        continue;
                    }
                    curr += distanceFromCentroid * distanceFromCentroid;

                    if (x <= curr) {
                        nextNode = nodeId;
                        break;
                    }

                }
            }
            if (nextNode == -1) {
                nextNode = random.nextLong(nodeCount);
                while (bitSet.get(nextNode)) {
                    nextNode = random.nextLong(nodeCount);
                }
            }
            assignToCluster(bitSet, selectionClusterId, nextNode);
            progressTracker.logProgress(1);
        }
        //nowe we have k clusters and distanceFromClusterAlso for each node closest communit in 0...k-2
        RunWithConcurrency.builder()  //now run one last time just to save  have the vest community in 0...k-1
            .concurrency(concurrency)
            .tasks(tasks)
            .executor(executorService)
            .run();

        for (KmeansTask task : tasks) {
            task.switchToPhase(TaskPhase.ITERATION);
        }
    }

    private void assignToCluster(BitSet bitSet, int position, long nextNode) {
        bitSet.set(nextNode);
        clusterManager.initialAssignCluster(nextNode);
        distanceFromClosestCentroid.set(nextNode, -(position + 1));
    }
}