All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.neo4j.gds.embeddings.hashgnn.BinarizeTask Maven / Gradle / Ivy

/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [http://neo4j.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see .
 */
package org.neo4j.gds.embeddings.hashgnn;

import org.apache.commons.lang3.mutable.MutableLong;
import org.neo4j.gds.api.Graph;
import org.neo4j.gds.collections.ha.HugeObjectArray;
import org.neo4j.gds.core.concurrency.Concurrency;
import org.neo4j.gds.core.concurrency.RunWithConcurrency;
import org.neo4j.gds.core.utils.paged.HugeAtomicBitSet;
import org.neo4j.gds.core.utils.partition.Partition;
import org.neo4j.gds.core.utils.progress.tasks.ProgressTracker;
import org.neo4j.gds.ml.core.features.FeatureConsumer;
import org.neo4j.gds.ml.core.features.FeatureExtraction;
import org.neo4j.gds.ml.core.features.FeatureExtractor;
import org.neo4j.gds.termination.TerminationFlag;

import java.util.List;
import java.util.SplittableRandom;
import java.util.stream.Collectors;

import static org.neo4j.gds.utils.StringFormatting.formatWithLocale;

class BinarizeTask implements Runnable {
    private final Partition partition;
    private final HugeObjectArray truncatedFeatures;
    private final List featureExtractors;
    private final double[][] propertyEmbeddings;

    private final double threshold;
    private final int dimension;
    private final ProgressTracker progressTracker;
    private long totalFeatureCount;

    private double scalarProductSum;

    private double scalarProductSumOfSquares;

    BinarizeTask(
        Partition partition,
        BinarizeFeaturesConfig config,
        HugeObjectArray truncatedFeatures,
        List featureExtractors,
        double[][] propertyEmbeddings,
        ProgressTracker progressTracker
    ) {
        this.partition = partition;
        this.dimension = config.dimension();
        this.threshold = config.threshold();
        this.truncatedFeatures = truncatedFeatures;
        this.featureExtractors = featureExtractors;
        this.propertyEmbeddings = propertyEmbeddings;
        this.progressTracker = progressTracker;
    }

    static HugeObjectArray compute(
        Graph graph,
        List partition,
        Concurrency concurrency,
        List featureProperties,
        BinarizeFeaturesConfig binarizationConfig,
        SplittableRandom rng,
        ProgressTracker progressTracker,
        TerminationFlag terminationFlag,
        MutableLong totalFeatureCountOutput
    ) {
        progressTracker.beginSubTask("Binarize node property features");

        var featureExtractors = FeatureExtraction.propertyExtractors(graph, featureProperties);

        var inputDimension = FeatureExtraction.featureCount(featureExtractors);
        var propertyEmbeddings = embedProperties(binarizationConfig.dimension(), rng, inputDimension);

        var truncatedFeatures = HugeObjectArray.newArray(HugeAtomicBitSet.class, graph.nodeCount());

        var tasks = partition.stream()
            .map(p -> new BinarizeTask(
                p,
                binarizationConfig,
                truncatedFeatures,
                featureExtractors,
                propertyEmbeddings,
                progressTracker
            ))
            .collect(Collectors.toList());
        RunWithConcurrency.builder()
            .concurrency(concurrency)
            .tasks(tasks)
            .terminationFlag(terminationFlag)
            .run();

        totalFeatureCountOutput.add(tasks.stream().mapToLong(BinarizeTask::totalFeatureCount).sum());

        var squaredSum = tasks.stream().mapToDouble(BinarizeTask::scalarProductSumOfSquares).sum();
        var sum = tasks.stream().mapToDouble(BinarizeTask::scalarProductSum).sum();
        long exampleCount = graph.nodeCount() * binarizationConfig.dimension();
        var avg = sum / exampleCount;

        var variance = (squaredSum - exampleCount * avg * avg) / exampleCount;
        var std = Math.sqrt(variance);

        progressTracker.logInfo(formatWithLocale(
            "Hyperplane scalar products have mean %.4f and standard deviation %.4f. A threshold for binarization may be set to the mean plus a few standard deviations.",
            avg,
            std
        ));

        progressTracker.endSubTask("Binarize node property features");

        return truncatedFeatures;
    }

    // creates a random projection vector for each feature
    // (input features vector for each node is the concatenation of the node's properties)
    // this array is used embed the properties themselves from inputDimension to embeddingDimension dimensions.
    private static double[][] embedProperties(int vectorDimension, SplittableRandom rng, int inputDimension) {
        var propertyEmbeddings = new double[inputDimension][];

        for (int inputFeature = 0; inputFeature < inputDimension; inputFeature++) {
            propertyEmbeddings[inputFeature] = new double[vectorDimension];
            for (int feature = 0; feature < vectorDimension; feature++) {
                propertyEmbeddings[inputFeature][feature] = boxMullerGaussianRandom(rng);
            }
        }
        return propertyEmbeddings;
    }

    private static double boxMullerGaussianRandom(SplittableRandom rng) {
        return Math.sqrt(-2 * Math.log(rng.nextDouble(
            0.0,
            1.0
        ))) * Math.cos(2 * Math.PI * rng.nextDouble(0.0, 1.0));
    }

    @Override
    public void run() {
        partition.consume(nodeId -> {
            var featureVector = new float[dimension];
            FeatureExtraction.extract(nodeId, -1, featureExtractors, new FeatureConsumer() {
                @Override
                public void acceptScalar(long nodeOffset, int offset, double value) {
                    for (int feature = 0; feature < dimension; feature++) {
                        double featureValue = propertyEmbeddings[offset][feature];
                        featureVector[feature] += value * featureValue;
                    }
                }

                @Override
                public void acceptArray(long nodeOffset, int offset, double[] values) {
                    for (int inputFeatureOffset = 0; inputFeatureOffset < values.length; inputFeatureOffset++) {
                        double value = values[inputFeatureOffset];
                        for (int feature = 0; feature < dimension; feature++) {
                            double featureValue = propertyEmbeddings[offset + inputFeatureOffset][feature];
                            featureVector[feature] += value * featureValue;
                        }
                    }
                }
            });

            var featureSet = round(featureVector);
            totalFeatureCount += featureSet.cardinality();
            truncatedFeatures.set(nodeId, featureSet);
        });

        progressTracker.logProgress(partition.nodeCount());
    }

    private HugeAtomicBitSet round(float[] floatVector) {
        var bitset = HugeAtomicBitSet.create(floatVector.length);
        for (int feature = 0; feature < floatVector.length; feature++) {
            var scalarProduct = floatVector[feature];
            scalarProductSum += scalarProduct;
            scalarProductSumOfSquares += scalarProduct * scalarProduct;
            if (scalarProduct > threshold) {
                bitset.set(feature);
            }
        }
        return bitset;
    }

    private long totalFeatureCount() {
        return totalFeatureCount;
    }

    private double scalarProductSum() {
        return scalarProductSum;
    }

    private double scalarProductSumOfSquares() {
        return scalarProductSumOfSquares;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy