org.apache.flink.graph.example.JaccardSimilarityMeasure Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.graph.example;
import org.apache.flink.api.common.ProgramDescription;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.EdgeDirection;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.ReduceNeighborsFunction;
import org.apache.flink.graph.Vertex;
import org.apache.flink.graph.Triplet;
import org.apache.flink.graph.VertexJoinFunction;
import org.apache.flink.graph.example.utils.JaccardSimilarityMeasureData;
import java.util.HashSet;
/**
* This example shows how to use
* <ul>
* <li>neighborhood methods</li>
* <li>join with vertices</li>
* <li>triplets</li>
* </ul>
*
* Given a directed, unweighted graph, return a weighted graph where the edge values are equal
* to the Jaccard similarity coefficient - the number of common neighbors divided by the size
* of the union of neighbor sets - for the src and target vertices.
*
*
* Input files are plain text files and must be formatted as follows:
*
* Edges are represented by pairs of srcVertexId, trgVertexId separated by tabs.
* Edges themselves are separated by newlines.
* For example: <code>1\t2\n1\t3\n</code> defines two edges, 1-2 and 1-3.
*
*
* Usage: <code>JaccardSimilarityMeasure &lt;edge path&gt; &lt;result path&gt;</code>
* If no parameters are provided, the program is run with default data from
* {@link org.apache.flink.graph.example.utils.JaccardSimilarityMeasureData}
*/
@SuppressWarnings("serial")
public class JaccardSimilarityMeasure implements ProgramDescription {
public static void main(String [] args) throws Exception {
if(!parseParameters(args)) {
return;
}
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet> edges = getEdgesDataSet(env);
Graph, Double> graph = Graph.fromDataSet(edges,
new MapFunction>() {
@Override
public HashSet map(Long id) throws Exception {
HashSet neighbors = new HashSet();
neighbors.add(id);
return new HashSet(neighbors);
}
}, env);
// create the set of neighbors
DataSet>> computedNeighbors =
graph.reduceOnNeighbors(new GatherNeighbors(), EdgeDirection.ALL);
// join with the vertices to update the node values
Graph, Double> graphWithVertexValues =
graph.joinWithVertices(computedNeighbors, new VertexJoinFunction,
HashSet>() {
public HashSet vertexJoin(HashSet vertexValue, HashSet inputValue) {
return inputValue;
}
});
// compare neighbors, compute Jaccard
DataSet> edgesWithJaccardValues =
graphWithVertexValues.getTriplets().map(new ComputeJaccard());
// emit result
if (fileOutput) {
edgesWithJaccardValues.writeAsCsv(outputPath, "\n", ",");
// since file sinks are lazy, we trigger the execution explicitly
env.execute("Executing Jaccard Similarity Measure");
} else {
edgesWithJaccardValues.print();
}
}
/**
 * Supplies the description of this program.
 *
 * @return the fixed, human-readable name of the example
 */
@Override
public String getDescription() {
    final String description = "Vertex Jaccard Similarity Measure";
    return description;
}
/**
* Each vertex will have a HashSet containing its neighbor ids as value.
*/
private static final class GatherNeighbors implements ReduceNeighborsFunction> {
@Override
public HashSet reduceNeighbors(HashSet first, HashSet second) {
first.addAll(second);
return new HashSet(first);
}
}
/**
* The edge weight will be the Jaccard coefficient, which is computed as follows:
*
* Consider the edge x-y
* We denote by sizeX and sizeY, the neighbors hash set size of x and y respectively.
* sizeX+sizeY = union + intersection of neighborhoods
* size(hashSetX.addAll(hashSetY)).distinct = union of neighborhoods
* The intersection can then be deduced.
*
* The Jaccard similarity coefficient is then, the intersection/union.
*/
private static final class ComputeJaccard implements
MapFunction, Double>, Edge> {
@Override
public Edge map(Triplet, Double> triplet) throws Exception {
Vertex> srcVertex = triplet.getSrcVertex();
Vertex> trgVertex = triplet.getTrgVertex();
Long x = srcVertex.getId();
Long y = trgVertex.getId();
HashSet neighborSetY = trgVertex.getValue();
double unionPlusIntersection = srcVertex.getValue().size() + neighborSetY.size();
// within a HashSet, all elements are distinct
HashSet unionSet = new HashSet();
unionSet.addAll(srcVertex.getValue());
unionSet.addAll(neighborSetY);
double union = unionSet.size();
double intersection = unionPlusIntersection - union;
return new Edge(x, y, intersection/union);
}
}
// *************************************************************************
// UTIL METHODS
// *************************************************************************
// Selects the sink used by main(): true writes CSV to outputPath, false prints the result.
private static boolean fileOutput;

// Presumably the location of the edge input file; it is not read in the visible
// part of this file - TODO confirm against getEdgesDataSet.
private static String edgeInputPath;

// Path handed to writeAsCsv(...) in main() when fileOutput is true.
private static String outputPath;
private static boolean parseParameters(String [] args) {
if(args.length > 0) {
if(args.length != 2) {
System.err.println("Usage JaccardSimilarityMeasure
© 2015 - 2025 Weber Informatics LLC | Privacy Policy