All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.graph.example.JaccardSimilarityMeasure Maven / Gradle / Ivy

There is a newer version: 1.16.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.graph.example;

import org.apache.flink.api.common.ProgramDescription;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.EdgeDirection;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.ReduceNeighborsFunction;
import org.apache.flink.graph.Vertex;
import org.apache.flink.graph.Triplet;
import org.apache.flink.graph.VertexJoinFunction;
import org.apache.flink.graph.example.utils.JaccardSimilarityMeasureData;

import java.util.HashSet;

/**
 * This example shows how to use
 * 
    *
  • neighborhood methods *
  • join with vertices *
  • triplets *
* * Given a directed, unweighted graph, return a weighted graph where the edge values are equal * to the Jaccard similarity coefficient - the number of common neighbors divided by the the size * of the union of neighbor sets - for the src and target vertices. * *

* Input files are plain text files and must be formatted as follows: *
* Edges are represented by pairs of srcVertexId, trgVertexId separated by tabs. * Edges themselves are separated by newlines. * For example: 1 2\n1 3\n defines two edges 1-2 and 1-3. *

* * Usage JaccardSimilarityMeasure <edge path> <result path>
* If no parameters are provided, the program is run with default data from * {@link org.apache.flink.graph.example.utils.JaccardSimilarityMeasureData} */ @SuppressWarnings("serial") public class JaccardSimilarityMeasure implements ProgramDescription { public static void main(String [] args) throws Exception { if(!parseParameters(args)) { return; } ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet> edges = getEdgesDataSet(env); Graph, Double> graph = Graph.fromDataSet(edges, new MapFunction>() { @Override public HashSet map(Long id) throws Exception { HashSet neighbors = new HashSet(); neighbors.add(id); return new HashSet(neighbors); } }, env); // create the set of neighbors DataSet>> computedNeighbors = graph.reduceOnNeighbors(new GatherNeighbors(), EdgeDirection.ALL); // join with the vertices to update the node values Graph, Double> graphWithVertexValues = graph.joinWithVertices(computedNeighbors, new VertexJoinFunction, HashSet>() { public HashSet vertexJoin(HashSet vertexValue, HashSet inputValue) { return inputValue; } }); // compare neighbors, compute Jaccard DataSet> edgesWithJaccardValues = graphWithVertexValues.getTriplets().map(new ComputeJaccard()); // emit result if (fileOutput) { edgesWithJaccardValues.writeAsCsv(outputPath, "\n", ","); // since file sinks are lazy, we trigger the execution explicitly env.execute("Executing Jaccard Similarity Measure"); } else { edgesWithJaccardValues.print(); } } @Override public String getDescription() { return "Vertex Jaccard Similarity Measure"; } /** * Each vertex will have a HashSet containing its neighbor ids as value. */ private static final class GatherNeighbors implements ReduceNeighborsFunction> { @Override public HashSet reduceNeighbors(HashSet first, HashSet second) { first.addAll(second); return new HashSet(first); } } /** * The edge weight will be the Jaccard coefficient, which is computed as follows: * * Consider the edge x-y * We denote by sizeX and sizeY, the neighbors hash set size of x and y respectively. * sizeX+sizeY = union + intersection of neighborhoods * size(hashSetX.addAll(hashSetY)).distinct = union of neighborhoods * The intersection can then be deduced. * * The Jaccard similarity coefficient is then, the intersection/union. */ private static final class ComputeJaccard implements MapFunction, Double>, Edge> { @Override public Edge map(Triplet, Double> triplet) throws Exception { Vertex> srcVertex = triplet.getSrcVertex(); Vertex> trgVertex = triplet.getTrgVertex(); Long x = srcVertex.getId(); Long y = trgVertex.getId(); HashSet neighborSetY = trgVertex.getValue(); double unionPlusIntersection = srcVertex.getValue().size() + neighborSetY.size(); // within a HashSet, all elements are distinct HashSet unionSet = new HashSet(); unionSet.addAll(srcVertex.getValue()); unionSet.addAll(neighborSetY); double union = unionSet.size(); double intersection = unionPlusIntersection - union; return new Edge(x, y, intersection/union); } } // ************************************************************************* // UTIL METHODS // ************************************************************************* private static boolean fileOutput = false; private static String edgeInputPath = null; private static String outputPath = null; private static boolean parseParameters(String [] args) { if(args.length > 0) { if(args.length != 2) { System.err.println("Usage JaccardSimilarityMeasure "); return false; } fileOutput = true; edgeInputPath = args[0]; outputPath = args[1]; } else { System.out.println("Executing JaccardSimilarityMeasure example with default parameters and built-in default data."); System.out.println("Provide parameters to read input data from files."); System.out.println("Usage JaccardSimilarityMeasure "); } return true; } private static DataSet> getEdgesDataSet(ExecutionEnvironment env) { if(fileOutput) { return env.readCsvFile(edgeInputPath) .ignoreComments("#") .fieldDelimiter("\t") .lineDelimiter("\n") .types(Long.class, Long.class) .map(new MapFunction, Edge>() { @Override public Edge map(Tuple2 tuple2) throws Exception { return new Edge(tuple2.f0, tuple2.f1, new Double(0)); } }); } else { return JaccardSimilarityMeasureData.getDefaultEdgeDataSet(env); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy