
// org.apache.flink.graph.library.Summarization (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.graph.library;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.functions.FunctionAnnotation;
import org.apache.flink.api.java.operators.GroupReduceOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.GraphAlgorithm;
import org.apache.flink.graph.Vertex;
import org.apache.flink.util.Collector;
/**
* The summarization algorithm computes a condensed version of the input graph
* by grouping vertices and edges based on their values. By doing this, the
* algorithm helps to uncover insights about patterns and distributions in the
* graph.
*
* In the resulting graph, each vertex represents a group of vertices that share the
* same vertex value. An edge, that connects a vertex with itself, represents all edges
* with the same edge value that connect vertices inside that group. An edge between
* vertices in the output graph represents all edges with the same edge value between
* members of those groups in the input graph.
*
* Consider the following example:
*
* Input graph:
*
* Vertices (id, value):
* (0, "A")
* (1, "A")
* (2, "B")
* (3, "B")
*
* Edges (source, target, value):
* (0,1, null)
* (1,0, null)
* (1,2, null)
* (2,1, null)
* (2,3, null)
* (3,2, null)
*
* Output graph:
*
* Vertices (id, (value, count)):
* (0, ("A", 2)) // 0 and 1
* (2, ("B", 2)) // 2 and 3
*
* Edges (source, target, (value, count)):
* (0, 0, (null, 2)) // (0,1) and (1,0)
* (2, 2, (null, 2)) // (2,3) and (3,2)
* (0, 2, (null, 1)) // (1,2)
* (2, 0, (null, 1)) // (2,1)
*
* Note that this implementation is non-deterministic in the way that it assigns
* identifiers to summarized vertices. However, it is guaranteed that the identifier
* is one of the represented vertex identifiers.
*
* @param vertex identifier type
* @param vertex value type
* @param edge value type
*/
public class Summarization
implements GraphAlgorithm, Summarization.EdgeValue>> {
@Override
public Graph, EdgeValue> run(Graph input) throws Exception {
// -------------------------
// build summarized vertices
// -------------------------
// group vertices by value
UnsortedGrouping> vertexUnsortedGrouping = input.getVertices()
.groupBy(1);
// reduce vertex group and create vertex group items
GroupReduceOperator, VertexGroupItem> vertexGroupItems = vertexUnsortedGrouping
.reduceGroup(new VertexGroupReducer());
// create summarized vertices
DataSet>> summarizedVertices = vertexGroupItems
.filter(new VertexGroupItemToSummarizedVertexFilter())
.map(new VertexGroupItemToSummarizedVertexMapper());
// create mapping between vertices and their representative
DataSet> vertexToRepresentativeMap = vertexGroupItems
.filter(new VertexGroupItemToRepresentativeFilter())
.map(new VertexGroupItemToVertexWithRepresentativeMapper());
// -------------------------
// build summarized edges
// -------------------------
// join edges with vertex representatives and update source and target identifiers
DataSet> edgesForGrouping = input.getEdges()
.join(vertexToRepresentativeMap)
.where(0) // source vertex id
.equalTo(0) // vertex id
.with(new SourceVertexJoinFunction())
.join(vertexToRepresentativeMap)
.where(1) // target vertex id
.equalTo(0) // vertex id
.with(new TargetVertexJoinFunction());
// create summarized edges
DataSet>> summarizedEdges = edgesForGrouping
.groupBy(0, 1, 2) // group by source id (0), target id (1) and edge value (2)
.reduceGroup(new EdgeGroupReducer());
return Graph.fromDataSet(summarizedVertices, summarizedEdges, input.getContext());
}
// --------------------------------------------------------------------------------------------
// Tuple Types
// --------------------------------------------------------------------------------------------
/**
* Value that is stored at a summarized vertex.
*
* f0: vertex group value
* f1: vertex group count
*
* @param vertex value type
*/
@SuppressWarnings("serial")
public static final class VertexValue extends Tuple2 {
public VV getVertexGroupValue() {
return f0;
}
public void setVertexGroupValue(VV vertexGroupValue) {
f0 = vertexGroupValue;
}
public Long getVertexGroupCount() {
return f1;
}
public void setVertexGroupCount(Long vertexGroupCount) {
f1 = vertexGroupCount;
}
}
/**
* Value that is stored at a summarized edge.
*
* f0: edge group value
* f1: edge group count
*
* @param edge value type
*/
@SuppressWarnings("serial")
public static final class EdgeValue extends Tuple2 {
public EV getEdgeGroupValue() {
return f0;
}
public void setEdgeGroupValue(EV edgeGroupValue) {
f0 = edgeGroupValue;
}
public Long getEdgeGroupCount() {
return f1;
}
public void setEdgeGroupCount(Long edgeGroupCount) {
f1 = edgeGroupCount;
}
}
/**
* Represents a single vertex in a vertex group.
*
* f0: vertex identifier
* f1: vertex group representative identifier
* f2: vertex group value
* f3: vertex group count
*
* @param vertex identifier type
* @param vertex group value type
*/
@SuppressWarnings("serial")
public static final class VertexGroupItem extends Tuple4 {
public VertexGroupItem() {
setVertexGroupCount(0L);
}
public K getVertexId() {
return f0;
}
public void setVertexId(K vertexId) {
f0 = vertexId;
}
public K getGroupRepresentativeId() {
return f1;
}
public void setGroupRepresentativeId(K groupRepresentativeId) {
f1 = groupRepresentativeId;
}
public VGV getVertexGroupValue() {
return f2;
}
public void setVertexGroupValue(VGV vertexGroupValue) {
f2 = vertexGroupValue;
}
public Long getVertexGroupCount() {
return f3;
}
public void setVertexGroupCount(Long vertexGroupCount) {
f3 = vertexGroupCount;
}
/**
* Resets the fields to initial values. This is necessary if the tuples are reused and not all fields were modified.
*/
public void reset() {
f0 = null;
f1 = null;
f2 = null;
f3 = 0L;
}
}
/**
* Represents a vertex identifier and its corresponding vertex group identifier.
*
* @param vertex identifier type
*/
@SuppressWarnings("serial")
public static final class VertexWithRepresentative extends Tuple2 {
public void setVertexId(K vertexId) {
f0 = vertexId;
}
public K getGroupRepresentativeId() {
return f1;
}
public void setGroupRepresentativeId(K groupRepresentativeId) {
f1 = groupRepresentativeId;
}
}
// --------------------------------------------------------------------------------------------
// Functions
// --------------------------------------------------------------------------------------------
/**
* Creates one {@link VertexGroupItem} for each group element containing the vertex identifier and the identifier
* of the group representative which is the first vertex in the reduce input iterable.
*
* Creates one {@link VertexGroupItem} representing the whole group that contains the vertex identifier of the
* group representative, the vertex group value and the total number of group elements.
*
* @param vertex identifier type
* @param vertex value type
*/
@SuppressWarnings("serial")
private static final class VertexGroupReducer
implements GroupReduceFunction, VertexGroupItem> {
private final VertexGroupItem reuseVertexGroupItem;
private VertexGroupReducer() {
this.reuseVertexGroupItem = new VertexGroupItem<>();
}
@Override
public void reduce(Iterable> values, Collector> out) throws Exception {
K vertexGroupRepresentativeID = null;
long vertexGroupCount = 0L;
VV vertexGroupValue = null;
boolean isFirstElement = true;
for (Vertex vertex : values) {
if (isFirstElement) {
// take final group representative vertex id from first tuple
vertexGroupRepresentativeID = vertex.getId();
vertexGroupValue = vertex.getValue();
isFirstElement = false;
}
// no need to set group value for those tuples
reuseVertexGroupItem.setVertexId(vertex.getId());
reuseVertexGroupItem.setGroupRepresentativeId(vertexGroupRepresentativeID);
out.collect(reuseVertexGroupItem);
vertexGroupCount++;
}
createGroupRepresentativeTuple(vertexGroupRepresentativeID, vertexGroupValue, vertexGroupCount);
out.collect(reuseVertexGroupItem);
reuseVertexGroupItem.reset();
}
/**
* Creates one tuple representing the whole group. This tuple is later used to create a summarized vertex for each
* group.
*
* @param vertexGroupRepresentativeId group representative vertex identifier
* @param vertexGroupValue group property value
* @param vertexGroupCount total group count
*/
private void createGroupRepresentativeTuple(K vertexGroupRepresentativeId,
VV vertexGroupValue,
Long vertexGroupCount) {
reuseVertexGroupItem.setVertexId(vertexGroupRepresentativeId);
reuseVertexGroupItem.setVertexGroupValue(vertexGroupValue);
reuseVertexGroupItem.setVertexGroupCount(vertexGroupCount);
}
}
/**
* Creates a summarized edge from a group of edges. Counts the number of elements in the group.
*
* @param vertex identifier type
* @param edge group value type
*/
@SuppressWarnings("serial")
private static final class EdgeGroupReducer
implements GroupReduceFunction, Edge>> {
private final Edge> reuseEdge;
private final EdgeValue reuseEdgeValue;
private EdgeGroupReducer() {
reuseEdge = new Edge<>();
reuseEdgeValue = new EdgeValue<>();
}
@Override
public void reduce(Iterable> values, Collector>> out) throws Exception {
K sourceVertexId = null;
K targetVertexId = null;
EV edgeGroupValue = null;
Long edgeGroupCount = 0L;
boolean isFirstElement = true;
for (Edge edge : values) {
if (isFirstElement) {
sourceVertexId = edge.getSource();
targetVertexId = edge.getTarget();
edgeGroupValue = edge.getValue();
isFirstElement = false;
}
edgeGroupCount++;
}
reuseEdgeValue.setEdgeGroupValue(edgeGroupValue);
reuseEdgeValue.setEdgeGroupCount(edgeGroupCount);
reuseEdge.setSource(sourceVertexId);
reuseEdge.setTarget(targetVertexId);
reuseEdge.setValue(reuseEdgeValue);
out.collect(reuseEdge);
}
}
/**
* Filter tuples that are representing a vertex group. They are used to create new summarized vertices and have a
* group count greater than zero.
*
* @param vertex identifier type
* @param vertex value type
*/
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFields("*->*")
private static final class VertexGroupItemToSummarizedVertexFilter
implements FilterFunction> {
@Override
public boolean filter(VertexGroupItem vertexGroupItem) throws Exception {
return !vertexGroupItem.getVertexGroupCount().equals(0L);
}
}
/**
* Filter tuples that are representing a single vertex. They are used to update the source and target vertex
* identifiers at the edges.
*
* @param vertex identifier type
* @param vertex value type
*/
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFields("*->*")
private static final class VertexGroupItemToRepresentativeFilter
implements FilterFunction> {
@Override
public boolean filter(VertexGroupItem vertexGroupItem) throws Exception {
return vertexGroupItem.getVertexGroupCount().equals(0L);
}
}
/**
* Creates a new vertex representing a vertex group. The vertex stores the group value and the number of vertices in
* the group.
*
* @param vertex identifier type
* @param vertex value type
*/
@SuppressWarnings("serial")
private static final class VertexGroupItemToSummarizedVertexMapper
implements MapFunction, Vertex>> {
private final VertexValue reuseSummarizedVertexValue;
private VertexGroupItemToSummarizedVertexMapper() {
reuseSummarizedVertexValue = new VertexValue<>();
}
@Override
public Vertex> map(VertexGroupItem value) throws Exception {
K vertexId = value.getVertexId();
reuseSummarizedVertexValue.setVertexGroupValue(value.getVertexGroupValue());
reuseSummarizedVertexValue.setVertexGroupCount(value.getVertexGroupCount());
return new Vertex<>(vertexId, reuseSummarizedVertexValue);
}
}
/**
* Creates a {@link VertexWithRepresentative} from a {@link VertexGroupItem}.
*
* @param vertex identifier type
* @param vertex value type
*/
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFields("f0;f1")
private static final class VertexGroupItemToVertexWithRepresentativeMapper
implements MapFunction, VertexWithRepresentative> {
private final VertexWithRepresentative reuseVertexWithRepresentative;
private VertexGroupItemToVertexWithRepresentativeMapper() {
reuseVertexWithRepresentative = new VertexWithRepresentative<>();
}
@Override
public VertexWithRepresentative map(VertexGroupItem vertexGroupItem) throws Exception {
reuseVertexWithRepresentative.setVertexId(vertexGroupItem.getVertexId());
reuseVertexWithRepresentative.setGroupRepresentativeId(vertexGroupItem.getGroupRepresentativeId());
return reuseVertexWithRepresentative;
}
}
/**
* Replaces the source vertex id with the vertex group representative id and adds the edge group value.
*
* @param vertex identifier type
* @param edge value type
*/
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFieldsFirst("f1") // edge target id
@FunctionAnnotation.ForwardedFieldsSecond("f1->f0") // vertex group id -> edge source id
private static final class SourceVertexJoinFunction
implements JoinFunction, VertexWithRepresentative, Edge> {
private final Edge reuseEdge;
private SourceVertexJoinFunction() {
this.reuseEdge = new Edge<>();
}
@Override
public Edge join(Edge edge, VertexWithRepresentative vertex) throws Exception {
reuseEdge.setSource(vertex.getGroupRepresentativeId());
reuseEdge.setTarget(edge.getTarget());
reuseEdge.setValue(edge.getValue());
return reuseEdge;
}
}
/**
* Replaces the target vertex id with the vertex group identifier.
*
* @param vertex identifier type
* @param edge group value type
*/
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFieldsFirst("f0;f2") // source vertex id, edge group value
@FunctionAnnotation.ForwardedFieldsSecond("f1") // vertex group id -> edge target id
private static final class TargetVertexJoinFunction
implements JoinFunction, VertexWithRepresentative, Edge> {
@Override
public Edge join(Edge edge,
VertexWithRepresentative vertexRepresentative) throws Exception {
edge.setTarget(vertexRepresentative.getGroupRepresentativeId());
return edge;
}
}
}