com.dataartisans.flinktraining.exercises.dataset_java.reply_graph.ReplyGraph Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of flink-training-exercises Show documentation
Utilities and material for an Apache Flink Training provided by data Artisans.
The newest version!
/*
* Copyright 2015 data Artisans GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dataartisans.flinktraining.exercises.dataset_java.reply_graph;
import com.dataartisans.flinktraining.dataset_preparation.MBoxParser;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.util.Collector;
/**
* Java reference implementation for the "Reply Graph" exercise of the Flink training.
* The task of the exercise is to enumerate the reply connection between two email addresses in
* Flink's developer mailing list and count the number of connections between two email addresses.
*
* Required parameters:
* --input path-to-input-directory
*/
public class ReplyGraph {
public static void main(String[] args) throws Exception {
// parse parameters
ParameterTool params = ParameterTool.fromArgs(args);
String input = params.getRequired("input");
// obtain an execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// read messageId, sender, and reply-to fields from the input data set
DataSet> mails =
env.readCsvFile(input)
.lineDelimiter(MBoxParser.MAIL_RECORD_DELIM)
.fieldDelimiter(MBoxParser.MAIL_FIELD_DELIM)
// messageId at position 0, sender at 2, reply-to at 5
.includeFields("101001")
.types(String.class, String.class, String.class);
// extract email addresses and filter out mails from bots
DataSet> addressMails = mails
.map(new EmailExtractor())
.filter(new ExcludeEmailFilter("[email protected]"))
.filter(new ExcludeEmailFilter("[email protected]"));
// construct reply connections by joining on messageId and reply-To
DataSet> replyConnections = addressMails
.join(addressMails).where(2).equalTo(0).projectFirst(1).projectSecond(1);
// count reply connections for each pair of email addresses
replyConnections
.groupBy(0, 1).reduceGroup(new ConnectionCounter())
.print();
}
/**
* Extracts the email address from the sender field
*/
public static class EmailExtractor implements MapFunction, Tuple3> {
@Override
public Tuple3 map(Tuple3 mail) throws Exception {
mail.f1 = mail.f1.substring(mail.f1.lastIndexOf("<") + 1, mail.f1.length() - 1);
return mail;
}
}
/**
* Filter records for a specific email address
*/
public static class ExcludeEmailFilter implements FilterFunction> {
private String filterEmail;
public ExcludeEmailFilter() {}
public ExcludeEmailFilter(String filterMail) {
this.filterEmail = filterMail;
}
@Override
public boolean filter(Tuple3 mail) throws Exception {
return !mail.f1.equals(filterEmail);
}
}
/**
* Count the number of records per group
*/
public static class ConnectionCounter implements GroupReduceFunction, Tuple3> {
Tuple3 outT = new Tuple3<>();
@Override
public void reduce(Iterable> cs, Collector> out) {
outT.f2 = 0;
for(Tuple2 c : cs) {
outT.f0 = c.f0;
outT.f1 = c.f1;
outT.f2 += 1;
}
out.collect(outT);
}
}
}