com.dataartisans.flinktraining.exercises.table_java.memberotm.MemberOTMonth Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of flink-training-exercises Show documentation
Show all versions of flink-training-exercises Show documentation
Utilities and material for an Apache Flink Training provided by data Artisans.
The newest version!
/*
* Copyright 2015 data Artisans GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dataartisans.flinktraining.exercises.table_java.memberotm;
import com.dataartisans.flinktraining.dataset_preparation.MBoxParser;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.table.BatchTableEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.api.table.Row;
import org.apache.flink.api.table.Table;
import org.apache.flink.api.table.TableEnvironment;
/**
* Java reference implementation for the "Member of the Month" exercise of the Flink training.
* The task of the exercise is to identify for each month the email address that sent the most
* emails to the Flink developer mailing list.
*
* Required parameters:
* --input path-to-input-directory
*
*/
public class MemberOTMonth {
public static void main(String[] args) throws Exception {
// parse parameters
ParameterTool params = ParameterTool.fromArgs(args);
String input = params.getRequired("input");
// obtain an execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// read the "time" and "sender" fields of the input data set (field 2 and 3) as Strings
DataSet> mails =
env.readCsvFile(input)
.lineDelimiter(MBoxParser.MAIL_RECORD_DELIM)
.fieldDelimiter(MBoxParser.MAIL_FIELD_DELIM)
.includeFields("011")
.types(String.class, String.class);
DataSet> monthSender = mails
// extract the month from the time field and the email address from the sender field
.map(new MonthEmailExtractor());
BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
Table mailsPerSenderMonth = tEnv
// to table
.fromDataSet(monthSender, "month, sender")
// filter out bot email addresses
.filter("sender !== '[email protected]' && " +
"sender !== '[email protected]' && " +
"sender !== '[email protected]'")
// count emails per month and email address
.groupBy("month, sender").select("month, sender, month.count as cnt");
Table membersOTMonth = mailsPerSenderMonth
// find max number of emails sent by an address per month
.groupBy("month").select("month as m, cnt.max as max")
// find email address that sent the most emails in each month
.join(mailsPerSenderMonth).where("month = m && cnt = max").select("month, sender");
// print out result
tEnv.toDataSet(membersOTMonth, Row.class).print();
}
public static class MonthEmailExtractor implements MapFunction, Tuple2> {
@Override
public Tuple2 map(Tuple2 mail) throws Exception {
// extract year and month from time string
String month = mail.f0.substring(0, 7);
// extract email address from the sender
String email = mail.f1.substring(mail.f1.lastIndexOf("<") + 1, mail.f1.length() - 1);
return new Tuple2<>(month, email);
}
}
}