/*
 * Copyright 2015 data Artisans GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dataartisans.flinktraining.exercises.dataset_java.tf_idf;

import com.dataartisans.flinktraining.dataset_preparation.MBoxParser;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Java reference implementation for the "TF-IDF" exercise of the Flink training.
 * The task of the exercise is to compute the TF-IDF score for words in mails of the
 * Apache Flink developer mailing list archive.
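 *
 * The score of a word in a mail is computed as tf * (N / df), where tf is the
 * word's frequency in the mail, df is the number of mails containing the word,
 * and N is the total number of mails. Note that this variant applies no
 * logarithm to the IDF term.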
 *
 * Required parameters:
 *   --input path-to-input-directory
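 *   (the directory is expected to contain the mail archive in the record
 *   format written by MBoxParser)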
 *
 */
public class MailTFIDF {

	public final static String[] STOP_WORDS = {
			"the", "i", "a", "an", "at", "are", "am", "for", "and", "or", "is",
			"there", "it", "this", "that", "on", "was", "by", "of", "to", "in",
			"message", "not", "be", "with", "you", "have", "as", "can"
	};

	public static void main(String[] args) throws Exception {

		// parse parameters
		ParameterTool params = ParameterTool.fromArgs(args);
		String input = params.getRequired("input");

		// obtain execution environment
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// read messageId and body fields from the input data set
		DataSet<Tuple2<String, String>> mails =
				env.readCsvFile(input)
						.lineDelimiter(MBoxParser.MAIL_RECORD_DELIM)
						.fieldDelimiter(MBoxParser.MAIL_FIELD_DELIM)
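						// field mask "10001": keep the 1st and 5th fields (messageId and body), skip the rest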
						.includeFields("10001")
						.types(String.class, String.class);

		// count the number of mails
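		// (count() triggers an eager job execution; the result is the N of the IDF term)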
		long mailCount = mails.count();

		// compute for each word the number of mails it is contained in (DF)
		DataSet<Tuple2<String, Integer>> docFrequency = mails
				// extract unique words from mails
				.flatMap(new UniqueWordExtractor(STOP_WORDS))
				// count the number of mails that contain each word
				.groupBy(0).sum(1);

		// compute the frequency of words within each mail (TF)
		DataSet<Tuple3<String, String, Integer>> termFrequency = mails
				.flatMap(new TFComputer(STOP_WORDS));

		// compute the TF-IDF score for each word-mail pair
		DataSet<Tuple3<String, String, Double>> tfIdf = docFrequency
				// join TF and DF on word fields
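				// key 0 of docFrequency is the word; key 1 of termFrequency is the word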
				.join(termFrequency).where(0).equalTo(1)
					// compute TF-IDF
					.with(new TfIdfComputer(mailCount));

		// print result
		tfIdf.print();
	}

	/**
	 * Extracts all unique words from the mail body.
	 * Words consist only of alphabetical characters. Frequent words (stop words) are filtered out.
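	 * Emits one (word, 1) pair per distinct word of a mail.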
	 */
	public static class UniqueWordExtractor extends RichFlatMapFunction<Tuple2<String, String>, Tuple2<String, Integer>> {

		// set of stop words
		private Set<String> stopWords;
		// set of emitted words
		private transient Set<String> emittedWords;
		// pattern to match against words
		private transient Pattern wordPattern;
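		// (transient fields are not shipped when the function is serialized;
		// they are re-created in open() on each parallel task instance)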

		public UniqueWordExtractor() {
			this.stopWords = new HashSet<>();
		}

		public UniqueWordExtractor(String[] stopWords) {
			// setup stop words set
			this.stopWords = new HashSet<>();
			Collections.addAll(this.stopWords, stopWords);
		}

		@Override
		public void open(Configuration config) {
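			// open() is called once per parallel task instance before any call to flatMap()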
			// init set and word pattern
			this.emittedWords = new HashSet<>();
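			// the pattern accepts only tokens that consist entirely of letters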
			this.wordPattern = Pattern.compile("(\\p{Alpha})+");
		}

		@Override
		public void flatMap(Tuple2<String, String> mail, Collector<Tuple2<String, Integer>> out) throws Exception {

			// clear set of emitted words
			this.emittedWords.clear();
			// split body along whitespaces into tokens
			StringTokenizer st = new StringTokenizer(mail.f1);

			// for each word candidate
			while(st.hasMoreTokens()) {
				// normalize to lower case
				String word = st.nextToken().toLowerCase();
				Matcher m = this.wordPattern.matcher(word);
				if(m.matches() && !this.stopWords.contains(word) && !this.emittedWords.contains(word)) {
					// candidate matches word pattern, is not a stop word, and was not emitted before
					out.collect(new Tuple2<>(word, 1));
					this.emittedWords.add(word);
				}
			}
		}
	}

	/**
	 * Computes the frequency of words in a mail's body.
	 * Words consist only of alphabetical characters. Frequent words (stop words) are filtered out.
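	 * Emits one (messageId, word, frequency) triple per distinct word of a mail.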
	 */
	public static class TFComputer extends RichFlatMapFunction<Tuple2<String, String>, Tuple3<String, String, Integer>> {

		// set of stop words
		private Set<String> stopWords;
		// map to count the frequency of words
		private transient Map<String, Integer> wordCounts;
		// pattern to match against words
		private transient Pattern wordPattern;

		public TFComputer() {
			this.stopWords = new HashSet<>();
		}

		public TFComputer(String[] stopWords) {
			// initialize stop words
			this.stopWords = new HashSet<>();
			Collections.addAll(this.stopWords, stopWords);
		}

		@Override
		public void open(Configuration config) {
			// initialize map and pattern
			this.wordPattern = Pattern.compile("(\\p{Alpha})+");
			this.wordCounts = new HashMap<>();
		}

		@Override
		public void flatMap(Tuple2<String, String> mail, Collector<Tuple3<String, String, Integer>> out) throws Exception {

			// clear count map
			this.wordCounts.clear();

			// tokenize mail body along whitespaces
			StringTokenizer st = new StringTokenizer(mail.f1);
			// for each candidate word
			while(st.hasMoreTokens()) {
				// normalize to lower case
				String word = st.nextToken().toLowerCase();
				Matcher m = this.wordPattern.matcher(word);
				if(m.matches() && !this.stopWords.contains(word)) {
					// word matches pattern and is not a stop word -> increase word count
					int count = this.wordCounts.getOrDefault(word, 0);
					this.wordCounts.put(word, count + 1);
				}
			}

			// emit all counted words
			for(Map.Entry<String, Integer> entry : this.wordCounts.entrySet()) {
				out.collect(new Tuple3<>(mail.f0, entry.getKey(), entry.getValue()));
			}
		}
	}

	/**
	 * Compute the TF-IDF score for a word in a mail by combining TF, DF, and total mail count.
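	 * For example, a word that occurs 3 times in a mail and appears in 10 of
	 * 1,000 mails scores 3 * (1000 / 10) = 300.0.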
	 */
	public static class TfIdfComputer implements JoinFunction<Tuple2<String, Integer>, Tuple3<String, String, Integer>, Tuple3<String, String, Double>> {

		private double mailCount;

		public TfIdfComputer() {}

		public TfIdfComputer(long mailCount) {
			this.mailCount = (double)mailCount;
		}

		@Override
		public Tuple3<String, String, Double> join(Tuple2<String, Integer> docFreq, Tuple3<String, String, Integer> termFreq) throws Exception {
			// compute TF-IDF
			return new Tuple3<>(
					termFreq.f0, // messageID
					termFreq.f1, // word
					termFreq.f2 * (mailCount / docFreq.f1) // TF-IDF
			);
		}
	}
}
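
// Hypothetical invocation via the Flink CLI (jar name and input path are examples only):
//   flink run -c com.dataartisans.flinktraining.exercises.dataset_java.tf_idf.MailTFIDF \
//     flink-training-exercises.jar --input /path/to/parsed-mail-archive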