// eu.stratosphere.example.java.relational.WebLogAnalysis (Maven / Gradle / Ivy)
/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.example.java.relational;
import java.util.Iterator;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.ExecutionEnvironment;
import eu.stratosphere.api.java.functions.CoGroupFunction;
import eu.stratosphere.api.java.functions.FilterFunction;
import eu.stratosphere.api.java.tuple.Tuple1;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.tuple.Tuple3;
import eu.stratosphere.example.java.relational.util.WebLogData;
import eu.stratosphere.example.java.relational.util.WebLogDataGenerator;
import eu.stratosphere.util.Collector;
/**
* This program processes web logs and relational data.
* It implements the following relational query:
*
* <pre>{@code
* SELECT
*       r.pageURL,
*       r.pageRank,
*       r.avgDuration
* FROM documents d JOIN rankings r
*      ON d.url = r.url
* WHERE CONTAINS(d.text, [keywords])
*       AND r.rank > [rank]
*       AND NOT EXISTS
*           (
*              SELECT * FROM Visits v
*              WHERE v.destUrl = d.url
*                    AND v.visitDate < [date]
*           );
* }</pre>
*
*
* Input files are plain text CSV files using the pipe character ('|') as field separator.
* The tables referenced in the query can be generated using the {@link WebLogDataGenerator} and
* have the following schemas:
*
* <pre>{@code
* CREATE TABLE Documents (
*                url VARCHAR(100) PRIMARY KEY,
*                contents TEXT );
*
* CREATE TABLE Rankings (
*                pageRank INT,
*                pageURL VARCHAR(100) PRIMARY KEY,
*                avgDuration INT );
*
* CREATE TABLE Visits (
*                sourceIP VARCHAR(16),
*                destURL VARCHAR(100),
*                visitDate DATE,
*                adRevenue FLOAT,
*                userAgent VARCHAR(64),
*                countryCode VARCHAR(3),
*                languageCode VARCHAR(6),
*                searchWord VARCHAR(32),
*                duration INT );
* }</pre>
*
*
* This example shows how to use:
* <ul>
* <li> tuple data types
* <li> projection and join projection
* <li> the CoGroup transformation for an anti-join
* </ul>
*/
@SuppressWarnings("serial")
public class WebLogAnalysis {
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
parseParameters(args);
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// get input data
DataSet> documents = getDocumentsDataSet(env);
DataSet> ranks = getRanksDataSet(env);
DataSet> visits = getVisitsDataSet(env);
// Create DataSet for filtering the entries from the documents relation
DataSet> filterDocs = documents
.filter(new FilterDocs())
.project(0).types(String.class);
// Create DataSet for filtering the entries from the ranks relation
DataSet> filterRanks = ranks
.filter(new FilterRanks());
// Create DataSet for filtering the entries from the visits relation
DataSet> filterVisits = visits
.filter(new FilterVisits())
.project(0).types(String.class);
// Create DataSet to join the filtered documents and ranks relation
DataSet> joinDocsRanks =
filterDocs.join(filterRanks)
.where(0).equalTo(1)
.projectSecond(0,1,2)
.types(Integer.class, String.class, Integer.class);
// Create DataSet to realize a anti join between the joined
// documents and ranks relation and the filtered visits relation
DataSet> result =
joinDocsRanks.coGroup(filterVisits)
.where(1).equalTo(0)
.with(new AntiJoinVisits());
// emit result
if(fileOutput) {
result.writeAsCsv(outputPath, "\n", "|");
} else {
result.print();
}
// execute program
env.execute("WebLogAnalysis Example");
}
// *************************************************************************
// USER FUNCTIONS
// *************************************************************************
/**
* MapFunction that filters for documents that contain a certain set of
* keywords.
*/
public static class FilterDocs extends FilterFunction> {
private static final String[] KEYWORDS = { " editors ", " oscillations " };
/**
* Filters for documents that contain all of the given keywords and projects the records on the URL field.
*
* Output Format:
* 0: URL
* 1: DOCUMENT_TEXT
*/
@Override
public boolean filter(Tuple2 value) throws Exception {
// FILTER
// Only collect the document if all keywords are contained
String docText = value.f1;
for (String kw : KEYWORDS) {
if (!docText.contains(kw)) {
return false;
}
}
return true;
}
}
/**
* MapFunction that filters for records where the rank exceeds a certain threshold.
*/
public static class FilterRanks extends FilterFunction> {
private static final int RANKFILTER = 40;
/**
* Filters for records of the rank relation where the rank is greater
* than the given threshold.
*
* Output Format:
* 0: RANK
* 1: URL
* 2: AVG_DURATION
*/
@Override
public boolean filter(Tuple3 value) throws Exception {
return (value.f0 > RANKFILTER);
}
}
/**
* MapFunction that filters for records of the visits relation where the year
* (from the date string) is equal to a certain value.
*/
public static class FilterVisits extends FilterFunction> {
private static final int YEARFILTER = 2007;
/**
* Filters for records of the visits relation where the year of visit is equal to a
* specified value. The URL of all visit records passing the filter is emitted.
*
* Output Format:
* 0: URL
* 1: DATE
*/
@Override
public boolean filter(Tuple2 value) throws Exception {
// Parse date string with the format YYYY-MM-DD and extract the year
String dateString = value.f1;
int year = Integer.parseInt(dateString.substring(0,4));
return (year == YEARFILTER);
}
}
/**
* CoGroupFunction that realizes an anti-join.
* If the first input does not provide any pairs, all pairs of the second input are emitted.
* Otherwise, no pair is emitted.
*/
public static class AntiJoinVisits extends CoGroupFunction, Tuple1, Tuple3> {
/**
* If the visit iterator is empty, all pairs of the rank iterator are emitted.
* Otherwise, no pair is emitted.
*
* Output Format:
* 0: RANK
* 1: URL
* 2: AVG_DURATION
*/
@Override
public void coGroup(Iterator> ranks, Iterator> visits, Collector> out) {
// Check if there is a entry in the visits relation
if (!visits.hasNext()) {
while (ranks.hasNext()) {
// Emit all rank pairs
out.collect(ranks.next());
}
}
}
}
// *************************************************************************
// UTIL METHODS
// *************************************************************************
private static boolean fileOutput = false;
private static String documentsPath;
private static String ranksPath;
private static String visitsPath;
private static String outputPath;
private static void parseParameters(String[] args) {
if(args.length > 0) {
fileOutput = true;
if(args.length == 4) {
documentsPath = args[0];
ranksPath = args[1];
visitsPath = args[2];
outputPath = args[3];
} else {
System.err.println("Usage: WebLogAnalysis ");
System.exit(1);
}
} else {
System.out.println("Executing WebLog Analysis example with built-in default data.");
System.out.println(" Provide parameters to read input data from files.");
System.out.println(" See the documentation for the correct format of input files.");
System.out.println(" We provide a data generator to create synthetic input files for this program.");
System.out.println(" Usage: WebLogAnalysis ");
}
}
private static DataSet> getDocumentsDataSet(ExecutionEnvironment env) {
// Create DataSet for documents relation (URL, Doc-Text)
if(fileOutput) {
return env.readCsvFile(documentsPath)
.fieldDelimiter('|')
.types(String.class, String.class);
} else {
return WebLogData.getDocumentDataSet(env);
}
}
private static DataSet> getRanksDataSet(ExecutionEnvironment env) {
// Create DataSet for ranks relation (Rank, URL, Avg-Visit-Duration)
if(fileOutput) {
return env.readCsvFile(ranksPath)
.fieldDelimiter('|')
.types(Integer.class, String.class, Integer.class);
} else {
return WebLogData.getRankDataSet(env);
}
}
private static DataSet> getVisitsDataSet(ExecutionEnvironment env) {
// Create DataSet for visits relation (URL, Date)
if(fileOutput) {
return env.readCsvFile(visitsPath)
.fieldDelimiter('|')
.includeFields("011000000")
.types(String.class, String.class);
} else {
return WebLogData.getVisitDataSet(env);
}
}
}