eu.stratosphere.example.java.relational.RelationalQuery Maven / Gradle / Ivy
/***********************************************************************************************************************
*
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*
**********************************************************************************************************************/
package eu.stratosphere.example.java.relational;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.ExecutionEnvironment;
import eu.stratosphere.api.java.aggregation.Aggregations;
import eu.stratosphere.api.java.functions.FilterFunction;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.tuple.Tuple3;
import eu.stratosphere.api.java.tuple.Tuple5;
/**
* This program implements the following relational query on the TPC-H data set.
*
*
*
* SELECT l_orderkey, o_shippriority, sum(l_extendedprice) as revenue
* FROM orders, lineitem
* WHERE l_orderkey = o_orderkey
* AND o_orderstatus = "X"
* AND YEAR(o_orderdate) > Y
* AND o_orderpriority LIKE "Z%"
* GROUP BY l_orderkey, o_shippriority;
*
*
*
* Input files are plain text CSV files using the pipe character ('|') as field separator
* as generated by the TPC-H data generator which is available at http://www.tpc.org/tpch/.
*
*
* Usage: RelationalQuery <orders-csv path> <lineitem-csv path> <result path>
*
*
* This example shows how to use:
*
* - tuple data types
*
- inline-defined functions
*
- projection and join projection
*
- build-in aggregation functions
*
*/
@SuppressWarnings("serial")
public class RelationalQuery {
// *************************************************************************
// PROGRAM
// *************************************************************************
private static String STATUS_FILTER = "F";
private static int YEAR_FILTER = 1993;
private static String OPRIO_FILTER = "5";
public static void main(String[] args) throws Exception {
if(!parseParameters(args)) {
return;
}
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// get orders data set: (orderkey, orderstatus, orderdate, orderpriority, shippriority)
DataSet> orders = getOrdersDataSet(env);
// get lineitem data set: (orderkey, extendedprice)
DataSet> lineitems = getLineitemDataSet(env);
// orders filtered by year: (orderkey, custkey)
DataSet> ordersFilteredByYear =
// filter orders
orders.filter(
new FilterFunction>() {
@Override
public boolean filter(Tuple5 t) {
// status filter
if(!t.f1.equals(STATUS_FILTER)) {
return false;
// year filter
} else if(Integer.parseInt(t.f2.substring(0, 4)) <= YEAR_FILTER) {
return false;
// order priority filter
} else if(!t.f3.startsWith(OPRIO_FILTER)) {
return false;
}
return true;
}
})
// project fields out that are no longer required
.project(0,4).types(Integer.class, Integer.class);
// join orders with lineitems: (orderkey, shippriority, extendedprice)
DataSet> lineitemsOfOrders =
ordersFilteredByYear.joinWithHuge(lineitems)
.where(0).equalTo(0)
.projectFirst(0,1).projectSecond(1)
.types(Integer.class, Integer.class, Double.class);
// extendedprice sums: (orderkey, shippriority, sum(extendedprice))
DataSet> priceSums =
// group by order and sum extendedprice
lineitemsOfOrders.groupBy(0,1).aggregate(Aggregations.SUM, 2);
// emit result
priceSums.writeAsCsv(outputPath);
// execute program
env.execute("Relational Query Example");
}
// *************************************************************************
// UTIL METHODS
// *************************************************************************
private static String ordersPath;
private static String lineitemPath;
private static String outputPath;
private static boolean parseParameters(String[] programArguments) {
if(programArguments.length > 0) {
if(programArguments.length == 3) {
ordersPath = programArguments[0];
lineitemPath = programArguments[1];
outputPath = programArguments[2];
} else {
System.err.println("Usage: RelationalQuery ");
return false;
}
} else {
System.err.println("This program expects data from the TPC-H benchmark as input data.\n" +
" Due to legal restrictions, we can not ship generated data.\n" +
" You can find the TPC-H data generator at http://www.tpc.org/tpch/.\n" +
" Usage: RelationalQuery ");
return false;
}
return true;
}
private static DataSet> getOrdersDataSet(ExecutionEnvironment env) {
return env.readCsvFile(ordersPath)
.fieldDelimiter('|')
.includeFields("101011010")
.types(Integer.class, String.class, String.class, String.class, Integer.class);
}
private static DataSet> getLineitemDataSet(ExecutionEnvironment env) {
return env.readCsvFile(lineitemPath)
.fieldDelimiter('|')
.includeFields("1000010000000000")
.types(Integer.class, Double.class);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy