All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.stratosphere.examples.scala.relational.TPCHQuery3.scala Maven / Gradle / Ivy

/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.examples.scala.relational;

import eu.stratosphere.client.LocalExecutor
import eu.stratosphere.api.common.Program
import eu.stratosphere.api.common.ProgramDescription

import eu.stratosphere.api.scala._
import eu.stratosphere.api.scala.operators._

/**
 * Command-line entry point that builds the [[TPCHQuery3]] plan and runs it
 * with the `LocalExecutor`.
 *
 * Expected arguments, in order: number of sub-tasks, orders input path,
 * line-item input path, output path. With fewer than four arguments the
 * program's description string is printed and nothing is executed.
 */
object RunTPCHQuery3 {
  def main(args: Array[String]): Unit = {
    val tpch3 = new TPCHQuery3
    if (args.length < 4) {
      // Not enough arguments: print the usage text instead of running the job.
      println(tpch3.getDescription)
    } else {
      // args(0).toInt throws NumberFormatException if the first argument is not an integer.
      val plan = tpch3.getScalaPlan(args(0).toInt, args(1), args(2), args(3))
      LocalExecutor.execute(plan)
      // Explicit JVM exit once execution has returned (presumably so that
      // lingering executor threads do not keep the process alive -- TODO confirm).
      System.exit(0)
    }
  }
}

/**
 * The TPC-H is a decision support benchmark on relational data.
 * Its documentation and the data generator (DBGEN) can be found
 * on http://www.tpc.org/tpch/ .
 * This PACT program implements a modified version of TPC-H query 3,
 * consisting of one join, some filtering, and an aggregation:
 * SELECT l_orderkey, o_shippriority, sum(l_extendedprice) as revenue
 *   FROM orders, lineitem
 *   WHERE l_orderkey = o_orderkey
 *     AND o_orderstatus = "X"
 *     AND YEAR(o_orderdate) > Y
 *     AND o_orderpriority LIKE "Z%"
 *   GROUP BY l_orderkey, o_shippriority;
 */
class TPCHQuery3 extends Program with ProgramDescription with Serializable {

  /** Usage string listing the four positional parameters expected by [[getPlan]]. */
  override def getDescription(): String =
    "Parameters: [numSubTasks], [orders], [lineitem], [output]"

  /**
   * Builds the plan from positional string arguments.
   *
   * @param args `args(0)` = number of sub-tasks (parsed with `toInt`),
   *             `args(1)` = orders input, `args(2)` = line-item input,
   *             `args(3)` = output. Fewer than four arguments results in an
   *             index-out-of-bounds failure; a non-numeric `args(0)` in a
   *             `NumberFormatException`.
   */
  override def getPlan(args: String*): ScalaPlan =
    getScalaPlan(args(0).toInt, args(1), args(2), args(3))

  /**
   * Assembles the data flow: read orders and line items, filter the orders,
   * join them with the line items on the order id, and sum the revenue per
   * (orderId, shipPriority) group before writing the result.
   *
   * @param numSubTasks    default parallelism set on the returned plan
   * @param ordersInput    location of the orders input
   * @param lineItemsInput location of the line-item input
   * @param ordersOutput   location the aggregated result is written to
   * @param status         order status an order must have to pass the filter
   * @param minYear        orders must have a year strictly greater than this
   * @param priority       prefix the order priority must start with
   * @return the plan with `output` as its single sink
   */
  def getScalaPlan(
      numSubTasks: Int,
      ordersInput: String,
      lineItemsInput: String,
      ordersOutput: String,
      status: Char = 'F',
      minYear: Int = 1993,
      priority: String = "5"): ScalaPlan = {
    val orders = DataSource(ordersInput, DelimitedInputFormat(parseOrder))
    val lineItems = DataSource(lineItemsInput, DelimitedInputFormat(parseLineItem))

    val filteredOrders = orders filter { o => o.status == status && o.year > minYear && o.orderPriority.startsWith(priority) }
    val prioritizedItems = filteredOrders join lineItems where { _.orderId } isEqualTo { _.orderId } map { (o, li) => PrioritizedOrder(o.orderId, o.shipPriority, li.extendedPrice) }
    val prioritizedOrders = prioritizedItems groupBy { pi => (pi.orderId, pi.shipPriority) } reduceGroup { _ reduce addRevenues }

    val output = prioritizedOrders.write(ordersOutput, DelimitedOutputFormat(formatOutput))

    // Field-usage annotations on the operators (presumably hints consumed by
    // the optimizer -- confirm against the Stratosphere Scala API docs).
    filteredOrders observes { o => (o.status, o.year, o.orderPriority) }

    prioritizedItems.left neglects { o => o }
    prioritizedItems.left preserves ({ o => (o.orderId, o.shipPriority) }, { pi => (pi.orderId, pi.shipPriority) })

    prioritizedItems.right neglects { li => li }
    prioritizedItems.right preserves ({ li => li.extendedPrice }, { pi => pi.revenue })

    prioritizedOrders observes { po => po.revenue }
    prioritizedOrders preserves ({ pi => (pi.orderId, pi.shipPriority) }, { po => (po.orderId, po.shipPriority) })

    // Record-size / cardinality / key-uniqueness settings per operator
    // (presumably estimates for the optimizer -- TODO confirm).
    orders.avgBytesPerRecord(44).uniqueKey(_.orderId)
    lineItems.avgBytesPerRecord(28)
    filteredOrders.avgBytesPerRecord(44).avgRecordsEmittedPerCall(0.05f).uniqueKey(_.orderId)
    prioritizedItems.avgBytesPerRecord(32)
    prioritizedOrders.avgBytesPerRecord(32).avgRecordsEmittedPerCall(1)

    val plan = new ScalaPlan(Seq(output), "TPCH Query 3 (Immutable)")
    plan.setDefaultParallelism(numSubTasks)
    plan
  }

  /** The fields of an orders record that this query uses. */
  case class Order(orderId: Int, status: Char, year: Int, month: Int, day: Int, orderPriority: String, shipPriority: Int)

  /** The fields of a line-item record that this query uses. */
  case class LineItem(orderId: Int, extendedPrice: Double)

  /** Join/aggregation result: revenue attributed to an (orderId, shipPriority) pair. */
  case class PrioritizedOrder(orderId: Int, shipPriority: Int, revenue: Double)

  /** Combines two records by summing their revenues; all other fields are taken from `po1`. */
  def addRevenues(po1: PrioritizedOrder, po2: PrioritizedOrder): PrioritizedOrder =
    po1.copy(revenue = po1.revenue + po2.revenue)

  /**
   * Returns a parser for one orders line made of nine `|`-terminated fields.
   *
   * Captured fields: 1 = order id (digits), 3 = status (single character),
   * 5 = date as `yyyy-mm-dd`, 6 = order priority, 8 = ship priority (digits).
   * The returned function throws a `MatchError` when the whole line does not
   * match the pattern.
   */
  def parseOrder: String => Order = {
    // Compile the regex once per parser instead of once per input line.
    val OrderInputPattern = """(\d+)\|[^\|]+\|([^\|])\|[^\|]+\|(\d\d\d\d)-(\d\d)-(\d\d)\|([^\|]+)\|[^\|]+\|(\d+)\|[^\|]+\|""".r
    (line: String) => {
      val OrderInputPattern(orderId, status, year, month, day, oPr, sPr) = line
      // The status group matches exactly one character, so index 0 is always present.
      Order(orderId.toInt, status(0), year.toInt, month.toInt, day.toInt, oPr, sPr.toInt)
    }
  }

  /**
   * Returns a parser for one line-item line made of sixteen `|`-terminated fields.
   *
   * Captured fields: 1 = order id (digits), 6 = extended price (digits with
   * exactly two decimals). The returned function throws a `MatchError` when
   * the whole line does not match the pattern.
   */
  def parseLineItem: String => LineItem = {
    // Compile the regex once per parser instead of once per input line.
    val LineItemInputPattern = """(\d+)\|[^\|]+\|[^\|]+\|[^\|]+\|[^\|]+\|(\d+\.\d\d)\|[^\|]+\|[^\|]+\|[^\|]\|[^\|]\|[^\|]+\|[^\|]+\|[^\|]+\|[^\|]+\|[^\|]+\|[^\|]+\|""".r
    (line: String) => {
      val LineItemInputPattern(orderId, price) = line
      LineItem(orderId.toInt, price.toDouble)
    }
  }

  /**
   * Returns a formatter producing `orderId|shipPriority|revenue` with the
   * revenue rounded to two decimals.
   *
   * Formatting is pinned to `Locale.ROOT` so the decimal separator is always
   * `.`, matching the dot-decimal input accepted by `parseLineItem`, rather
   * than depending on the JVM's default locale.
   */
  def formatOutput: PrioritizedOrder => String =
    (item: PrioritizedOrder) =>
      "%d|%d|%.2f".formatLocal(java.util.Locale.ROOT, item.orderId, item.shipPriority, item.revenue)
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy