All downloads are free. Search and download functionality uses the official Maven repository.

pigpen.cascading.core.clj Maven / Gradle / Ivy

The newest version!
;;  Copyright 2015 Netflix, Inc.
;;     Licensed under the Apache License, Version 2.0 (the "License");
;;     you may not use this file except in compliance with the License.
;;     You may obtain a copy of the License at
;;
;;         http://www.apache.org/licenses/LICENSE-2.0
;;
;;     Unless required by applicable law or agreed to in writing, software
;;     distributed under the License is distributed on an "AS IS" BASIS,
;;     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;;     See the License for the specific language governing permissions and
;;     limitations under the License.

(ns pigpen.cascading.core
  "Converts PigPen commands into Cascading pipes, taps and flow definitions."
  (:import (cascading.flow FlowDef FlowConnector)
           (cascading.operation Identity)
           (cascading.operation.filter Limit Sample)
           (cascading.pipe Pipe Each Every Merge GroupBy CoGroup)
           (cascading.pipe.assembly Unique Rename AggregateBy)
           (cascading.pipe.joiner BufferJoin MixedJoin)
           (cascading.scheme.hadoop TextLine)
           ;; The package name was missing from this import group, which made
           ;; the ns form invalid; AppProps lives in cascading.property.
           (cascading.property AppProps)
           (cascading.tap Tap)
           (cascading.tap.hadoop Hfs)
           (cascading.tuple Fields)
           (cascading.util NullNotEquivalentComparator)
           (pigpen.cascading PigPenFunction PigPenAggregateBy
                             ReduceBuffer GroupBuffer
                             RankBuffer InduceSentinelNils))
  (:require [pigpen.raw :as raw]
            [schema.core :as s]
            [pigpen.model :as m]
            [pigpen.extensions.core :refer [zip]]))

;; Emit compile-time warnings for reflective interop calls in this namespace.
(set! *warn-on-reflection* true)

;; Registers a "PigPen:<version>" framework label with Cascading's AppProps.
;; The version is read from the package of PigPenFunction.
;; NOTE(review): this form was truncated after `(some-> PigPenFunction`. The
;; two accessor calls and the "unknown" fallback below are a reconstruction;
;; confirm the exact fallback string against the upstream source.
(AppProps/addApplicationFramework nil
  (str "PigPen:"
       (or (some-> PigPenFunction
             .getPackage
             .getImplementationVersion)
           "unknown")))

(defn cfields
  "Builds a Cascading Fields instance from a non-empty collection of field
  names. Each name is converted with `str`. Fails the precondition when
  `fields` is nil or empty."
  ^Fields [fields]
  {:pre [(seq fields)]}
  (->> fields
    (map str)
    (into-array String)
    ;; The final constructor step (and the closing parens) were missing, which
    ;; left the form unterminated and the ^Fields return hint unsatisfied.
    (Fields.)))

(defn group-key-cfields
  "Returns a Fields[] with one single-field Fields per grouping key.

  When `join-nils?` is truthy the Fields are returned untouched. Otherwise a
  NullNotEquivalentComparator is installed on each key field (presumably so
  that nil keys from different relations are not treated as equal)."
  #^"[Lcascading.tuple.Fields;" [keys join-nils?]
  (->> keys
    (map (fn [key]
           (let [^Fields fields (cfields [key])]
             (if join-nils?
               ;; The then-branch and the `.setComparator` call head were
               ;; missing, leaving the `if` and `doto` malformed.
               fields
               (doto fields
                 ;; side effect
                 (.setComparator
                   ^Comparable (str key)
                   ^Comparator (NullNotEquivalentComparator.)))))))
    (into-array Fields)))

(defn pipe-array
  "Copies a collection of pipes into a Java Pipe[]; the return value is
  type-hinted as a Pipe array."
  #^"[Lcascading.pipe.Pipe;" [pipes]
  (->> pipes
    (into-array Pipe)))

;; TODO use quoting instead of pr-str
(defn prepare-expr
  "Prepares a projection expression for serialization. :field expressions are
  returned unchanged; :code expressions get their :init and :func values
  replaced by their pr-str representation. Any other :type has no matching
  case clause and throws."
  [expr]
  (let [stringify (fn [e k] (update-in e [k] pr-str))]
    (case (:type expr)
      :code (reduce stringify expr [:init :func])
      :field expr)))

(defn prepare-projection
  "Returns projection `p` with its :expr run through prepare-expr."
  [p]
  (let [prepared (prepare-expr (:expr p))]
    (assoc p :expr prepared)))

(defn prepare-projections
  "Applies prepare-projection to every projection in `ps`, returning a vector."
  [ps]
  (vec (map prepare-projection ps)))

;; can these always be used for both load and store?
;; Builds a tap for a command; implementations are selected by the command's
;; :storage value.
(defmulti get-tap
  (fn [command] (:storage command)))

;; :string storage - an Hfs tap over a TextLine scheme at `location`. The
;; scheme's fields come from the command's :fields, falling back to :args.
(defmethod get-tap :string [{:keys [^String location fields args]}]
  (let [^Fields scheme-fields (cfields (or fields args))
        scheme (TextLine. scheme-fields)]
    (Hfs. scheme location)))

;; :tap storage - the tap is supplied directly under :tap in the command's opts.
(defmethod get-tap :tap [{:keys [opts]}]
  (:tap opts))

;; Fallback for storage kinds with no get-tap implementation.
;; get-tap dispatches on :storage, so that is the value reported. The previous
;; message interpolated the command's :type, which does not identify the
;; unsupported storage kind.
(defmethod get-tap :default [{:keys [storage]}]
  (throw (Exception. (str "Unrecognized tap type: " storage))))

;; ******* Commands ********

(defmulti command->flowdef
  "Converts an individual command into the equivalent Cascading flow definition.
  Takes the command, its resolved ancestors and the flowdef; dispatches on the
  command's :type."
  (fn [command _ancestors _flowdef]
    (:type command)))

;; :load - creates a pipe named after the command id, renames the tap's source
;; fields to the command's fields, registers the tap as a source on the
;; flowdef, and returns the pipe.
;; Restored the ignored ancestors parameter and the trailing `pipe` return,
;; both of which were missing and left the form unbalanced.
(s/defmethod command->flowdef :load
  [{:keys [id fields], :as command} :- m/Load
   _
   ^FlowDef flowdef]
  (let [^Tap tap (get-tap command)
        pipe (->
               (Pipe. (str id))
               (Rename. (.getSourceFields tap) (cfields fields)))]
    ;; side effect
    (.addSource flowdef pipe tap)
    pipe))

;; :store - registers the command's tap as a tail sink for the incoming pipe.
;; Restored the `if-let` else branch and the closing forms, which were missing.
;; NOTE(review): the final return value (`pipe`) is a reconstruction; confirm
;; against upstream.
(s/defmethod command->flowdef :store
  [command :- m/Store
   [{:keys [^Pipe pipe ancestor]}]
   ^FlowDef flowdef]
  (let [^Tap sink (get-tap command)
        fields (:fields ancestor)
        ; The tap needs the incoming field name to match (without the namespace
        ; added to all field symbols).
        pipe (if-let [tap-fields (seq (map symbol (.getSinkFields sink)))]
               (Rename. pipe (cfields fields) (cfields tap-fields))
               ;; the sink declares no explicit fields: pass the pipe through
               pipe)]
    ;; side effect
    (.addTailSink flowdef pipe sink)
    pipe))

;; :reduce - groups the incoming pipe on no fields (Fields/NONE).
;; Restored the ignored flowdef parameter that closes the argument vector.
(s/defmethod command->flowdef :reduce
  [command :- m/Reduce
   [{:keys [^Pipe pipe]}]
   _]
  (GroupBy. pipe Fields/NONE))

;; :reduce-fold - applies a PigPenAggregateBy built from the fold's projections
;; (grouping on Fields/NONE), then renames the first projection's arg fields
;; to its alias.
;; Restored the ignored flowdef parameter and the `->` head that threads the
;; aggregate into Rename; both were missing.
(s/defmethod command->flowdef :reduce-fold
  [{:keys [reduce :- m/Reduce
           fold :- m/Project]}
   [{:keys [^Pipe pipe]}]
   _]
  (let [projections (:projections fold)
        context (pr-str `'{:projections ~(prepare-projections projections)})
        old-fields (cfields (get-in projections [0 :expr :args]))
        new-fields (cfields (get-in projections [0 :alias]))]
    (->
      (PigPenAggregateBy. context pipe Fields/NONE old-fields)
      (Rename. old-fields new-fields))))

;; :group - co-groups all ancestor pipes on the group keys with a BufferJoin.
;; Restored the `ancestors` and ignored flowdef parameters and the
;; `map-indexed` call that supplies the index `i`; all three were missing.
(s/defmethod command->flowdef :group
  [{:keys [id keys fields join-types opts]} :- m/Group
   ancestors
   _]
  (let [join-keys (group-key-cfields keys (:join-nils opts))
        pipes (pipe-array
                (if (:join-nils opts)
                  (map :pipe ancestors)
                  ;; This is to induce pigpen's nil-joining behavior, which
                  ;; treats nils from the same relation as equal
                  (->> ancestors
                    (map-indexed
                      (fn [i a]
                        (let [fields (-> a :ancestor :fields cfields)]
                          (Each. ^Pipe (:pipe a)
                                 (InduceSentinelNils. (int i) fields))))))))]
    (CoGroup. (str id) pipes join-keys Fields/NONE (BufferJoin.))))

;; :group-fold - like :group, but relations whose projection is a :fold udf are
;; pre-aggregated with PigPenAggregateBy before the co-group.
;; Restored the `ancestors` and ignored flowdef parameters and the `if` else
;; branch (pass the pipe through unchanged); all were missing.
(s/defmethod command->flowdef :group-fold
  [{:keys [group :- m/Group
           fold :- m/Project]}
   ancestors
   _]
  (let [{:keys [id keys]} group
        join-keys (group-key-cfields keys (get-in group [:opts :join-nils]))
        ;; Perform all fold aggregations before co-grouping the relations
        pipes (pipe-array
                ;; the first projection is skipped; the rest are paired
                ;; positionally with the ancestors and keys
                (zip [p (-> fold :projections next)
                      {:keys [pipe]} ancestors
                      key keys]
                  (if (some-> p :expr :udf #{:fold})
                    (let [context (pr-str `'{:projections ~(prepare-projections [p])})
                          group-fields (cfields [key])
                          arg-fields (cfields (get-in p [:expr :args]))]
                      (PigPenAggregateBy. context pipe group-fields arg-fields))
                    pipe)))]
    (CoGroup. (str id) pipes join-keys Fields/NONE (BufferJoin.))))

;; :join - co-groups the ancestor pipes on the join keys using a MixedJoin.
;; Each entry of join-types maps to true when it is :required and false
;; otherwise.
;; Restored the `ancestors` and ignored flowdef parameters and the terminal
;; steps of both threading forms (pipe-array, boolean-array, MixedJoin.),
;; which were missing and left both bindings unterminated.
(s/defmethod command->flowdef :join
  [{:keys [id keys fields join-types opts]} :- m/Join
   ancestors
   _]
  (let [pipes (->> ancestors
                (map :pipe)
                (pipe-array))
        joiner (->> join-types
                 (map (comp boolean #{:required}))
                 (boolean-array)
                 (MixedJoin.))
        join-keys (group-key-cfields keys (:join-nils opts))]
    (CoGroup. (str id) pipes join-keys (cfields fields) joiner)))

;; :project - the operation applied depends on the type of the ancestor:
;;   :reduce               -> Every + ReduceBuffer
;;   :group / :group-fold  -> Every + GroupBuffer
;;   anything else         -> Each + PigPenFunction
;; Restored pieces that were missing: the ignored flowdef parameter, the
;; `:reduce` case constant, `first` in `args`, the `:fold`/`:projections` steps
;; of both `some->>` chains, and the closing step of `folds`.
;; NOTE(review): `(set)` as the last step of `folds` is a reconstruction;
;; confirm against upstream.
(s/defmethod command->flowdef :project
  [{:keys [id projections fields]} :- m/Project
   [{:keys [^Pipe pipe ancestor]}]
   _]
  (case (:type ancestor)
    :reduce
    (let [context {:func   (prepare-projection (first projections))
                   :fields fields}]
      (Every. pipe (ReduceBuffer. (pr-str `'~context) (cfields fields)) Fields/RESULTS))

    (:group :group-fold)
    (let [;; the list of args required by this projection
          args (-> projections
                 first
                 (get-in [:expr :args]))

          ;; a list of which fields are required. This is to compute inner/outer groups
          required (mapcat (fn [a j] (when (= j :required) [a]))
                           (next args)
                           (:join-types ancestor))

          ;; folds add an extra layer of indirection to field names; this resolves it
          rename-fields (some->> ancestor
                          :fold
                          :projections
                          (map (fn [p]
                                 [(-> p :alias first)
                                  (or (-> p :expr :field)
                                      (-> p :expr :args first))]))
                          (into {}))

          ;; This exists because we use this for both fold and non-fold co-groupings.
          ;; For relations that have been folded already, there will only be one value,
          ;; so we take the first. This identifies which relations have been folded.
          folds (some->> ancestor
                  :fold
                  :projections
                  (filter (comp #{:fold} :udf :expr))
                  (map (comp first :alias))
                  (map rename-fields)
                  (set))

          context {:args          args
                   :required      required
                   :rename-fields rename-fields
                   :folds         folds
                   :func          (prepare-projection (first projections))
                   :fields        fields}]

      (Every. pipe (GroupBuffer. (pr-str `'~context) (cfields fields)) Fields/RESULTS))

    ;; default: a plain projection over a non-grouped ancestor
    (let [field-projections (filter (comp #{:field} :type :expr) projections)
          funcs (filter (comp #{:code} :type :expr) projections)
          context {:field-projections field-projections
                   :func              (prepare-projection (first funcs))
                   :fields            fields}]

      (when (some :flatten field-projections)
        (throw (ex-info "Cascading doesn't support flattened projection fields"
                        {:fields fields})))

      (when (next funcs)
        (throw (ex-info "Cascading doesn't support multiple projection funcs"
                        {:funcs funcs})))

      (when-not (:flatten (first funcs))
        (throw (ex-info "Cascading doesn't support scalar funcs"
                        {:func (first funcs)})))

      (-> (Pipe. (str id) pipe)
        (Each. (PigPenFunction. (pr-str `'~context) (cfields fields)))))))

;; :distinct - applies Unique over all fields, then renames the ancestor's
;; fields to this command's fields.
;; Restored the ignored flowdef parameter that closes the argument vector.
(s/defmethod command->flowdef :distinct
  [{:keys [fields]} :- m/Distinct
   [{:keys [^Pipe pipe ancestor]}]
   _]
  (-> pipe
    (Unique. Fields/ALL)
    (Rename. (cfields (:fields ancestor)) (cfields fields))))

;; :take - applies a Limit filter built from `n`, then renames the ancestor's
;; fields to this command's fields.
;; Restored the ignored flowdef parameter that closes the argument vector.
(s/defmethod command->flowdef :take
  [{:keys [n fields]} :- m/Take
   [{:keys [^Pipe pipe ancestor]}]
   _]
  (-> pipe
    (Each. (Limit. n))
    (Rename. (cfields (:fields ancestor)) (cfields fields))))

;; :sample - applies a Sample filter built from `p`, then renames the
;; ancestor's fields to this command's fields.
;; Restored the ignored flowdef parameter that closes the argument vector.
(s/defmethod command->flowdef :sample
  [{:keys [p fields]} :- m/Sample
   [{:keys [^Pipe pipe ancestor]}]
   _]
  (-> pipe
    (Each. (Sample. p))
    (Rename. (cfields (:fields ancestor)) (cfields fields))))

;; :concat - renames every ancestor pipe's fields to this command's fields and
;; combines the renamed pipes with Merge.
;; Restored the `ancestors` and ignored flowdef parameters and the final
;; `Merge.` step; without them the form was unbalanced and referenced an
;; unbound `ancestors`.
(s/defmethod command->flowdef :concat
  [{:keys [fields]} :- m/Concat
   ancestors
   _]
  (->> ancestors
    (map (fn [{:keys [^Pipe pipe ancestor]}]
           (Rename. pipe (cfields (:fields ancestor)) (cfields fields))))
    (into-array Pipe)
    (Merge.)))

;; :sort - groups on no fields while sorting on `key` (reversed when `comp` is
;; :desc), renames all but the first ancestor field to this command's fields,
;; and keeps only those fields.
;; Restored the ignored flowdef parameter that closes the argument vector.
(s/defmethod command->flowdef :sort
  [{:keys [key comp fields]} :- m/Sort
   [{:keys [^Pipe pipe ancestor]}]
   _]
  (let [reverse-order? (= :desc comp)]
    (-> pipe
      (GroupBy. Fields/NONE (cfields [key]) reverse-order?)
      ;; TODO is there a way to rename and select a single field at the same time?
      (Rename. (cfields (next (:fields ancestor))) (cfields fields))
      (Each. (cfields fields) (Identity.) Fields/RESULTS))))

;; :rank - groups on no fields and runs a RankBuffer over the group.
;; Restored the ignored flowdef parameter that closes the argument vector.
(s/defmethod command->flowdef :rank
  [{:keys [id ancestors fields]} :- m/Rank
   [{:keys [^Pipe pipe]}]
   _]
  ; TODO: In this naive, single-reducer implementation, a rank followed by a
  ; sort should skip the sort since rank does a group-by itself.
  (-> pipe
    (GroupBy. Fields/NONE)
    (Every. (RankBuffer. (cfields fields)) Fields/RESULTS)))

;; :store-many - contributes no pipe. The body and closing paren were missing;
;; restored as an explicit nil.
(s/defmethod command->flowdef :store-many
  [_ _ _]
  ; No-op, since the flowdef already contains everything needed to handle multiple outputs.
  nil)

;; :noop - wraps the incoming pipe in a new pipe named after the command id and
;; renames the ancestor's fields to this command's fields.
;; Restored the ignored flowdef parameter and the `->` head that threads the
;; new Pipe into Rename; both were missing.
(s/defmethod command->flowdef :noop
  [{:keys [id fields]} :- m/NoOp
   [{:keys [^Pipe pipe ancestor]}]
   _]
  (->
    (Pipe. (str id) pipe)
    (Rename. (cfields (:fields ancestor)) (cfields fields))))

;; Fallback for command types with no Cascading translation: always throws.
(defmethod command->flowdef :default
  [{:keys [type]} _ _]
  (let [message (str "Command " type " not implemented yet for Cascading!")]
    (throw (Exception. message))))

(defn command->flowdef+
  "Reducing step over commands. Looks up the entries recorded for the
  command's ancestors, converts the command with command->flowdef, and records
  the result under the command's id as {:ancestor command, :pipe pipe}.
  Returns the [flowdef pipes] pair with the updated pipes map."
  [[flowdef pipes] {:keys [id ancestors], :as command}]
  (let [inputs (map #(pipes %) ancestors)
        result (command->flowdef command inputs flowdef)
        entry {:ancestor command, :pipe result}]
    [flowdef (assoc pipes id entry)]))

(defn commands->flow
  "Transforms a series of commands into a Cascading flow: folds the commands
  into a fresh FlowDef with command->flowdef+ and connects the result using
  the given connector."
  [^FlowConnector connector commands]
  (let [initial [(FlowDef/flowDef) {}]
        flowdef (first (reduce command->flowdef+ initial commands))]
    (.connect connector flowdef)))

© 2015 - 2024 Weber Informatics LLC | Privacy Policy