;; Licensed to the Apache Software Foundation (ASF) under one or more
;; contributor license agreements. See the NOTICE file distributed
;; with this work for additional information regarding copyright
;; ownership. The ASF licenses this file to you under the Apache
;; License, Version 2.0 (the "License"); you may not use this file
;; except in compliance with the License. You may obtain a copy of
;; the License at http://www.apache.org/licenses/LICENSE-2.0 Unless
;; required by applicable law or agreed to in writing, software
;; distributed under the License is distributed on an "AS IS" BASIS,
;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
;; implied. See the License for the specific language governing
;; permissions and limitations under the License.
(ns pallet.crate.hadoop
"Pallet crate to manage Hadoop installation and configuration."
(:use [pallet.extensions :only (def-phase-fn phase-fn expose-request-as)])
(:require [pallet.script.lib :as lib]
[pallet.thread-expr :as thread]
[pallet.parameter :as parameter]
[pallet.stevedore :as stevedore]
[pallet.compute :as compute]
[pallet.session :as session]
[pallet.action.directory :as directory]
[pallet.action.exec-script :as exec-script]
[pallet.action.file :as file]
[pallet.action.remote-directory :as remote-directory]
[pallet.action.remote-file :as remote-file]
[pallet.action.user :as user]
[pallet.script :as script]
[clojure.contrib.prxml :as prxml]
[clojure.string :as string]
[clojure.contrib.logging :as log]
[pallet.crate.ssh-key :as ssh-key]
[pallet.crate.java :as java])
(:import [java.io StringReader StringWriter]
[javax.xml.transform TransformerFactory OutputKeys]
[javax.xml.transform.stream StreamSource StreamResult]))
;; #### General Utilities
;;
;; This one is generally quite useful, and may end up in stevedore.
(defn format-exports
"Formats `export` lines for inclusion in a shell script."
[& kv-pairs]
(string/join
(for [[k v] (partition 2 kv-pairs)]
(format "export %s=%s\n" (name k) v))))
;; ## Hadoop Configuration
;;
;; This crate contains all information required to set up and
;; configure a fully functional installation of Apache's Hadoop. It
;; seems that the biggest roadblock potential hadoop adopters face is
;; the confounding, terrible multiplicity of possible
;; configurations. Tom White, in the wonderful [Hadoop: The Definitive
;; Guide](http://goo.gl/nPWWk), states the case well: "Hadoop has a
;; bewildering number of configuration properties".
;;
;; We aim to provide sane, intelligent defaults that adjust themselves
;; based on a given cluster's size and particular distribution of
;; machines.
;;
;; The following phases are informed to some degree by Hadoop's
;; official [Getting Started](http://goo.gl/Bh4zU) page... I have to
;; say, though, Michael G. Noll's [single node](http://goo.gl/8ogSk)
;; and [multiple node](http://goo.gl/NIWoK) hadoop cluster tutorials
;; were immensely helpful.
;; ### Hadoop Defaults
;;
;; For this first version of the crate, we chose to lock down a few
;; parameters that'll be customizable down the road. (We've got a few
;; ideas on how to provide default overrides in a clean way, using
;; environments. Stay tuned!) In particular, we lock down version,
;; Hadoop's final location, and the name of the hadoop user and group
;; to be installed on each machine in the cluster.
(defn versioned-home
"Default Hadoop location, based on version number."
[version]
(format "/usr/local/hadoop-%s" version))
(def default-version "0.20.2")
(def hadoop-home (versioned-home default-version))
(def hadoop-user "hadoop")
(def hadoop-group "hadoop")
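;; For reference, with the defaults above:
;;
;;     (versioned-home default-version)
;;     ;; => "/usr/local/hadoop-0.20.2"
;;
;; which is also the value bound to `hadoop-home`.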
;; ### User Creation
;;
;; For the various nodes of a hadoop cluster to communicate with one
;; another, they need to share a common user with common
;; permissions. Something to keep in mind when manually logging in to
;; nodes -- hadoop java processes will run as the `hadoop` user, so
;; calls to `jps` as anyone else will show nothing running. If you'd
;; like to run a test job, ssh into the machine and run
;;
;; `sudo su - hadoop`
;;
;; before interacting with hadoop.
(def-phase-fn create-hadoop-user
"Create a hadoop user on a cluster node. We add the hadoop binary
directory and a `JAVA_HOME` setting to `$PATH` to facilitate
development when manually logged in to some particular node."
[]
(user/group hadoop-group :system true)
(user/user hadoop-user
:system true
:create-home true
:shell :bash)
(remote-file/remote-file (format "/home/%s/.bash_profile" hadoop-user)
:owner hadoop-user
:group hadoop-group
:literal true
:content (format-exports
:JAVA_HOME (stevedore/script (~java/java-home))
:PATH (format "$PATH:%s/bin" hadoop-home))))
;; Once the hadoop user is created, we create an ssh key for that user
;; and share it around the cluster. The jobtracker needs passwordless
;; ssh access into every cluster node running a task tracker, so that
;; it can distribute the data processing code that these machines need
;; to do anything useful.
(defn- get-node-ids-for-group
"Get the id of the nodes in a group node"
[request tag]
(let [nodes (session/nodes-in-group request tag)]
(map compute/id nodes)))
(defn- get-keys-for-group
"Returns the ssh key for a user in a group"
[request tag user]
(for [node (get-node-ids-for-group request tag)]
(parameter/get-for request [:host (keyword node)
:user (keyword user)
:id_rsa])))
(defn- authorize-key
[request local-user group remote-user]
(let [keys (get-keys-for-group request group remote-user)]
(thread/for-> request [key keys]
(ssh-key/authorize-key local-user key))))
(def-phase-fn authorize-groups
"Authorizes the master node to ssh into this node."
[local-users tag-remote-users-map]
(for [local-user local-users
[group remote-users] tag-remote-users-map
remote-user remote-users]
(authorize-key local-user group remote-user)))
;; In the current iteration, the `publish-ssh-key` phase should only be
;; called on the jobtracker, and will only work with a subsequent
;; `authorize-tag` phase (run with the jobtracker's tag) on the same
;; request. Pallet is stateless between transactions, and the ssh key
;; needs some way to get between nodes. Currently, we store the new ssh
;; key in the request.
(def-phase-fn publish-ssh-key
[]
(expose-request-as
[request]
(let [id (session/target-id request)
tag (session/group-name request)
key-name (format "%s_%s_key" tag id)]
(ssh-key/generate-key hadoop-user :comment key-name)
(ssh-key/record-public-key hadoop-user))))
(def-phase-fn authorize-tag
"configures all nodes to accept passwordless ssh requests from the
node with the supplied tag."
[master-tag]
(let [tag (name master-tag)]
(authorize-groups [hadoop-user] {tag [hadoop-user]})))
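;; As a sketch of how these two phases fit together (the `:jobtracker`
;; group name here is hypothetical): the jobtracker's group runs
;; `publish-ssh-key`, and every group in the cluster then runs
;; `authorize-tag` with the jobtracker's tag on the same request:
;;
;;     ;; phase applied to the jobtracker group:
;;     publish-ssh-key
;;
;;     ;; phase applied to every group in the cluster:
;;     (authorize-tag :jobtracker)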
;; ### Installation
;;
;; `url` generates a download link for either Cloudera's CDH3
;; distribution of Hadoop or the stock Apache build.
;;
;; TODO -- allow a version to be supplied for the Apache build; for
;; Cloudera, the version string doesn't mean much.
(defn url
"Returns the download URL for the supplied Hadoop build (`:cloudera`
or `:apache`), generated using the default Hadoop version."
[build]
(case build
:cloudera (format
"http://archive.cloudera.com/cdh/3/hadoop-%s-cdh3u0.tar.gz" default-version)
:apache (format
"http://www.apache.org/dist/hadoop/core/hadoop-%s/hadoop-%s.tar.gz"
default-version default-version)))
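;; Both branches interpolate `default-version`, so with the current
;; default of "0.20.2":
;;
;;     (url :cloudera)
;;     ;; => "http://archive.cloudera.com/cdh/3/hadoop-0.20.2-cdh3u0.tar.gz"
;;
;;     (url :apache)
;;     ;; => "http://www.apache.org/dist/hadoop/core/hadoop-0.20.2/hadoop-0.20.2.tar.gz"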
(def-phase-fn install
"First phase to be called when configuring a hadoop cluster. This
phase creates a common hadoop user, and downloads and unpacks the
default Cloudera hadoop distribution."
[build]
(let [url (url build)]
create-hadoop-user
(remote-directory/remote-directory hadoop-home
:url url
:unpack :tar
:tar-options "xz"
:owner hadoop-user
:group hadoop-group)))
;; ### Configuration
;;
;; Hadoop has three main configuration files, each of which is a
;; series of key-value pairs stored as an XML file. Before cluster
;; configuration, we need some way to pretty-print human readable XML
;; representing the configuration properties that we'll store in a
;; clojure map.
(defn ppxml
"Accepts an XML string with no newline formatting and returns the
same XML with pretty-print formatting, as described by Nurullah Akaya
in [this post](http://goo.gl/Y9OVO)."
[xml-str]
(let [in (StreamSource. (StringReader. xml-str))
out (StreamResult. (StringWriter.))
transformer (.newTransformer
(TransformerFactory/newInstance))]
(doseq [[prop val] {OutputKeys/INDENT "yes"
OutputKeys/METHOD "xml"
"{http://xml.apache.org/xslt}indent-amount" "2"}]
(.setOutputProperty transformer prop val))
(.transform transformer in out)
(str (.getWriter out))))
(defn property->xml
"Returns a nested sequence representing the XML for a hadoop
configuration property. if `final?` is true, `true`
is added to the XML entry, preventing any hadoop job from overriding
the property."
[property final?]
[:property (filter
identity
[[:name {} (name (key property))]
[:value {} (val property)]
(when final?
[:final {} "true"])])])
(declare final-properties)
(defn properties->xml
"Converts a map of [property value] entries into a string of XML
with pretty-print formatting."
[properties]
(ppxml
(with-out-str
(prxml/prxml
[:decl! {:version "1.0"}]
[:configuration
(map
#(property->xml % (final-properties (key %)))
properties)]))))
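;; For example, `:fs.trash.interval` is one of the `final-properties`
;; below, so (with the exact whitespace depending on the JDK's
;; transformer) a call like
;;
;;     (properties->xml {:fs.trash.interval 1440})
;;
;; produces roughly:
;;
;;     <?xml version="1.0" encoding="UTF-8"?>
;;     <configuration>
;;       <property>
;;         <name>fs.trash.interval</name>
;;         <value>1440</value>
;;         <final>true</final>
;;       </property>
;;     </configuration>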
;; ### Sane Defaults
;;
;; As mentioned before, Hadoop configuration can be a bit
;; bewildering. Default values and descriptions of the meaning of each
;; setting can be found here:
;;
;; http://hadoop.apache.org/core/docs/r0.20.0/mapred-default.html
;; http://hadoop.apache.org/core/docs/r0.20.0/hdfs-default.html
;; http://hadoop.apache.org/core/docs/r0.20.0/core-default.html
;;
;; We override a number of these below based on suggestions found in
;; various posts. We'll supply more information on the justification
;; for each of these as we move forward with our dynamic "sane
;; defaults" system.
(defn default-properties
"Returns a nested map of Hadoop default configuration properties,
named according to the 0.20 api."
[name-node-ip job-tracker-ip pid-dir log-dir]
(let [owner-dir (stevedore/script (~lib/user-home ~hadoop-user))
owner-subdir (partial str owner-dir)]
{:hdfs-site {:dfs.data.dir (owner-subdir "/dfs/data")
:dfs.name.dir (owner-subdir "/dfs/name")
:dfs.datanode.du.reserved 1073741824
:dfs.namenode.handler.count 10
:dfs.permissions.enabled true
:dfs.replication 3
:dfs.datanode.max.xcievers 4096}
:mapred-site {:tasktracker.http.threads 46
:mapred.local.dir (owner-subdir "/mapred/local")
:mapred.system.dir "/hadoop/mapred/system"
:mapred.child.java.opts "-Xmx550m"
:mapred.job.tracker (format "%s:8021" job-tracker-ip)
:mapred.job.tracker.handler.count 10
:mapred.map.tasks.speculative.execution true
:mapred.reduce.tasks.speculative.execution false
:mapred.reduce.parallel.copies 10
:mapred.reduce.tasks 5
:mapred.submit.replication 10
:mapred.tasktracker.map.tasks.maximum 2
:mapred.tasktracker.reduce.tasks.maximum 1
:mapred.compress.map.output true
:mapred.output.compression.type "BLOCK"}
:core-site {:fs.checkpoint.dir (owner-subdir "/dfs/secondary")
:fs.default.name (format "hdfs://%s:8020" name-node-ip)
:fs.trash.interval 1440
:io.file.buffer.size 65536
:hadoop.tmp.dir "/tmp/hadoop"
:hadoop.rpc.socket.factory.class.default "org.apache.hadoop.net.StandardSocketFactory"
:hadoop.rpc.socket.factory.class.ClientProtocol ""
:hadoop.rpc.socket.factory.class.JobSubmissionProtocol ""
:io.compression.codecs (str
"org.apache.hadoop.io.compress.DefaultCodec,"
"org.apache.hadoop.io.compress.GzipCodec")}
:hadoop-env {:HADOOP_PID_DIR pid-dir
:HADOOP_LOG_DIR log-dir
:HADOOP_SSH_OPTS "\"-o StrictHostKeyChecking=no\""
:HADOOP_OPTS "\"-Djava.net.preferIPv4Stack=true\""}}))
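;; As a quick check of the shape of this map (addresses and directories
;; here are hypothetical; the `dir` entries are stevedore script
;; fragments, so evaluate this in a context where stevedore can expand
;; scripts):
;;
;;     (let [props (default-properties "10.0.0.10" "10.0.0.11"
;;                                     "/var/run/hadoop" "/var/log/hadoop")]
;;       [(get-in props [:core-site :fs.default.name])
;;        (get-in props [:mapred-site :mapred.job.tracker])])
;;     ;; => ["hdfs://10.0.0.10:8020" "10.0.0.11:8021"]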
;; Final properties are properties that can't be overridden during the
;; execution of a job. We're not sure that these are the right
;; properties to lock, as of now -- this will become more clear as we
;; move forward with sane defaults. In the meantime, any suggestions
;; would be much appreciated.
(def final-properties
#{:dfs.block.size
:dfs.data.dir
:dfs.datanode.du.reserved
:dfs.datanode.handler.count
:dfs.hosts
:dfs.hosts.exclude
:dfs.name.dir
:dfs.namenode.handler.count
:dfs.permissions
:fs.checkpoint.dir
:fs.trash.interval
:hadoop.tmp.dir
:mapred.child.ulimit
:mapred.job.tracker.handler.count
:mapred.local.dir
:mapred.tasktracker.map.tasks.maximum
:mapred.tasktracker.reduce.tasks.maximum
:tasktracker.http.threads
:hadoop.rpc.socket.factory.class.default
:hadoop.rpc.socket.factory.class.ClientProtocol
:hadoop.rpc.socket.factory.class.JobSubmissionProtocol})
(def-phase-fn config-files
"Accepts a base directory and a map of [config-filename,
property-map] pairs, and augments the supplied request to allow for
the creation of each referenced configuration file within the base
directory."
[config-dir properties]
(binding [remote-file/force-overwrite true]
(for [[filename props] properties]
(remote-file/remote-file
(format "%s/%s.xml" config-dir (name filename))
:content (properties->xml props)
:owner hadoop-user :group hadoop-group))))
(def merge-config (partial merge-with merge))
(defn merge-and-split-config
"Merges a set of custom hadoop configuration option maps into the
current defaults, and returns a 2-vector where the first item is a
map of *-site files, and the second item is a map of exports for
`hadoop-env.sh`. If a conflict exists, entries in `new-props` knock
out entries in `default-props`."
[default-props new-props]
(let [prop-map (merge-config default-props new-props)
corekey-seq [:core-site :hdfs-site :mapred-site]
envkey-seq [:hadoop-env]]
(map #(select-keys prop-map %) [corekey-seq envkey-seq])))
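;; For example, overriding one default and supplying one export (all
;; values hypothetical):
;;
;;     (merge-and-split-config
;;      {:mapred-site {:mapred.reduce.tasks 5}
;;       :hadoop-env  {:HADOOP_LOG_DIR "/var/log/hadoop"}}
;;      {:mapred-site {:mapred.reduce.tasks 20}})
;;     ;; => ({:mapred-site {:mapred.reduce.tasks 20}}
;;     ;;     {:hadoop-env {:HADOOP_LOG_DIR "/var/log/hadoop"}})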
(def-phase-fn env-file
"Phase that creates the `hadoop-env.sh` file with references to the
supplied pid and log dirs. `hadoop-env.sh` will be placed within the
supplied config directory."
[config-dir env-map]
(for [[fname exports] env-map
:let [fname (name fname)
export-seq (flatten (seq exports))]]
(remote-file/remote-file
(format "%s/%s.sh" config-dir fname)
:content (apply format-exports export-seq))))
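;; Given the `:hadoop-env` defaults above (directory values
;; hypothetical), the resulting `hadoop-env.sh` would contain lines
;; such as:
;;
;;     export HADOOP_PID_DIR=/var/run/hadoop
;;     export HADOOP_LOG_DIR=/var/log/hadoop
;;     export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"
;;     export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true"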
;; We do our development on local machines using `vmfest`, which
;; brought us face to face with the next problem. Some clouds --
;; Amazon's EC2, for example -- require nodes to be configured with
;; private IP addresses. Hadoop is designed for use within private
;; clusters, so this is typically the right choice. Sometimes,
;; however, public IP addresses are preferable, as in a virtual
;; machine setup.
;;
;; Hadoop takes the IP addresses in `fs.default.name` and
;; `mapred.job.tracker` and performs a reverse DNS lookup, tracking
;; each machine by its hostname. If your cluster isn't set up to
;; handle DNS lookup, you might run into some interesting issues. On
;; these VMs, for example, reverse DNS lookups by the virtual machines
;; on each other caused every VM to resolve to the hostname of my home
;; router. This can cause jobs to limp along or fail mysteriously. We
;; have a workaround involving the `/etc/hosts` file planned for a
;; future iteration.
(defn get-master-ip
"Returns the IP address of a particular type of master node,
as defined by tag. IP-type can be `:private` or `:public`. Function
logs a warning if more than one master exists."
[request ip-type tag]
{:pre [(contains? #{:public :private} ip-type)]}
(let [[master :as nodes] (session/nodes-in-group request tag)
kind (name tag)]
(when (> (count nodes) 1)
(log/warn (format "There is more than one %s" kind)))
(if-not master
(log/error (format "There is no %s defined!" kind))
((case ip-type
:private compute/private-ip
:public compute/primary-ip) master))))
(def-phase-fn configure
"Configures a Hadoop cluster by creating all required default
directories, and populating the proper configuration file
options. The `properties` parameter must be a map of the form
{:core-site {:key val...}
:hdfs-site {:key val ...}
:mapred-site {:key val ...}
:hadoop-env {:export val ...}}
No other top-level keys are supported at this time."
[ip-type namenode-tag jobtracker-tag properties]
(expose-request-as
[request]
(let [conf-dir (str hadoop-home "/conf")
etc-conf-dir (stevedore/script
(str (~lib/config-root) "/hadoop"))
nn-ip (get-master-ip request ip-type namenode-tag)
jt-ip (get-master-ip request ip-type jobtracker-tag)
pid-dir (stevedore/script (str (~lib/pid-root) "/hadoop"))
log-dir (stevedore/script (str (~lib/log-root) "/hadoop"))
defaults (default-properties nn-ip jt-ip pid-dir log-dir)
[props env] (merge-and-split-config defaults properties)
tmp-dir (get-in properties [:core-site :hadoop.tmp.dir])]
(for [path [conf-dir tmp-dir log-dir pid-dir]]
(directory/directory path
:owner hadoop-user
:group hadoop-group
:mode "0755"))
(file/symbolic-link conf-dir etc-conf-dir)
(config-files conf-dir props)
(env-file conf-dir env))))
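;; A hypothetical invocation, assuming groups tagged `:namenode` and
;; `:jobtracker` and a couple of property overrides:
;;
;;     (configure :private :namenode :jobtracker
;;                {:mapred-site {:mapred.reduce.tasks 20}
;;                 :hadoop-env {:HADOOP_HEAPSIZE "2000"}})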
;; The following script runs a command as the supplied user, with
;; hadoop's required `JAVA_HOME` environment variable set.
(script/defscript as-user [user & command])
(script/defimpl as-user :default [user & command]
(su -s "/bin/bash" ~user
-c "\"" (str "export JAVA_HOME=" (~java/java-home) ";") ~@command "\""))
(script/defimpl as-user [#{:yum}] [user & command]
("/sbin/runuser" -s "/bin/bash" - ~user -c ~@command))
;; Hadoop services, or `roles`, are all run by the `hadoop-daemon.sh`
;; command. Other scripts exist, such as `hadoop-daemons.sh` (for
;; running commands on many nodes at once), but pallet takes over for
;; a good number of these. The following `phase-fn` takes care to only
;; start a hadoop service that's not already running for the `hadoop`
;; user. Future iterations may provide the ability to force some
;; daemon service to restart.
(def-phase-fn hadoop-service
"Run a Hadoop service"
[hadoop-daemon description]
(exec-script/exec-checked-script
(str "Start Hadoop " description)
(~as-user
~hadoop-user
~(stevedore/script
(if-not (pipe (jps)
(grep "-i" ~hadoop-daemon))
((str ~hadoop-home "/bin/hadoop-daemon.sh")
"start"
~hadoop-daemon))))))
(def-phase-fn hadoop-command
"Runs '$ hadoop `args`' on each machine in the request. Command runs
as the hadoop user."
[& args]
(exec-script/exec-checked-script
(apply str "hadoop " (interpose " " args))
(~as-user
~hadoop-user
(str ~hadoop-home "/bin/hadoop")
~@args)))
;; `format-hdfs` is, effectively, a call to
;;
;; `(hadoop-command "namenode" "-format")`
;;
;; That call would only work the first time, however. On subsequent
;; format requests, hadoop tells the user that the namenode has already
;; been formatted, and asks for confirmation. The current version of
;; `format-hdfs` sends a default "N" every time.
(def-phase-fn format-hdfs
"Formats HDFS for the first time. If HDFS has already been
formatted, does nothing."
[]
(exec-script/exec-script
(~as-user ~hadoop-user
(pipe
(echo "N")
((str ~hadoop-home "/bin/hadoop")
"namenode"
"-format")))))
;; And, here we are at the end! The following five functions activate
;; each of the five distinct roles that hadoop nodes may take on.
(def-phase-fn name-node
"Collection of all subphases required for a namenode."
[data-dir]
format-hdfs
(hadoop-service "namenode" "Name Node")
(hadoop-command "dfsadmin" "-safemode" "wait")
(hadoop-command "fs" "-mkdir" data-dir)
(hadoop-command "fs" "-chmod" "+w" data-dir))
(def-phase-fn secondary-name-node []
(hadoop-service "secondarynamenode" "secondary name node"))
(def-phase-fn job-tracker []
(hadoop-service "jobtracker" "job tracker"))
(def-phase-fn data-node []
(hadoop-service "datanode" "data node"))
(def-phase-fn task-tracker []
(hadoop-service "tasktracker" "task tracker"))