com.indix.pail.PailMigrate.scala
package com.indix.pail
import java.io.IOException
import java.util
import _root_.util.DateHelper
import com.backtype.hadoop.pail.SequenceFileFormat.SequenceFilePailInputFormat
import com.backtype.hadoop.pail.{PailOutputFormat, PailRecordInfo, PailStructure}
import com.backtype.support.Utils
import com.indix.pail.PailMigrate._
import com.twitter.scalding.Args
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, Text}
import org.apache.hadoop.mapred._
import org.apache.hadoop.util.{Tool, ToolRunner}
import org.apache.log4j.Logger
import org.joda.time.DateTime
class PailMigrate extends Tool {
  val logger = Logger.getLogger(this.getClass)
  var configuration: Configuration = _

  /*
   * Takes an input pail location, an output pail location and an output pail spec:
   * - Sets up a job to process the input pail location
   * - Deserializes each record
   * - Writes to the output location using the output spec
   *   - If the output dir already exists, records are appended to it instead of being written to a temp dir and absorbed
   * - Finally, clears out all processed files (source removal can be disabled via the keep-source flag)
   *
   * OutputFormat - PailOutputFormat - needs a PailSpec
   * */
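
  /*
   * Example invocation (hypothetical jar name and paths), via the standard
   * `hadoop jar` launcher and the PailMigrateUtil entry point defined below:
   *
   *   hadoop jar pail-migrate.jar com.indix.pail.PailMigrateUtil \
   *     --input-dir /data/pail/old \
   *     --output-dir /data/pail/new \
   *     --target-pail-spec com.example.MyPailStructure \
   *     --record-type com.example.MyRecord
   *
   * Pass the --keep-source flag to retain the input pail after migration.
   */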
  override def run(arguments: Array[String]): Int = {
    val args = Args(arguments)
    val inputDir = args("input-dir")
    val outputDir = args("output-dir")
    val targetSpecClass = args("target-pail-spec")
    val recordType = args("record-type")
    // Load the record class up front so a missing type fails fast
    val recordClass = Class.forName(recordType)
    val keepSourceFiles = args.boolean("keep-source")
    // newInstance() requires the spec class to have a no-arg constructor
    val targetPailStructure = Class.forName(targetSpecClass).newInstance().asInstanceOf[PailStructure[Any]]

    val jobConf = new JobConf(getConf)
    // FIXME Make pool and priority configurable
    jobConf.setJobName("Pail Migration job (from one scheme to another)")
    jobConf.set("mapred.fairscheduler.pool", "hadoop")
    jobConf.setJobPriority(JobPriority.VERY_HIGH)

    val path: Path = new Path(inputDir)
    val fs = path.getFileSystem(getConf)
    if (!fs.exists(path)) {
      logger.warn("Input directory not found. It may already have been migrated, or the path is invalid.")
      return 0
    }
    jobConf.setInputFormat(classOf[SequenceFilePailInputFormat])
    FileInputFormat.addInputPath(jobConf, path)
    jobConf.setMapOutputKeyClass(classOf[Text])
    jobConf.setMapOutputValueClass(classOf[BytesWritable])

    jobConf.setOutputFormat(classOf[PailOutputFormat])
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDir))

    // Ship the target pail structure to the mappers via the job configuration
    Utils.setObject(jobConf, PailMigrate.OUTPUT_STRUCTURE, targetPailStructure)

    jobConf.setMapperClass(classOf[PailMigrateMapper])
    jobConf.setReducerClass(classOf[PailMigrateReducer])
    jobConf.setNumReduceTasks(200)
    jobConf.setJarByClass(this.getClass)

    val job = new JobClient(jobConf).submitJob(jobConf)
    logger.info(s"Pail Migrate triggered for $inputDir")
    logger.info("Submitted job " + job.getID)

    // Poll every 30 seconds until the job completes
    while (!job.isComplete) {
      Thread.sleep(30 * 1000)
    }

    if (!job.isSuccessful) throw new IOException("Pail Migrate failed")

    if (!keepSourceFiles) {
      logger.info(s"Deleting path $inputDir")
      val deleteStatus = fs.delete(path, true)
      if (!deleteStatus)
        logger.warn(s"Deleting $inputDir failed. \n *** Please delete the source manually ***")
      else
        logger.info(s"Deleting $inputDir completed successfully.")
    }

    0 // return success; failures throw an exception anyway!
  }
  override def getConf: Configuration = configuration

  override def setConf(configuration: Configuration): Unit = this.configuration = configuration
}
object PailMigrate {
  val OUTPUT_STRUCTURE = "pail.migrate.output.structure"

  class PailMigrateMapper extends Mapper[PailRecordInfo, BytesWritable, Text, BytesWritable] {
    var outputPailStructure: PailStructure[Any] = _

    override def map(key: PailRecordInfo, value: BytesWritable, outputCollector: OutputCollector[Text, BytesWritable], reporter: Reporter): Unit = {
      // Deserialize the record only to compute its target directory in the new pail;
      // the raw bytes are passed through unchanged.
      val record = outputPailStructure.deserialize(value.getBytes)
      val outputKey = new Text(Utils.join(outputPailStructure.getTarget(record), "/"))
      outputCollector.collect(outputKey, value)
    }

    override def close(): Unit = {}

    override def configure(jobConf: JobConf): Unit = {
      outputPailStructure = Utils.getObject(jobConf, OUTPUT_STRUCTURE).asInstanceOf[PailStructure[Any]]
    }
  }

  class PailMigrateReducer extends Reducer[Text, BytesWritable, Text, BytesWritable] {
    override def close(): Unit = {}

    override def configure(jobConf: JobConf): Unit = {}

    // Identity reduce: emit every record under its target key unchanged
    override def reduce(key: Text, iterator: util.Iterator[BytesWritable], outputCollector: OutputCollector[Text, BytesWritable], reporter: Reporter): Unit = {
      while (iterator.hasNext)
        outputCollector.collect(key, iterator.next())
    }
  }
}
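
// A minimal sketch of a spec that could be passed via --target-pail-spec. The record
// type (String) and first-letter layout are hypothetical stand-ins; a real spec
// serializes the actual record type and chooses a meaningful directory partitioning.
class FirstLetterPailStructure extends PailStructure[String] {
  override def getType: Class[_] = classOf[String]

  override def serialize(record: String): Array[Byte] = record.getBytes("UTF-8")

  override def deserialize(bytes: Array[Byte]): String = new String(bytes, "UTF-8")

  // Records are bucketed into a subdirectory named after their first character, e.g. "apple" -> "a/"
  override def getTarget(record: String): util.List[String] = util.Arrays.asList(record.take(1))

  // Accept any non-empty target path
  override def isValidTarget(dirs: String*): Boolean = dirs.nonEmpty
}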
object PailMigrateUtil {
  def main(args: Array[String]): Unit = {
    ToolRunner.run(new Configuration(), new PailMigrate, args)
  }
}
object IxPailArchiver {
  val logger = Logger.getLogger(this.getClass)

  def main(params: Array[String]): Unit = {
    // Resolve the weekly bucket from 14 days back (presumably so only fully closed weeks are archived)
    val lastWeekBucket = DateHelper.weekInterval(new DateTime(System.currentTimeMillis()).minusDays(14))
    val args = Args(params)
    val baseInputDir = args("base-input-dir")
    val inputDirPath: Path = new Path(baseInputDir, lastWeekBucket)

    val configuration = new Configuration()
    val fs = inputDirPath.getFileSystem(configuration)
    if (fs.exists(inputDirPath)) {
      val newParams = params ++ Array("--input-dir", inputDirPath.toString)
      ToolRunner.run(configuration, new PailMigrate, newParams)
    } else {
      logger.info("The following location doesn't exist: " + inputDirPath)
    }
  }
}
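
// Example (hypothetical paths): archive the weekly bucket from two weeks back under a
// base directory. IxPailArchiver computes --input-dir itself and forwards all other
// arguments to PailMigrate, so the remaining flags are the same as above:
//
//   hadoop jar pail-migrate.jar com.indix.pail.IxPailArchiver \
//     --base-input-dir /data/pail/weekly \
//     --output-dir /data/pail/archive \
//     --target-pail-spec com.example.MyPailStructure \
//     --record-type com.example.MyRecord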