/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.HarFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.google.common.base.Charsets;
/**
* An archive creation utility.
* This class provides methods that can be used
* to create Hadoop archives. For an understanding of
* Hadoop archives, see {@link HarFileSystem}.
*/
public class HadoopArchives implements Tool {
public static final int VERSION = 3;
private static final Logger LOG = LoggerFactory.getLogger(HadoopArchives.class);
private static final String NAME = "har";
private static final String ARCHIVE_NAME = "archiveName";
private static final String REPLICATION = "r";
private static final String PARENT_PATH = "p";
private static final String HELP = "help";
static final String SRC_LIST_LABEL = NAME + ".src.list";
static final String DST_DIR_LABEL = NAME + ".dest.path";
static final String TMP_DIR_LABEL = NAME + ".tmp.dir";
static final String JOB_DIR_LABEL = NAME + ".job.dir";
static final String SRC_COUNT_LABEL = NAME + ".src.count";
static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
static final String DST_HAR_LABEL = NAME + ".archive.name";
static final String SRC_PARENT_LABEL = NAME + ".parent.path";
/** the size of the blocks that will be created when archiving **/
static final String HAR_BLOCKSIZE_LABEL = NAME + ".block.size";
/** the replication factor for the files in the archive. **/
static final String HAR_REPLICATION_LABEL = NAME + ".replication.factor";
/** the size of the part files that will be created when archiving **/
static final String HAR_PARTSIZE_LABEL = NAME + ".partfile.size";
/** default size of each part file **/
long partSize = 2 * 1024 * 1024 * 1024l;
/** size of blocks in hadoop archives **/
long blockSize = 512 * 1024 * 1024l;
/** the desired replication degree; default is 3 **/
short repl = 3;
private static final String usage = "archive"
+ " <-archiveName <NAME>.har> <-p <parent path>> [-r <replication factor>]" +
" <src>* <dest>" +
"\n";
private JobConf conf;
public void setConf(Configuration conf) {
if (conf instanceof JobConf) {
this.conf = (JobConf) conf;
} else {
this.conf = new JobConf(conf, HadoopArchives.class);
}
// This is for test purposes: since MR2, and unlike Streaming, it is not
// possible to add a JAR to the classpath that the tool will use when
// running the MapReduce job, so the test jar is injected explicitly here.
String testJar = System.getProperty(TEST_HADOOP_ARCHIVES_JAR_PATH, null);
if (testJar != null) {
this.conf.setJar(testJar);
}
}
public Configuration getConf() {
return this.conf;
}
public HadoopArchives(Configuration conf) {
setConf(conf);
}
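// Illustrative programmatic use (a sketch, not the only entry point): since the
// class implements Tool, it can be driven through ToolRunner, e.g.
//   int rc = ToolRunner.run(new HadoopArchives(new JobConf()), args);
// where args follows the usage string declared above.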
// check the src paths
private static void checkPaths(Configuration conf, List<Path> paths) throws
IOException {
for (Path p : paths) {
FileSystem fs = p.getFileSystem(conf);
fs.getFileStatus(p);
}
}
/**
* This assumes that there are only two types of entries: files and directories.
* @param fs the input filesystem
* @param fdir the FileStatusDir of the path
* @param out the output list of FileStatusDirs from the recursive listing
* @throws IOException
*/
private void recursivels(FileSystem fs, FileStatusDir fdir, List<FileStatusDir> out)
throws IOException {
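// A plain file is added to "out" as a single entry; a directory is added first,
// its immediate children are recorded on the FileStatusDir, and each child is
// then listed recursively, so "out" ends up as a pre-order listing of the tree.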
if (fdir.getFileStatus().isFile()) {
out.add(fdir);
return;
}
else {
out.add(fdir);
FileStatus[] listStatus = fs.listStatus(fdir.getFileStatus().getPath());
fdir.setChildren(listStatus);
for (FileStatus stat: listStatus) {
FileStatusDir fstatDir = new FileStatusDir(stat, null);
recursivels(fs, fstatDir, out);
}
}
}
/** HarEntry is used in the {@link HArchivesMapper} as the input value. */
private static class HarEntry implements Writable {
String path;
String[] children;
HarEntry() {}
HarEntry(String path, String[] children) {
this.path = path;
this.children = children;
}
boolean isDir() {
return children != null;
}
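// Serialized form (see write() below): the entry path as a Text string, a
// boolean directory flag, and, for directories only, an int child count
// followed by that many child-name strings.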
@Override
public void readFields(DataInput in) throws IOException {
path = Text.readString(in);
if (in.readBoolean()) {
children = new String[in.readInt()];
for(int i = 0; i < children.length; i++) {
children[i] = Text.readString(in);
}
} else {
children = null;
}
}
@Override
public void write(DataOutput out) throws IOException {
Text.writeString(out, path);
final boolean dir = isDir();
out.writeBoolean(dir);
if (dir) {
out.writeInt(children.length);
for(String c : children) {
Text.writeString(out, c);
}
}
}
}
/**
* Input format of a Hadoop archive job, responsible for
* generating splits from the source file list.
*/
static class HArchiveInputFormat implements InputFormat<LongWritable, HarEntry> {
//generate input splits from the src file lists
public InputSplit[] getSplits(JobConf jconf, int numSplits)
throws IOException {
String srcfilelist = jconf.get(SRC_LIST_LABEL, "");
if ("".equals(srcfilelist)) {
throw new IOException("Unable to get the " +
"src file for archive generation.");
}
long totalSize = jconf.getLong(TOTAL_SIZE_LABEL, -1);
if (totalSize == -1) {
throw new IOException("Invalid size of files to archive");
}
//we should be safe since this is set by our own code
Path src = new Path(srcfilelist);
FileSystem fs = src.getFileSystem(jconf);
FileStatus fstatus = fs.getFileStatus(src);
ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
LongWritable key = new LongWritable();
final HarEntry value = new HarEntry();
// the remaining bytes of the source listing file not yet assigned to a split
long remaining = fstatus.getLen();
// the running total of file sizes accumulated for the current split
long currentCount = 0L;
// the end position of the current split
long lastPos = 0L;
// the start position of the current split
long startPos = 0L;
long targetSize = totalSize/numSplits;
// create splits of roughly targetSize bytes so that all the maps
// have equal-sized data to read and write.
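// Illustrative example: with totalSize=100 and numSplits=4, targetSize is 25; a
// split boundary is placed once adding the next entry (key = recorded size,
// value = HarEntry) would push the running size past 25, and the byte range of
// the listing file read so far becomes one FileSplit over that listing.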
try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, src, jconf)) {
while(reader.next(key, value)) {
if (currentCount + key.get() > targetSize && currentCount != 0){
long size = lastPos - startPos;
splits.add(new FileSplit(src, startPos, size, (String[]) null));
remaining = remaining - size;
startPos = lastPos;
currentCount = 0L;
}
currentCount += key.get();
lastPos = reader.getPosition();
}
// add whatever is left over as the final split.
if (remaining != 0) {
splits.add(new FileSplit(src, startPos, remaining, (String[])null));
}
}
return splits.toArray(new FileSplit[splits.size()]);
}
@Override
public RecordReader<LongWritable, HarEntry> getRecordReader(InputSplit split,
JobConf job, Reporter reporter) throws IOException {
return new SequenceFileRecordReader<LongWritable, HarEntry>(job,
(FileSplit)split);
}
}
private boolean checkValidName(String name) {
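// A valid archive name is a single path component ending in ".har", e.g.
// "foo.har"; a name such as "a/foo.har" is rejected because its depth is not 1.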
Path tmp = new Path(name);
if (tmp.depth() != 1) {
return false;
}
if (name.endsWith(".har"))
return true;
return false;
}
private Path largestDepth(List<Path> paths) {
Path deepest = paths.get(0);
for (Path p: paths) {
if (p.depth() > deepest.depth()) {
deepest = p;
}
}
return deepest;
}
/**
* truncate the prefix root from the full path
* @param fullPath the full path
* @param root the prefix root to be truncated
* @return the relative path, or null if fullPath is shallower than root
*/
private Path relPathToRoot(Path fullPath, Path root) {
// build the relative path from path components rather than by
// string manipulation, so that this does not break later
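// Illustrative example: fullPath=/user/hadoop/dir1/file with root=/user/hadoop
// yields /dir1/file; equal depths yield "/", and a fullPath shallower than root
// yields null.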
final Path justRoot = new Path(Path.SEPARATOR);
if (fullPath.depth() == root.depth()) {
return justRoot;
}
else if (fullPath.depth() > root.depth()) {
Path retPath = new Path(fullPath.getName());
Path parent = fullPath.getParent();
for (int i=0; i < (fullPath.depth() - root.depth() -1); i++) {
retPath = new Path(parent.getName(), retPath);
parent = parent.getParent();
}
return new Path(justRoot, retPath);
}
return null;
}
/**
* this method writes all the valid top level directories
* into the srcWriter for indexing. This method is a little
* tricky. example-
* for an input with parent path /home/user/ and sources
* as /home/user/source/dir1, /home/user/source/dir2 - this
* will output