/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.engine.mr.common;
/**
* @author George Song (ysong1)
*
*/
import static org.apache.hadoop.util.StringUtils.formatTime;
import static org.apache.kylin.engine.mr.common.JobRelatedMetaUtil.collectCubeMetadata;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.ClassUtil;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.KylinConfig.SetAndUnsetThreadLocalConfig;
import org.apache.kylin.common.KylinConfigExt;
import org.apache.kylin.common.StorageURL;
import org.apache.kylin.common.util.CliCommandExecutor;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.OptionsHelper;
import org.apache.kylin.common.util.StringSplitter;
import org.apache.kylin.common.util.StringUtil;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.job.JobInstance;
import org.apache.kylin.job.exception.JobException;
import org.apache.kylin.metadata.model.TableDesc;
import org.apache.kylin.metadata.project.ProjectManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Maps;
@SuppressWarnings("static-access")
public abstract class AbstractHadoopJob extends Configured implements Tool {
private static final Logger logger = LoggerFactory.getLogger(AbstractHadoopJob.class);
protected static final Option OPTION_PROJECT = OptionBuilder.withArgName(BatchConstants.ARG_PROJECT).hasArg()
.isRequired(true).withDescription("Project name.").create(BatchConstants.ARG_PROJECT);
protected static final Option OPTION_JOB_NAME = OptionBuilder.withArgName(BatchConstants.ARG_JOB_NAME).hasArg()
.isRequired(true).withDescription("Job name. For example, Kylin_Cuboid_Builder-clsfd_v2_Step_22-D)")
.create(BatchConstants.ARG_JOB_NAME);
protected static final Option OPTION_CUBE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_CUBE_NAME).hasArg()
.isRequired(true).withDescription("Cube name. For exmaple, flat_item_cube")
.create(BatchConstants.ARG_CUBE_NAME);
protected static final Option OPTION_CUBING_JOB_ID = OptionBuilder.withArgName(BatchConstants.ARG_CUBING_JOB_ID)
.hasArg().isRequired(false).withDescription("ID of cubing job executable")
.create(BatchConstants.ARG_CUBING_JOB_ID);
// @Deprecated
protected static final Option OPTION_SEGMENT_NAME = OptionBuilder.withArgName(BatchConstants.ARG_SEGMENT_NAME)
.hasArg().isRequired(true).withDescription("Cube segment name").create(BatchConstants.ARG_SEGMENT_NAME);
protected static final Option OPTION_SEGMENT_ID = OptionBuilder.withArgName(BatchConstants.ARG_SEGMENT_ID).hasArg()
.isRequired(true).withDescription("Cube segment id").create(BatchConstants.ARG_SEGMENT_ID);
protected static final Option OPTION_INPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_INPUT).hasArg()
.isRequired(true).withDescription("Input path").create(BatchConstants.ARG_INPUT);
protected static final Option OPTION_INPUT_FORMAT = OptionBuilder.withArgName(BatchConstants.ARG_INPUT_FORMAT)
.hasArg().isRequired(false).withDescription("Input format").create(BatchConstants.ARG_INPUT_FORMAT);
protected static final Option OPTION_OUTPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_OUTPUT).hasArg()
.isRequired(true).withDescription("Output path").create(BatchConstants.ARG_OUTPUT);
protected static final Option OPTION_DICT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_DICT_PATH).hasArg()
.isRequired(false).withDescription("Dict path").create(BatchConstants.ARG_DICT_PATH);
protected static final Option OPTION_NCUBOID_LEVEL = OptionBuilder.withArgName(BatchConstants.ARG_LEVEL).hasArg()
.isRequired(true).withDescription("N-Cuboid build level, e.g. 1, 2, 3...").create(BatchConstants.ARG_LEVEL);
protected static final Option OPTION_PARTITION_FILE_PATH = OptionBuilder.withArgName(BatchConstants.ARG_PARTITION)
.hasArg().isRequired(true).withDescription("Partition file path.").create(BatchConstants.ARG_PARTITION);
protected static final Option OPTION_HTABLE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_HTABLE_NAME)
.hasArg().isRequired(true).withDescription("HTable name").create(BatchConstants.ARG_HTABLE_NAME);
protected static final Option OPTION_DICTIONARY_SHRUNKEN_PATH = OptionBuilder
.withArgName(BatchConstants.ARG_SHRUNKEN_DICT_PATH).hasArg().isRequired(false)
.withDescription("Dictionary shrunken path").create(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
protected static final Option OPTION_STATISTICS_OUTPUT = OptionBuilder.withArgName(BatchConstants.ARG_STATS_OUTPUT)
.hasArg().isRequired(false).withDescription("Statistics output").create(BatchConstants.ARG_STATS_OUTPUT);
protected static final Option OPTION_STATISTICS_SAMPLING_PERCENT = OptionBuilder
.withArgName(BatchConstants.ARG_STATS_SAMPLING_PERCENT).hasArg().isRequired(false)
.withDescription("Statistics sampling percentage").create(BatchConstants.ARG_STATS_SAMPLING_PERCENT);
protected static final Option OPTION_CUBOID_MODE = OptionBuilder.withArgName(BatchConstants.ARG_CUBOID_MODE)
.hasArg().isRequired(false).withDescription("Cuboid Mode").create(BatchConstants.ARG_CUBOID_MODE);
protected static final Option OPTION_NEED_UPDATE_BASE_CUBOID_SHARD = OptionBuilder
.withArgName(BatchConstants.ARG_UPDATE_SHARD).hasArg().isRequired(false)
.withDescription("If need to update base cuboid shard").create(BatchConstants.ARG_UPDATE_SHARD);
protected static final Option OPTION_TABLE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_TABLE_NAME).hasArg().isRequired(true).withDescription("Table name. For example, default.table1").create(BatchConstants.ARG_TABLE_NAME);
protected static final Option OPTION_LOOKUP_SNAPSHOT_ID = OptionBuilder.withArgName(BatchConstants.ARG_LOOKUP_SNAPSHOT_ID).hasArg()
.isRequired(true).withDescription("Lookup table snapshotID")
.create(BatchConstants.ARG_LOOKUP_SNAPSHOT_ID);
protected static final Option OPTION_META_URL = OptionBuilder.withArgName(BatchConstants.ARG_META_URL)
.hasArg().isRequired(true).withDescription("HDFS metadata url").create(BatchConstants.ARG_META_URL);
public static final Option OPTION_HBASE_CONF_PATH = OptionBuilder.withArgName(BatchConstants.ARG_HBASE_CONF_PATH).hasArg()
.isRequired(true).withDescription("HBase config file path").create(BatchConstants.ARG_HBASE_CONF_PATH);
private static final String MAP_REDUCE_CLASSPATH = "mapreduce.application.classpath";
private static final Map<String, KylinConfig> kylinConfigCache = Maps.newConcurrentMap();
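/**
 * Convenience entry point for main(): runs the given job via Hadoop's {@link ToolRunner}
 * and terminates the JVM with the job's exit code, or with 5 if an exception is thrown.
 */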
protected static void runJob(Tool job, String[] args) {
try {
int exitCode = ToolRunner.run(job, args);
System.exit(exitCode);
} catch (Exception e) {
e.printStackTrace(System.err);
System.exit(5);
}
}
// ============================================================================
protected String name;
protected boolean isAsync = false;
protected OptionsHelper optionsHelper = new OptionsHelper();
protected Job job;
public AbstractHadoopJob() {
super(HadoopUtil.getCurrentConfiguration());
}
protected void parseOptions(Options options, String[] args) throws ParseException {
optionsHelper.parseOptions(options, args);
}
public void printUsage(Options options) {
optionsHelper.printUsage(getClass().getSimpleName(), options);
}
public Option[] getOptions() {
return optionsHelper.getOptions();
}
public String getOptionsAsString() {
return optionsHelper.getOptionsAsString();
}
protected String getOptionValue(Option option) {
return optionsHelper.getOptionValue(option);
}
protected boolean hasOption(Option option) {
return optionsHelper.hasOption(option);
}
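/**
 * Submits the given job. In async mode the job is only submitted and 0 is returned;
 * otherwise this blocks until completion, logs the elapsed time, and returns 0 on
 * success or 1 on failure.
 */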
protected int waitForCompletion(Job job) throws IOException, InterruptedException, ClassNotFoundException {
int retVal = 0;
long start = System.nanoTime();
if (isAsync) {
job.submit();
} else {
job.waitForCompletion(true);
retVal = job.isSuccessful() ? 0 : 1;
logger.debug("Job '" + job.getJobName() + "' finished "
+ (job.isSuccessful() ? "successfully in " : "with failures. Time taken ")
+ formatTime((System.nanoTime() - start) / 1000000L));
}
return retVal;
}
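/**
 * Sets the job jar (from {@code kylinConf.getKylinJobJarPath()} if that file exists, else by
 * class) and, when the local classpath is enabled, the MapReduce application classpath.
 * Hive and Kafka dependencies are taken from the "kylin.hive.dependency" /
 * "kylin.kafka.dependency" system properties, or looked up on the local classpath as a
 * fallback, and shipped together with the MR lib dir via tmpjars/tmpfiles.
 */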
protected void setJobClasspath(Job job, KylinConfig kylinConf) {
String jarPath = kylinConf.getKylinJobJarPath();
File jarFile = new File(jarPath);
if (jarFile.exists()) {
job.setJar(jarPath);
logger.trace("append job jar: " + jarPath);
} else {
job.setJarByClass(this.getClass());
}
String kylinHiveDependency = System.getProperty("kylin.hive.dependency");
String kylinKafkaDependency = System.getProperty("kylin.kafka.dependency");
Configuration jobConf = job.getConfiguration();
if (kylinConf.isUseLocalClasspathEnabled()) {
String classpath = jobConf.get(MAP_REDUCE_CLASSPATH);
if (classpath == null || classpath.length() == 0) {
logger.info("Didn't find " + MAP_REDUCE_CLASSPATH
+ " in job configuration, will run 'mapred classpath' to get the default value.");
classpath = getDefaultMapRedClasspath();
logger.info("The default mapred classpath is: " + classpath);
}
jobConf.set(MAP_REDUCE_CLASSPATH, classpath);
}
logger.trace("Hadoop job classpath is: " + job.getConfiguration().get(MAP_REDUCE_CLASSPATH));
/*
* set extra dependencies as tmpjars & tmpfiles if configured
*/
StringBuilder kylinDependency = new StringBuilder();
// for hive dependencies
if (kylinHiveDependency != null) {
// yarn classpath is comma separated
kylinHiveDependency = kylinHiveDependency.replace(":", ",");
logger.trace("Hive Dependencies Before Filtered: " + kylinHiveDependency);
String filteredHive = filterKylinHiveDependency(kylinHiveDependency, kylinConf);
logger.trace("Hive Dependencies After Filtered: " + filteredHive);
StringUtil.appendWithSeparator(kylinDependency, filteredHive);
} else {
logger.debug("No hive dependency jars set in the environment, will find them from classpath:");
try {
String hiveExecJarPath = ClassUtil.findContainingJar(Class.forName("org.apache.hadoop.hive.ql.Driver"));
StringUtil.appendWithSeparator(kylinDependency, hiveExecJarPath);
logger.debug("hive-exec jar file: " + hiveExecJarPath);
String hiveHCatJarPath = ClassUtil
.findContainingJar(Class.forName("org.apache.hive.hcatalog.mapreduce.HCatInputFormat"));
StringUtil.appendWithSeparator(kylinDependency, hiveHCatJarPath);
logger.debug("hive-catalog jar file: " + hiveHCatJarPath);
String hiveMetaStoreJarPath = ClassUtil
.findContainingJar(Class.forName("org.apache.hadoop.hive.metastore.api.Table"));
StringUtil.appendWithSeparator(kylinDependency, hiveMetaStoreJarPath);
logger.debug("hive-metastore jar file: " + hiveMetaStoreJarPath);
} catch (ClassNotFoundException e) {
logger.error("Cannot found hive dependency jars: " + e);
}
}
// for kafka dependencies
if (kylinKafkaDependency != null) {
kylinKafkaDependency = kylinKafkaDependency.replace(":", ",");
logger.trace("Kafka Dependencies: " + kylinKafkaDependency);
StringUtil.appendWithSeparator(kylinDependency, kylinKafkaDependency);
} else {
logger.debug("No Kafka dependency jar set in the environment, will find them from classpath:");
try {
String kafkaClientJarPath = ClassUtil
.findContainingJar(Class.forName("org.apache.kafka.clients.consumer.KafkaConsumer"));
StringUtil.appendWithSeparator(kylinDependency, kafkaClientJarPath);
logger.debug("kafka jar file: " + kafkaClientJarPath);
} catch (ClassNotFoundException e) {
logger.debug("Not found kafka client jar from classpath, it is optional for normal build: " + e);
}
}
// for KylinJobMRLibDir
String mrLibDir = kylinConf.getKylinJobMRLibDir();
logger.trace("MR additional lib dir: " + mrLibDir);
StringUtil.appendWithSeparator(kylinDependency, mrLibDir);
setJobTmpJarsAndFiles(job, kylinDependency.toString());
}
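/**
 * Filters the comma-separated Hive dependency list against the regex returned by
 * {@code config.getHiveDependencyFilterList()}, keeping only matching entries.
 */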
private String filterKylinHiveDependency(String kylinHiveDependency, KylinConfig config) {
if (StringUtils.isBlank(kylinHiveDependency))
return "";
StringBuilder jarList = new StringBuilder();
Pattern hivePattern = Pattern.compile(config.getHiveDependencyFilterList());
Matcher matcher = hivePattern.matcher(kylinHiveDependency);
while (matcher.find()) {
if (jarList.length() > 0)
jarList.append(",");
jarList.append(matcher.group());
}
return jarList.toString();
}
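/**
 * Distributes the given comma-separated dependencies with the job: each absolute path is
 * resolved against the working (HDFS) file system first and the local file system second,
 * directories are expanded recursively, and the results are appended to the job's
 * "tmpjars" (for *.jar) and "tmpfiles" (for everything else).
 */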
private void setJobTmpJarsAndFiles(Job job, String kylinDependency) {
if (StringUtils.isBlank(kylinDependency))
return;
logger.trace("setJobTmpJarsAndFiles: " + kylinDependency);
try {
Configuration jobConf = job.getConfiguration();
FileSystem localfs = FileSystem.getLocal(jobConf);
FileSystem hdfs = HadoopUtil.getWorkingFileSystem(jobConf);
StringBuilder jarList = new StringBuilder();
StringBuilder fileList = new StringBuilder();
for (String fileName : StringUtil.splitAndTrim(kylinDependency, ",")) {
Path p = new Path(fileName);
if (!p.isAbsolute()) {
logger.warn("The directory of kylin dependency '" + fileName + "' is not absolute, skip");
continue;
}
FileSystem fs;
if (exists(hdfs, p)) {
fs = hdfs;
} else if (exists(localfs, p)) {
fs = localfs;
} else {
logger.warn("The directory of kylin dependency '" + fileName + "' does not exist, skip");
continue;
}
if (fs.getFileStatus(p).isDirectory()) {
logger.trace("Expanding depedency directory: " + p);
appendTmpDir(job, fs, p, jarList, fileList);
continue;
}
StringBuilder list = (p.getName().endsWith(".jar")) ? jarList : fileList;
if (list.length() > 0)
list.append(",");
list.append(fs.getFileStatus(p).getPath());
}
appendTmpFiles(fileList.toString(), jobConf);
appendTmpJars(jarList.toString(), jobConf);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private void appendTmpDir(Job job, FileSystem fs, Path tmpDir, StringBuilder jarList, StringBuilder fileList) {
try {
FileStatus[] fList = fs.listStatus(tmpDir);
for (FileStatus file : fList) {
Path p = file.getPath();
if (fs.getFileStatus(p).isDirectory()) {
appendTmpDir(job, fs, p, jarList, fileList);
continue;
}
StringBuilder list = (p.getName().endsWith(".jar")) ? jarList : fileList;
if (list.length() > 0)
list.append(",");
list.append(fs.getFileStatus(p).getPath().toString());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private void appendTmpJars(String jarList, Configuration conf) {
if (StringUtils.isBlank(jarList))
return;
String tmpJars = conf.get("tmpjars", null);
if (tmpJars == null) {
tmpJars = jarList;
} else {
tmpJars += "," + jarList;
}
conf.set("tmpjars", tmpJars);
logger.trace("Job 'tmpjars' updated -- " + tmpJars);
}
private void appendTmpFiles(String fileList, Configuration conf) {
if (StringUtils.isBlank(fileList))
return;
String tmpFiles = conf.get("tmpfiles", null);
if (tmpFiles == null) {
tmpFiles = fileList;
} else {
tmpFiles += "," + fileList;
}
conf.set("tmpfiles", tmpFiles);
logger.trace("Job 'tmpfiles' updated -- " + tmpFiles);
}
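/**
 * Runs "mapred classpath" on the command line and returns the result as a comma-separated
 * list, or an empty string if the command fails.
 */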
private String getDefaultMapRedClasspath() {
String classpath = "";
try {
CliCommandExecutor executor = KylinConfig.getInstanceFromEnv().getCliCommandExecutor();
String output = executor.execute("mapred classpath").getSecond();
classpath = output.trim().replace(':', ',');
} catch (IOException e) {
logger.error("Failed to run: 'mapred classpath'.", e);
}
return classpath;
}
private static boolean exists(FileSystem fs, Path p) throws IOException {
try {
return fs.exists(p);
} catch (IllegalArgumentException ex) {
// can happen when FS mismatch
return false;
}
}
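/**
 * Adds the comma-separated input paths to the job. A path ending with "/*" is expanded to
 * its sub-directories not starting with "_" (or to the directory itself if it only
 * contains files). Returns the number of folders added.
 */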
public static int addInputDirs(String input, Job job) throws IOException {
int folderNum = addInputDirs(StringSplitter.split(input, ","), job);
logger.info("Number of added folders:" + folderNum);
return folderNum;
}
public static int addInputDirs(String[] inputs, Job job) throws IOException {
int ret = 0;//return number of added folders
for (String inp : inputs) {
inp = inp.trim();
if (inp.endsWith("/*")) {
inp = inp.substring(0, inp.length() - 2);
FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
Path path = new Path(inp);
if (!exists(fs, path)) {
logger.warn("Path not exist:" + path.toString());
continue;
}
FileStatus[] fileStatuses = fs.listStatus(path);
boolean hasDir = false;
for (FileStatus stat : fileStatuses) {
if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
hasDir = true;
ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
}
}
if (fileStatuses.length > 0 && !hasDir) {
ret += addInputDirs(new String[] { path.toString() }, job);
}
} else {
logger.trace("Add input " + inp);
FileInputFormat.addInputPath(job, new Path(inp));
ret++;
}
}
return ret;
}
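/**
 * Loads KylinConfig from the local "meta" directory that was shipped to the task through
 * the distributed cache, rewriting the metadata URL to an "ifile" URL that points at it.
 */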
public static KylinConfig loadKylinPropsAndMetadata() throws IOException {
File metaDir = new File("meta");
if (!metaDir.getAbsolutePath().equals(System.getProperty(KylinConfig.KYLIN_CONF))) {
System.setProperty(KylinConfig.KYLIN_CONF, metaDir.getAbsolutePath());
logger.info("The absolute path for meta dir is " + metaDir.getAbsolutePath());
KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
Map<String, String> paramsMap = new HashMap<>();
paramsMap.put("path", metaDir.getAbsolutePath());
StorageURL storageURL = new StorageURL(kylinConfig.getMetadataUrl().getIdentifier(), "ifile", paramsMap);
kylinConfig.setMetadataUrl(storageURL.toString());
return kylinConfig;
} else {
return KylinConfig.getInstanceFromEnv();
}
}
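/**
 * Loads KylinConfig from an HDFS metadata URL using the given Hadoop configuration and
 * installs it as the thread-local config (intentionally never closed here, see the note
 * below).
 */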
public static KylinConfig loadKylinConfigFromHdfs(SerializableConfiguration conf, String uri) {
HadoopUtil.setCurrentConfiguration(conf.get());
KylinConfig config = loadKylinConfigFromHdfs(uri);
// This is a bad example where the thread local KylinConfig cannot be auto-closed due to
// limitation of MR API. It works because MR task runs its own process. Do not copy.
@SuppressWarnings("unused")
SetAndUnsetThreadLocalConfig shouldAutoClose = KylinConfig.setAndUnsetThreadLocalConfig(config);
return config;
}
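/**
 * Reads kylin.properties from the "path" parameter of the given "@hdfs" metadata URL and
 * builds a KylinConfig from it, caching the result per URL.
 */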
public static KylinConfig loadKylinConfigFromHdfs(String uri) {
if (uri == null)
throw new IllegalArgumentException("meta url should not be null");
if (!uri.contains("@hdfs"))
throw new IllegalArgumentException("meta url should like @hdfs schema");
if (kylinConfigCache.get(uri) != null) {
logger.info("KylinConfig cached for : {}", uri);
return kylinConfigCache.get(uri);
}
logger.info("Ready to load KylinConfig from uri: {}", uri);
KylinConfig config;
FileSystem fs;
String realHdfsPath = StorageURL.valueOf(uri).getParameter("path") + "/" + KylinConfig.KYLIN_CONF_PROPERTIES_FILE;
try {
fs = HadoopUtil.getFileSystem(realHdfsPath);
InputStream is = fs.open(new Path(realHdfsPath));
Properties prop = KylinConfig.streamToProps(is);
config = KylinConfig.createKylinConfig(prop);
} catch (IOException e) {
throw new RuntimeException(e);
}
kylinConfigCache.put(uri, config);
return config;
}
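/*
 * The attach*Metadata methods below collect the resource paths a job needs (cube or table
 * metadata, plus dictionaries and/or statistics where indicated) and dump them into the
 * job configuration via dumpKylinPropsAndMetadata(), or upload them to the given metadata
 * URL in the metaUrl variant.
 */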
protected void attachTableMetadata(TableDesc table, Configuration conf) throws IOException {
Set<String> dumpList = new LinkedHashSet<>();
dumpList.add(table.getResourcePath());
dumpKylinPropsAndMetadata(table.getProject(), dumpList, KylinConfig.getInstanceFromEnv(), conf);
}
protected void attachCubeMetadata(CubeInstance cube, Configuration conf) throws IOException {
dumpKylinPropsAndMetadata(cube.getProject(), collectCubeMetadata(cube), cube.getConfig(),
conf);
}
protected void attachCubeMetadataWithDict(CubeInstance cube, Configuration conf) throws IOException {
Set<String> dumpList = new LinkedHashSet<>(collectCubeMetadata(cube));
for (CubeSegment segment : cube.getSegments()) {
dumpList.addAll(segment.getDictionaryPaths());
}
dumpKylinPropsAndMetadata(cube.getProject(), dumpList, cube.getConfig(), conf);
}
protected void attachSegmentsMetadataWithDict(List<CubeSegment> segments, Configuration conf) throws IOException {
CubeInstance cube = segments.get(0).getCubeInstance();
Set<String> dumpList = new LinkedHashSet<>(collectCubeMetadata(cube));
for (CubeSegment segment : segments) {
dumpList.addAll(segment.getDictionaryPaths());
}
dumpKylinPropsAndMetadata(cube.getProject(), dumpList, cube.getConfig(), conf);
}
protected void attachSegmentsMetadataWithDict(List<CubeSegment> segments, String metaUrl) throws IOException {
Set<String> dumpList = new LinkedHashSet<>(JobRelatedMetaUtil.collectCubeMetadata(segments.get(0).getCubeInstance()));
for (CubeSegment segment : segments) {
dumpList.addAll(segment.getDictionaryPaths());
dumpList.add(segment.getStatisticsResourcePath());
}
JobRelatedMetaUtil.dumpAndUploadKylinPropsAndMetadata(dumpList, (KylinConfigExt) segments.get(0).getConfig(), metaUrl);
}
protected void attachSegmentMetadataWithDict(CubeSegment segment, Configuration conf) throws IOException {
attachSegmentMetadata(segment, conf, true, false);
}
protected void attachSegmentMetadataWithAll(CubeSegment segment, Configuration conf) throws IOException {
attachSegmentMetadata(segment, conf, true, true);
}
protected void attachSegmentMetadata(CubeSegment segment, Configuration conf, boolean ifDictIncluded,
boolean ifStatsIncluded) throws IOException {
Set<String> dumpList = new LinkedHashSet<>(collectCubeMetadata(segment.getCubeInstance()));
if (ifDictIncluded) {
dumpList.addAll(segment.getDictionaryPaths());
}
if (ifStatsIncluded) {
dumpList.add(segment.getStatisticsResourcePath());
}
dumpKylinPropsAndMetadata(segment.getProject(), dumpList, segment.getConfig(), conf);
}
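/**
 * Exports kylin.properties plus the listed resources (and the project resource, if a
 * project is given) into a temporary local "meta" directory and appends that directory to
 * the job's "tmpfiles" so tasks receive it through the distributed cache.
 */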
protected void dumpKylinPropsAndMetadata(String prj, Set<String> dumpList, KylinConfig kylinConfig,
Configuration conf) throws IOException {
File tmp = File.createTempFile("kylin_job_meta", "");
FileUtils.forceDelete(tmp); // we need a directory, so delete the file first
File metaDir = new File(tmp, "meta");
metaDir.mkdirs();
// write kylin.properties
File kylinPropsFile = new File(metaDir, "kylin.properties");
kylinConfig.exportToFile(kylinPropsFile);
if (prj != null) {
dumpList.add(ProjectManager.getInstance(kylinConfig).getProject(prj).getResourcePath());
}
// write resources
JobRelatedMetaUtil.dumpResources(kylinConfig, metaDir, dumpList);
// hadoop distributed cache
String hdfsMetaDir = OptionsHelper.convertToFileURL(metaDir.getAbsolutePath());
if (hdfsMetaDir.startsWith("/")) // note Path on windows is like "d:/../..."
hdfsMetaDir = "file://" + hdfsMetaDir;
else
hdfsMetaDir = "file:///" + hdfsMetaDir;
logger.info("HDFS meta dir is: " + hdfsMetaDir);
appendTmpFiles(hdfsMetaDir, conf);
}
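/**
 * Deletes the local temp directories behind the job's "tmpfiles" entries: for every entry
 * starting with "file://", the parent directory of the referenced file is removed.
 */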
protected void cleanupTempConfFile(Configuration conf) {
String[] tempfiles = StringUtils.split(conf.get("tmpfiles"), ",");
if (tempfiles == null) {
return;
}
for (String tempMetaFileString : tempfiles) {
logger.trace("tempMetaFileString is : " + tempMetaFileString);
if (tempMetaFileString != null) {
if (tempMetaFileString.startsWith("file://")) {
tempMetaFileString = tempMetaFileString.substring("file://".length());
File tempMetaFile = new File(tempMetaFileString);
if (tempMetaFile.exists()) {
try {
FileUtils.forceDelete(tempMetaFile.getParentFile());
} catch (IOException e) {
logger.warn("error when deleting " + tempMetaFile, e);
}
} else {
logger.info("" + tempMetaFileString + " does not exist");
}
} else {
logger.info("tempMetaFileString is not starting with file:// :" + tempMetaFileString);
}
}
}
}
protected void deletePath(Configuration conf, Path path) throws IOException {
HadoopUtil.deletePath(conf, path);
}
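/**
 * Instantiates the job's InputFormat, sums the lengths of all input splits and returns the
 * total in megabytes. A total of 0 is possible and only logged as a warning, e.g. when a
 * segment range hits no partition of a partitioned Hive table (KYLIN-2470).
 */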
public static double getTotalMapInputMB(Job job)
throws ClassNotFoundException, IOException, InterruptedException, JobException {
if (job == null) {
throw new JobException("Job is null");
}
long mapInputBytes = 0;
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
for (InputSplit split : input.getSplits(job)) {
mapInputBytes += split.getLength();
}
// 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
if (mapInputBytes == 0) {
logger.warn("Map input splits are 0 bytes, something is wrong?");
}
double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
return totalMapInputMB;
}
protected double getTotalMapInputMB()
throws ClassNotFoundException, IOException, InterruptedException, JobException {
return getTotalMapInputMB(job);
}
protected int getMapInputSplitCount()
throws ClassNotFoundException, JobException, IOException, InterruptedException {
if (job == null) {
throw new JobException("Job is null");
}
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
return input.getSplits(job).size();
}
public void kill() throws JobException {
if (job != null) {
try {
job.killJob();
} catch (IOException e) {
throw new JobException(e);
}
}
}
public Map<String, String> getInfo() throws JobException {
if (job != null) {
Map<String, String> status = new HashMap<>();
if (null != job.getJobID()) {
status.put(JobInstance.MR_JOB_ID, job.getJobID().toString());
}
if (null != job.getTrackingURL()) {
status.put(JobInstance.YARN_APP_URL, job.getTrackingURL().toString());
}
return status;
} else {
throw new JobException("Job is null");
}
}
public Counters getCounters() throws JobException {
if (job != null) {
try {
return job.getCounters();
} catch (IOException e) {
throw new JobException(e);
}
} else {
throw new JobException("Job is null");
}
}
public void setAsync(boolean isAsync) {
this.isAsync = isAsync;
}
public Job getJob() {
return this.job;
}
// tells MapReduceExecutable to skip this job
public boolean isSkipped() {
return false;
}
@Override
public void setConf(Configuration conf) {
Configuration healSickConf = HadoopUtil.healSickConfig(conf);
super.setConf(healSickConf);
}
}
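/*
 * Illustrative sketch only (not part of the original source): a concrete job typically
 * extends AbstractHadoopJob, parses its options, configures the MR job and returns
 * waitForCompletion(). The class name and option wiring below are hypothetical.
 *
 *   public class ExampleCubeJob extends AbstractHadoopJob {
 *       @Override
 *       public int run(String[] args) throws Exception {
 *           Options options = new Options();
 *           options.addOption(OPTION_JOB_NAME);
 *           options.addOption(OPTION_INPUT_PATH);
 *           options.addOption(OPTION_OUTPUT_PATH);
 *           parseOptions(options, args);
 *
 *           job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
 *           setJobClasspath(job, KylinConfig.getInstanceFromEnv());
 *           addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);
 *           org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
 *                   new Path(getOptionValue(OPTION_OUTPUT_PATH)));
 *           // mapper/reducer and key/value classes would be configured here
 *           return waitForCompletion(job);
 *       }
 *
 *       public static void main(String[] args) {
 *           runJob(new ExampleCubeJob(), args);
 *       }
 *   }
 */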