// org.apache.hadoop.hive.ql.exec.Utilities Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.beans.DefaultPersistenceDelegate;
import java.beans.Encoder;
import java.beans.ExceptionListener;
import java.beans.Expression;
import java.beans.PersistenceDelegate;
import java.beans.Statement;
import java.beans.XMLDecoder;
import java.beans.XMLEncoder;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.SQLTransientException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.InflaterInputStream;
import org.antlr.runtime.CommonToken;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.WordUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.HiveInterruptCallback;
import org.apache.hadoop.hive.common.HiveInterruptUtils;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
import org.apache.hadoop.hive.ql.exec.mr.ExecReducer;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.exec.tez.TezTask;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
import org.apache.hadoop.hive.ql.io.rcfile.merge.MergeWork;
import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMergeMapper;
import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanMapper;
import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork;
import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateMapper;
import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateWork;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.InputEstimator;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.PlanUtils.ExpressionTypes;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.Adjacency;
import org.apache.hadoop.hive.ql.plan.api.Graph;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Shell;
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.esotericsoftware.kryo.serializers.FieldSerializer;
import com.esotericsoftware.shaded.org.objenesis.strategy.StdInstantiatorStrategy;
/**
* Utilities.
*
*/
@SuppressWarnings("nls")
public final class Utilities {
/** URI prefix of the local file system. */
public static String HADOOP_LOCAL_FS = "file:///";
/** File name of the serialized map plan inside the plan directory. */
public static String MAP_PLAN_NAME = "map.xml";
/** File name of the serialized reduce plan inside the plan directory. */
public static String REDUCE_PLAN_NAME = "reduce.xml";
/** Configuration key holding the mapper class name; read when choosing the map-side work type. */
public static final String MAPRED_MAPPER_CLASS = "mapred.mapper.class";
/** Configuration key holding the reducer class name; read when choosing the reduce-side work type. */
public static final String MAPRED_REDUCER_CLASS = "mapred.reducer.class";
/**
* The objects in the reducer are composed of these top-level fields.
* ReduceField:
* KEY: record key
* VALUE: record value
*/
public static enum ReduceField {
KEY, VALUE
};
/** String form of every {@link ReduceField} constant, in declaration order. */
public static List reduceFieldNameList;
static {
  // Build the list locally, then publish it in one assignment.
  List fieldNames = new ArrayList();
  ReduceField[] fields = ReduceField.values();
  for (int idx = 0; idx < fields.length; idx++) {
    fieldNames.add(fields[idx].toString());
  }
  reduceFieldNameList = fieldNames;
}
/**
* Private constructor: the class is used only through its static members.
*/
private Utilities() {
// prevent instantiation
}
/**
 * In-process cache of plans keyed by the path of their plan file. The generic type
 * arguments are required so that lookups yield {@link BaseWork} without a cast.
 */
private static Map<Path, BaseWork> gWorkMap = Collections
    .synchronizedMap(new HashMap<Path, BaseWork>());
/** Fully qualified name of this class; used as the logger name and as the perf-logger caller id. */
private static final String CLASS_NAME = Utilities.class.getName();
private static final Log LOG = LogFactory.getLog(CLASS_NAME);
/**
 * Deletes the map and reduce plan files referenced by the configuration and evicts
 * both plans from the in-process cache. Does nothing when no plan path is configured.
 * File system failures are logged and otherwise ignored.
 *
 * @param conf configuration that carries the plan path
 */
public static void clearWork(Configuration conf) {
  final Path mapPlan = getPlanPath(conf, MAP_PLAN_NAME);
  final Path reducePlan = getPlanPath(conf, REDUCE_PLAN_NAME);
  // Plan path was never initialized, so there is nothing to clean.
  if (mapPlan == null || reducePlan == null) {
    return;
  }
  try {
    FileSystem planFs = mapPlan.getFileSystem(conf);
    for (Path planFile : new Path[] {mapPlan, reducePlan}) {
      if (planFs.exists(planFile)) {
        planFs.delete(planFile, true);
      }
    }
  } catch (Exception e) {
    LOG.warn("Failed to clean-up tmp directories.", e);
  } finally {
    // A single process may work with multiple plans, so the cache must be
    // cleared before the next plan is used. Both paths are non-null here.
    gWorkMap.remove(mapPlan);
    gWorkMap.remove(reducePlan);
  }
}
/**
 * Assembles a {@link MapredWork} from the map and reduce plans found for the configuration.
 *
 * @param conf configuration that carries the plan path
 * @return a new MapredWork holding the results of getMapWork and getReduceWork
 */
public static MapredWork getMapRedWork(Configuration conf) {
  MapredWork combined = new MapredWork();
  MapWork mapSide = getMapWork(conf);
  combined.setMapWork(mapSide);
  ReduceWork reduceSide = getReduceWork(conf);
  combined.setReduceWork(reduceSide);
  return combined;
}
/**
 * Registers the map plan for this configuration in the in-process plan cache.
 *
 * @param conf configuration that carries the plan path
 * @param work map plan to cache
 */
public static void setMapWork(Configuration conf, MapWork work) {
  final String planName = MAP_PLAN_NAME;
  setBaseWork(conf, planName, work);
}
/**
 * Fetches the map plan for this configuration.
 *
 * @param conf configuration that carries the plan path
 * @return the map plan, or null when getBaseWork finds none
 */
public static MapWork getMapWork(Configuration conf) {
  BaseWork plan = getBaseWork(conf, MAP_PLAN_NAME);
  return (MapWork) plan;
}
/**
 * Registers the reduce plan for this configuration in the in-process plan cache.
 *
 * @param conf configuration that carries the plan path
 * @param work reduce plan to cache
 */
public static void setReduceWork(Configuration conf, ReduceWork work) {
  final String planName = REDUCE_PLAN_NAME;
  setBaseWork(conf, planName, work);
}
/**
 * Fetches the reduce plan for this configuration.
 *
 * @param conf configuration that carries the plan path
 * @return the reduce plan, or null when getBaseWork finds none
 */
public static ReduceWork getReduceWork(Configuration conf) {
  BaseWork plan = getBaseWork(conf, REDUCE_PLAN_NAME);
  return (ReduceWork) plan;
}
/**
 * Stores a plan in the global work map under the plan path derived from the
 * configuration and the plan name.
 *
 * @param conf configuration that carries the plan path
 * @param name plan file name
 * @param work plan to cache
 */
public static void setBaseWork(Configuration conf, String name, BaseWork work) {
  gWorkMap.put(getPlanPath(conf, name), work);
}
/**
* Returns the map or reduce plan for the given plan name, loading it on first use.
* Side effect: the BaseWork returned is also placed in the gWorkMap, so later calls
* for the same plan path are answered from that cache.
* @param conf configuration that carries the plan path, the plan-transport flag and
* the mapper/reducer class names used to pick the concrete work class
* @param name plan file name; MAP_PLAN_NAME and REDUCE_PLAN_NAME are the names handled here
* @return the plan; null when the plan string is absent from conf (RPC mode) or the
* plan file does not exist
* @throws RuntimeException if the mapper/reducer class in conf is not recognized or
* if the plan can not be loaded for any other reason
*/
private static BaseWork getBaseWork(Configuration conf, String name) {
BaseWork gWork = null;
Path path = null;
InputStream in = null;
try {
path = getPlanPath(conf, name);
// Only checked when assertions are enabled (-ea).
assert path != null;
if (!gWorkMap.containsKey(path)) {
Path localPath;
if (ShimLoader.getHadoopShims().isLocalMode(conf)) {
localPath = path;
} else {
// Outside local mode the plan is opened by its bare file name, i.e. relative to
// the working directory. NOTE(review): presumably the distributed-cache symlink
// created when the plan was written - confirm.
localPath = new Path(name);
}
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
// The plan travels inside the configuration: Base64 text of a deflated stream,
// stored under the plan path as key.
LOG.debug("Loading plan from string: "+path.toUri().getPath());
String planString = conf.get(path.toUri().getPath());
if (planString == null) {
LOG.info("Could not find plan string in conf");
return null;
}
byte[] planBytes = Base64.decodeBase64(planString);
in = new ByteArrayInputStream(planBytes);
in = new InflaterInputStream(in);
} else {
// A missing file surfaces as FileNotFoundException, handled below by returning null.
in = new FileInputStream(localPath.toUri().getPath());
}
// The mapper/reducer class recorded in conf decides which work class to deserialize.
if(MAP_PLAN_NAME.equals(name)){
if (ExecMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))){
gWork = deserializePlan(in, MapWork.class, conf);
} else if(RCFileMergeMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
gWork = deserializePlan(in, MergeWork.class, conf);
} else if(ColumnTruncateMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
gWork = deserializePlan(in, ColumnTruncateWork.class, conf);
} else if(PartialScanMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
gWork = deserializePlan(in, PartialScanWork.class,conf);
} else {
throw new RuntimeException("unable to determine work from configuration ."
+ MAPRED_MAPPER_CLASS + " was "+ conf.get(MAPRED_MAPPER_CLASS)) ;
}
} else if (REDUCE_PLAN_NAME.equals(name)) {
if(ExecReducer.class.getName().equals(conf.get(MAPRED_REDUCER_CLASS))) {
gWork = deserializePlan(in, ReduceWork.class, conf);
} else {
throw new RuntimeException("unable to determine work from configuration ."
+ MAPRED_REDUCER_CLASS +" was "+ conf.get(MAPRED_REDUCER_CLASS)) ;
}
}
// For any other name gWork is still null here and that null is what gets cached.
gWorkMap.put(path, gWork);
} else {
LOG.debug("Found plan in cache.");
gWork = gWorkMap.get(path);
}
return gWork;
} catch (FileNotFoundException fnf) {
// happens. e.g.: no reduce work.
LOG.info("No plan file found: "+path);
return null;
} catch (Exception e) {
LOG.error("Failed to load plan: "+path, e);
throw new RuntimeException(e);
} finally {
// Best-effort close; a failure while closing is deliberately ignored.
if (in != null) {
try {
in.close();
} catch (IOException cantBlameMeForTrying) { }
}
}
}
/**
 * Copies the stage graph of a query plan into the configuration: for every adjacency
 * that has children, the child names are stored under
 * {@code mapreduce.workflow.adjacency.<node>}.
 *
 * <p>Adjacencies without children are skipped individually, so they no longer stop the
 * remaining adjacencies from being recorded. A failure to obtain the query plan is
 * logged and otherwise ignored, which keeps this call best-effort.
 *
 * @param conf configuration to write the adjacency entries into
 * @param plan query plan that provides the stage graph
 */
public static void setWorkflowAdjacencies(Configuration conf, QueryPlan plan) {
  try {
    Graph stageGraph = plan.getQueryPlan().getStageGraph();
    if (stageGraph == null) {
      return;
    }
    List<Adjacency> adjList = stageGraph.getAdjacencyList();
    if (adjList == null) {
      return;
    }
    for (Adjacency adj : adjList) {
      List<String> children = adj.getChildren();
      if (children == null || children.isEmpty()) {
        // Nothing to record for this node; keep going with the others.
        continue;
      }
      conf.setStrings("mapreduce.workflow.adjacency." + adj.getNode(),
          children.toArray(new String[children.size()]));
    }
  } catch (IOException e) {
    LOG.warn("Unable to set workflow adjacencies from the query plan", e);
  }
}
/**
 * Renders each field schema as {@code "<name> <type>"}, followed by a space and the
 * comment when the field has one.
 *
 * @param fl field schemas to render; may be null
 * @return one string per field in input order, or null when {@code fl} is null
 */
public static List<String> getFieldSchemaString(List<FieldSchema> fl) {
  if (fl == null) {
    return null;
  }
  ArrayList<String> ret = new ArrayList<String>();
  for (FieldSchema f : fl) {
    ret.add(f.getName() + " " + f.getType()
        + (f.getComment() != null ? (" " + f.getComment()) : ""));
  }
  return ret;
}
/**
* Java 1.5 workaround. From http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=5015403
*/
public static class EnumDelegate extends DefaultPersistenceDelegate {
@Override
protected Expression instantiate(Object oldInstance, Encoder out) {
return new Expression(Enum.class, "valueOf", new Object[] {oldInstance.getClass(),
((Enum>) oldInstance).name()});
}
@Override
protected boolean mutatesTo(Object oldInstance, Object newInstance) {
return oldInstance == newInstance;
}
}
public static class MapDelegate extends DefaultPersistenceDelegate {
@Override
protected Expression instantiate(Object oldInstance, Encoder out) {
Map oldMap = (Map) oldInstance;
HashMap newMap = new HashMap(oldMap);
return new Expression(newMap, HashMap.class, "new", new Object[] {});
}
@Override
protected boolean mutatesTo(Object oldInstance, Object newInstance) {
return false;
}
@Override
protected void initialize(Class> type, Object oldInstance, Object newInstance, Encoder out) {
java.util.Collection oldO = (java.util.Collection) oldInstance;
java.util.Collection newO = (java.util.Collection) newInstance;
if (newO.size() != 0) {
out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
}
for (Iterator i = oldO.iterator(); i.hasNext();) {
out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
}
}
}
public static class SetDelegate extends DefaultPersistenceDelegate {
@Override
protected Expression instantiate(Object oldInstance, Encoder out) {
Set oldSet = (Set) oldInstance;
HashSet newSet = new HashSet(oldSet);
return new Expression(newSet, HashSet.class, "new", new Object[] {});
}
@Override
protected boolean mutatesTo(Object oldInstance, Object newInstance) {
return false;
}
@Override
protected void initialize(Class> type, Object oldInstance, Object newInstance, Encoder out) {
java.util.Collection oldO = (java.util.Collection) oldInstance;
java.util.Collection newO = (java.util.Collection) newInstance;
if (newO.size() != 0) {
out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
}
for (Iterator i = oldO.iterator(); i.hasNext();) {
out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
}
}
}
public static class ListDelegate extends DefaultPersistenceDelegate {
@Override
protected Expression instantiate(Object oldInstance, Encoder out) {
List oldList = (List) oldInstance;
ArrayList newList = new ArrayList(oldList);
return new Expression(newList, ArrayList.class, "new", new Object[] {});
}
@Override
protected boolean mutatesTo(Object oldInstance, Object newInstance) {
return false;
}
@Override
protected void initialize(Class> type, Object oldInstance, Object newInstance, Encoder out) {
java.util.Collection oldO = (java.util.Collection) oldInstance;
java.util.Collection newO = (java.util.Collection) newInstance;
if (newO.size() != 0) {
out.writeStatement(new Statement(oldInstance, "clear", new Object[] {}));
}
for (Iterator i = oldO.iterator(); i.hasNext();) {
out.writeStatement(new Statement(oldInstance, "add", new Object[] {i.next()}));
}
}
}
/**
 * Persistence delegate for java.util.Date, which the default XML encoding does not
 * handle well. The date is written as a constructor call taking its epoch
 * milliseconds. Subclasses such as java.sql.Date are covered too, because the
 * constructor of the instance's own class is the one recorded.
 */
public static class DatePersistenceDelegate extends PersistenceDelegate {
  /** Records {@code new <runtime class>(getTime())} for the given date. */
  @Override
  protected Expression instantiate(Object oldInstance, Encoder out) {
    Date source = (Date) oldInstance;
    return new Expression(source, source.getClass(), "new", new Object[] {source.getTime()});
  }

  /** Two instances are interchangeable when both are non-null and of the same class. */
  @Override
  protected boolean mutatesTo(Object oldInstance, Object newInstance) {
    return oldInstance != null
        && newInstance != null
        && oldInstance.getClass() == newInstance.getClass();
  }
}
/**
* TimestampPersistenceDelegate. Needed to serialize java.sql.Timestamp since
* it is not serialization friendly.
*/
public static class TimestampPersistenceDelegate extends DatePersistenceDelegate {
@Override
protected void initialize(Class> type, Object oldInstance, Object newInstance, Encoder out) {
Timestamp ts = (Timestamp)oldInstance;
Object[] args = { ts.getNanos() };
Statement stmt = new Statement(oldInstance, "setNanos", args);
out.writeStatement(stmt);
}
}
/**
 * Persistence delegate for org.antlr.runtime.CommonToken: the token is written as a
 * constructor call taking its type and text.
 */
public static class CommonTokenDelegate extends PersistenceDelegate {
  @Override
  protected Expression instantiate(Object oldInstance, Encoder out) {
    CommonToken token = (CommonToken) oldInstance;
    return new Expression(token, token.getClass(), "new",
        new Object[] {token.getType(), token.getText()});
  }
}
/**
 * Persistence delegate for Path: the path is written as a constructor call taking
 * its string form.
 */
public static class PathDelegate extends PersistenceDelegate {
  @Override
  protected Expression instantiate(Object oldInstance, Encoder out) {
    Path source = (Path) oldInstance;
    return new Expression(source, source.getClass(), "new", new Object[] {source.toString()});
  }
}
/**
 * Serializes the map plan and, when present, the reduce plan of a MapredWork,
 * with caching enabled for both.
 *
 * @param conf configuration that receives the plan location
 * @param w work whose map and reduce plans are written
 * @param hiveScratchDir directory under which the plan directory is created
 */
public static void setMapRedWork(Configuration conf, MapredWork w, Path hiveScratchDir) {
  setMapWork(conf, w.getMapWork(), hiveScratchDir, true);
  // A job without a reduce phase has no reduce plan to write.
  if (w.getReduceWork() == null) {
    return;
  }
  setReduceWork(conf, w.getReduceWork(), hiveScratchDir, true);
}
/**
 * Serializes the map plan under the map plan name.
 *
 * @param conf configuration that receives the plan location
 * @param w map plan to write
 * @param hiveScratchDir directory under which the plan directory is created
 * @param useCache whether the plan file may be added to the distributed cache
 * @return path of the map plan
 */
public static Path setMapWork(Configuration conf, MapWork w, Path hiveScratchDir, boolean useCache) {
  Path mapPlanPath = setBaseWork(conf, w, hiveScratchDir, MAP_PLAN_NAME, useCache);
  return mapPlanPath;
}
/**
 * Serializes the reduce plan under the reduce plan name.
 *
 * @param conf configuration that receives the plan location
 * @param w reduce plan to write
 * @param hiveScratchDir directory under which the plan directory is created
 * @param useCache whether the plan file may be added to the distributed cache
 * @return path of the reduce plan
 */
public static Path setReduceWork(Configuration conf, ReduceWork w, Path hiveScratchDir, boolean useCache) {
  Path reducePlanPath = setBaseWork(conf, w, hiveScratchDir, REDUCE_PLAN_NAME, useCache);
  return reducePlanPath;
}
/**
 * Serializes a plan and makes it available to the job, then caches it in this process.
 *
 * <p>With {@code HIVE_RPC_QUERY_PLAN} enabled the plan is deflated, Base64-encoded and
 * stored in the configuration under the plan path. Otherwise it is written to the plan
 * path on that path's file system and, when {@code useCache} is set and the job is not
 * in local mode, registered in the distributed cache with a raised replication factor.
 *
 * @param conf configuration that receives the plan location (and the plan itself in RPC mode)
 * @param w plan to serialize
 * @param hiveScratchDir directory under which the plan directory is created if none is set yet
 * @param name plan file name
 * @param useCache whether the plan file may be added to the distributed cache
 * @return path of the plan
 * @throws RuntimeException wrapping any failure; the failure is logged first
 */
private static Path setBaseWork(Configuration conf, BaseWork w, Path hiveScratchDir, String name, boolean useCache) {
  // Non-null only between opening a stream and the serializer finishing with it, so
  // the finally block closes it exactly when serialization failed part-way.
  OutputStream out = null;
  try {
    setPlanPath(conf, hiveScratchDir);
    Path planPath = getPlanPath(conf, name);
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
      // add it to the conf
      ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
      out = new DeflaterOutputStream(byteOut, new Deflater(Deflater.BEST_SPEED));
      serializePlan(w, out, conf);
      out = null; // the serializer closes the stream when it completes
      LOG.info("Setting plan: "+planPath.toUri().getPath());
      conf.set(planPath.toUri().getPath(),
          Base64.encodeBase64String(byteOut.toByteArray()));
    } else {
      // use the default file system of the conf
      FileSystem fs = planPath.getFileSystem(conf);
      out = fs.create(planPath);
      serializePlan(w, out, conf);
      out = null; // the serializer closes the stream when it completes
      // Serialize the plan to the default hdfs instance
      // Except for hadoop local mode execution where we should be
      // able to get the plan directly from the cache
      if (useCache && !ShimLoader.getHadoopShims().isLocalMode(conf)) {
        // Set up distributed cache
        if (!DistributedCache.getSymlink(conf)) {
          DistributedCache.createSymlink(conf);
        }
        String uriWithLink = planPath.toUri().toString() + "#" + name;
        DistributedCache.addCacheFile(new URI(uriWithLink), conf);
        // set replication of the plan file to a high number. we use the same
        // replication factor as used by the hadoop jobclient for job.xml etc.
        short replication = (short) conf.getInt("mapred.submit.replication", 10);
        fs.setReplication(planPath, replication);
      }
    }
    // Cache the plan in this process
    gWorkMap.put(planPath, w);
    return planPath;
  } catch (Exception e) {
    LOG.error("Failed to serialize plan " + name, e);
    throw new RuntimeException(e);
  } finally {
    // Null-safe and never throws.
    IOUtils.closeStream(out);
  }
}
/**
 * Resolves the path of a named plan file inside the configured plan directory.
 *
 * @param conf configuration that carries the plan directory
 * @param name plan file name
 * @return the plan file path, or null when no plan directory is configured
 */
private static Path getPlanPath(Configuration conf, String name) {
  Path planDir = getPlanPath(conf);
  return (planDir == null) ? null : new Path(planDir, name);
}
/**
 * Creates a plan directory with a random UUID name under the scratch directory and
 * records it in the configuration, unless a plan directory is already configured.
 *
 * @param conf configuration to read and update
 * @param hiveScratchDir parent of the new plan directory
 * @throws IOException if the file system can not be obtained or the directory can not be created
 */
private static void setPlanPath(Configuration conf, Path hiveScratchDir) throws IOException {
  if (getPlanPath(conf) != null) {
    return;
  }
  // this is the unique conf ID, which is kept in JobConf as part of the plan file name
  String uniqueId = UUID.randomUUID().toString();
  Path planDir = new Path(hiveScratchDir, uniqueId);
  FileSystem planFs = planDir.getFileSystem(conf);
  planFs.mkdirs(planDir);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, planDir.toUri().toString());
}
/**
 * Reads the plan directory from the configuration.
 *
 * @param conf configuration to read
 * @return the configured plan directory, or null when the setting is null or empty
 */
public static Path getPlanPath(Configuration conf) {
  String configured = HiveConf.getVar(conf, HiveConf.ConfVars.PLAN);
  if (configured == null || configured.isEmpty()) {
    return null;
  }
  return new Path(configured);
}
/**
 * Serializes an expression with this thread's runtime Kryo instance.
 * @param expr Expression to serialize.
 * @return The serialized bytes.
 */
public static byte[] serializeExpressionToKryo(ExprNodeGenericFuncDesc expr) {
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  Output kryoOut = new Output(buffer);
  runtimeSerializationKryo.get().writeObject(kryoOut, expr);
  // Closing flushes the Kryo buffer into the byte array stream.
  kryoOut.close();
  return buffer.toByteArray();
}
/**
 * Reads an expression back from bytes with this thread's runtime Kryo instance.
 * @param bytes Bytes containing the serialized expression.
 * @return The deserialized expression.
 */
public static ExprNodeGenericFuncDesc deserializeExpressionFromKryo(byte[] bytes) {
  Input kryoIn = new Input(new ByteArrayInputStream(bytes));
  ExprNodeGenericFuncDesc expression =
      runtimeSerializationKryo.get().readObject(kryoIn, ExprNodeGenericFuncDesc.class);
  kryoIn.close();
  return expression;
}
/**
 * Serializes an expression to text: Kryo bytes, Base64-encoded, as a UTF-8 string.
 *
 * @param expr expression to serialize
 * @return the Base64 text
 * @throws RuntimeException if UTF-8 is not supported
 */
public static String serializeExpression(ExprNodeGenericFuncDesc expr) {
  byte[] encoded = Base64.encodeBase64(serializeExpressionToKryo(expr));
  try {
    return new String(encoded, "UTF-8");
  } catch (UnsupportedEncodingException ex) {
    throw new RuntimeException("UTF-8 support required", ex);
  }
}
/**
 * Reverses serializeExpression: Base64-decodes the UTF-8 bytes of the text and
 * deserializes the result with Kryo.
 *
 * @param s Base64 text
 * @return the deserialized expression
 * @throws RuntimeException if UTF-8 is not supported
 */
public static ExprNodeGenericFuncDesc deserializeExpression(String s) {
  try {
    return deserializeExpressionFromKryo(Base64.decodeBase64(s.getBytes("UTF-8")));
  } catch (UnsupportedEncodingException ex) {
    throw new RuntimeException("UTF-8 support required", ex);
  }
}
/**
 * Persistence delegate that writes a collection as a no-argument constructor call of
 * its own class followed by one {@code add} statement per element.
 */
public static class CollectionPersistenceDelegate extends DefaultPersistenceDelegate {
  /** Records {@code new <runtime class>()} for the collection. */
  @Override
  protected Expression instantiate(Object oldInstance, Encoder out) {
    return new Expression(oldInstance, oldInstance.getClass(), "new", null);
  }

  /** Writes an {@code add} statement for every element, in iteration order. */
  @Override
  protected void initialize(Class type, Object oldInstance, Object newInstance, Encoder out) {
    for (Object element : (Collection) oldInstance) {
      out.writeStatement(new Statement(oldInstance, "add", new Object[] {element}));
    }
  }
}
/**
 * Kryo serializer for timestamp: writes the epoch milliseconds as a long followed by
 * the nanosecond field as an int, and reads them back in the same order.
 */
private static class TimestampSerializer extends
    com.esotericsoftware.kryo.Serializer<Timestamp> {
  @Override
  public Timestamp read(Kryo kryo, Input input, Class<Timestamp> clazz) {
    Timestamp ts = new Timestamp(input.readLong());
    ts.setNanos(input.readInt());
    return ts;
  }

  @Override
  public void write(Kryo kryo, Output output, Timestamp ts) {
    output.writeLong(ts.getTime());
    output.writeInt(ts.getNanos());
  }
}
/**
 * Custom Kryo serializer for sql date, otherwise Kryo gets confused between
 * java.sql.Date and java.util.Date while deserializing. The date is stored as its
 * epoch milliseconds.
 */
private static class SqlDateSerializer extends
    com.esotericsoftware.kryo.Serializer<java.sql.Date> {
  @Override
  public java.sql.Date read(Kryo kryo, Input input, Class<java.sql.Date> clazz) {
    return new java.sql.Date(input.readLong());
  }

  @Override
  public void write(Kryo kryo, Output output, java.sql.Date sqlDate) {
    output.writeLong(sqlDate.getTime());
  }
}
/**
 * Kryo serializer for CommonToken: stores the token type as an int followed by the
 * token text, and rebuilds the token from those two values.
 */
private static class CommonTokenSerializer extends com.esotericsoftware.kryo.Serializer<CommonToken> {
  @Override
  public CommonToken read(Kryo kryo, Input input, Class<CommonToken> clazz) {
    return new CommonToken(input.readInt(), input.readString());
  }

  @Override
  public void write(Kryo kryo, Output output, CommonToken token) {
    output.writeInt(token.getType());
    output.writeString(token.getText());
  }
}
/**
 * Kryo serializer for Path: stores the string form of the path's URI and rebuilds
 * the path from that URI.
 */
private static class PathSerializer extends com.esotericsoftware.kryo.Serializer<Path> {
  @Override
  public void write(Kryo kryo, Output output, Path path) {
    output.writeString(path.toUri().toString());
  }

  @Override
  public Path read(Kryo kryo, Input input, Class<Path> type) {
    return new Path(URI.create(input.readString()));
  }
}
public static Set> cloneOperatorTree(Configuration conf, Set> roots) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
serializePlan(roots, baos, conf, true);
Set> result = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
roots.getClass(), conf, true);
return result;
}
/**
 * Serializes a plan in the format named by the plan-serialization setting
 * ("kryo" when unset). The work is bracketed by perf-logger begin/end calls.
 *
 * @param plan object to serialize
 * @param out stream to write to
 * @param conf configuration that selects the serialization format
 * @param cloningPlan true to use the Kryo instance reserved for cloning query plans,
 *        false to use the runtime one; ignored for the javaXML format
 */
private static void serializePlan(Object plan, OutputStream out, Configuration conf, boolean cloningPlan) {
  PerfLogger perfLogger = PerfLogger.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
  String format = conf.get(HiveConf.ConfVars.PLAN_SERIALIZATION.varname, "kryo");
  LOG.info("Serializing " + plan.getClass().getSimpleName() + " via " + format);
  if ("javaXML".equalsIgnoreCase(format)) {
    serializeObjectByJavaXML(plan, out);
  } else if (cloningPlan) {
    serializeObjectByKryo(cloningQueryPlanKryo.get(), plan, out);
  } else {
    serializeObjectByKryo(runtimeSerializationKryo.get(), plan, out);
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
}
/**
 * Serializes the plan with the runtime (non-cloning) serializer.
 * @param plan The plan, such as QueryPlan, MapredWork, etc.
 * @param out The stream to write to.
 * @param conf Selects which serialization format is used.
 */
public static void serializePlan(Object plan, OutputStream out, Configuration conf) {
  final boolean cloningPlan = false;
  serializePlan(plan, out, conf, cloningPlan);
}
/**
 * Deserializes a plan in the format named by the plan-serialization setting
 * ("kryo" when unset). The work is bracketed by perf-logger begin/end calls.
 *
 * @param <T> type of the plan
 * @param in stream to read from
 * @param planClass class of the plan; passed to Kryo, used only for logging with javaXML
 * @param conf configuration that selects the serialization format
 * @param cloningPlan true to use the Kryo instance reserved for cloning query plans,
 *        false to use the runtime one; ignored for the javaXML format
 * @return the deserialized plan
 */
private static <T> T deserializePlan(InputStream in, Class<T> planClass, Configuration conf, boolean cloningPlan) {
  PerfLogger perfLogger = PerfLogger.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  T plan;
  String serializationType = conf.get(HiveConf.ConfVars.PLAN_SERIALIZATION.varname, "kryo");
  LOG.info("Deserializing " + planClass.getSimpleName() + " via " + serializationType);
  if ("javaXML".equalsIgnoreCase(serializationType)) {
    plan = Utilities.<T>deserializeObjectByJavaXML(in);
  } else {
    if (cloningPlan) {
      plan = deserializeObjectByKryo(cloningQueryPlanKryo.get(), in, planClass);
    } else {
      plan = deserializeObjectByKryo(runtimeSerializationKryo.get(), in, planClass);
    }
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  return plan;
}
/**
 * Deserializes the plan with the runtime (non-cloning) serializer.
 * @param <T> type of the plan
 * @param in The stream to read from.
 * @param planClass class of plan
 * @param conf configuration
 * @return The plan, such as QueryPlan, MapredWork, etc.
 */
public static <T> T deserializePlan(InputStream in, Class<T> planClass, Configuration conf) {
  return deserializePlan(in, planClass, conf, false);
}
/**
 * Clones a plan by serializing it to memory and deserializing it again. The format
 * comes from a fresh HiveConf and the cloning serializer is used. Do not use unless
 * necessary.
 * @param plan The plan.
 * @return The clone.
 */
public static MapredWork clonePlan(MapredWork plan) {
  // TODO: need proper clone. Meanwhile, let's at least keep this horror in one place
  PerfLogger perfLogger = PerfLogger.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
  ByteArrayOutputStream serialized = new ByteArrayOutputStream(4096);
  Configuration cloneConf = new HiveConf();
  serializePlan(plan, serialized, cloneConf, true);
  ByteArrayInputStream source = new ByteArrayInputStream(serialized.toByteArray());
  MapredWork copy = deserializePlan(source, MapredWork.class, cloneConf, true);
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
  return copy;
}
/**
 * Writes the object with java.beans.XMLEncoder and closes the encoder. Persistence
 * delegates are registered for the enum, date/timestamp, DataNucleus map/list,
 * CommonToken and Path types so that they are encoded properly. Any problem reported
 * by the encoder is logged and rethrown as a RuntimeException.
 */
private static void serializeObjectByJavaXML(Object plan, OutputStream out) {
  XMLEncoder encoder = new XMLEncoder(out);
  ExceptionListener failFast = new ExceptionListener() {
    @Override
    public void exceptionThrown(Exception cause) {
      LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(cause));
      throw new RuntimeException("Cannot serialize object", cause);
    }
  };
  encoder.setExceptionListener(failFast);
  // workaround for java 1.5
  encoder.setPersistenceDelegate(ExpressionTypes.class, new EnumDelegate());
  encoder.setPersistenceDelegate(GroupByDesc.Mode.class, new EnumDelegate());
  encoder.setPersistenceDelegate(java.sql.Date.class, new DatePersistenceDelegate());
  encoder.setPersistenceDelegate(Timestamp.class, new TimestampPersistenceDelegate());
  encoder.setPersistenceDelegate(org.datanucleus.store.types.backed.Map.class, new MapDelegate());
  encoder.setPersistenceDelegate(org.datanucleus.store.types.backed.List.class, new ListDelegate());
  encoder.setPersistenceDelegate(CommonToken.class, new CommonTokenDelegate());
  encoder.setPersistenceDelegate(Path.class, new PathDelegate());
  encoder.writeObject(plan);
  encoder.close();
}
/**
 * Writes the plan with the given Kryo instance and closes the Kryo output.
 * @param kryo Kryo instance to serialize with
 * @param plan Usually of type MapredWork, MapredLocalWork etc.
 * @param out stream into which the serialized plan is written
 */
private static void serializeObjectByKryo(Kryo kryo, Object plan, OutputStream out) {
  Output kryoOut = new Output(out);
  kryo.writeObject(kryoOut, plan);
  kryoOut.close();
}
/**
 * Reads one object from the stream with java.beans.XMLDecoder and closes the decoder.
 * The result is cast unchecked to the type the caller expects.
 *
 * <p>NOTE(review): XMLDecoder can invoke arbitrary methods named in its input, so the
 * stream must come from a trusted source - confirm for all callers.
 *
 * @param <T> type the caller expects
 * @param in stream containing the XML-encoded object
 * @return the decoded object
 */
@SuppressWarnings("unchecked")
private static <T> T deserializeObjectByJavaXML(InputStream in) {
  XMLDecoder d = null;
  try {
    d = new XMLDecoder(in, null, null);
    return (T) d.readObject();
  } finally {
    if (null != d) {
      d.close();
    }
  }
}
/**
 * Reads one object of the given class with the given Kryo instance and closes the
 * Kryo input.
 *
 * @param <T> type of the object
 * @param kryo Kryo instance to deserialize with
 * @param in stream containing the serialized object
 * @param clazz class of the object to read
 * @return the deserialized object
 */
private static <T> T deserializeObjectByKryo(Kryo kryo, InputStream in, Class<T> clazz ) {
  Input inp = new Input(in);
  T t = kryo.readObject(inp, clazz);
  inp.close();
  return t;
}
/**
 * Per-thread Kryo instance for runtime plan and expression serialization.
 * Kryo is not thread-safe, and new Kryo() is expensive, so each thread creates its
 * instance once. The instance registers custom serializers for sql Date, Timestamp
 * and Path, and drops a few fields of Operator, ColumnInfo and MapWork from
 * serialization.
 */
public static ThreadLocal<Kryo> runtimeSerializationKryo = new ThreadLocal<Kryo>() {
  @Override
  protected synchronized Kryo initialValue() {
    Kryo kryo = new Kryo();
    kryo.setClassLoader(Thread.currentThread().getContextClassLoader());
    kryo.register(java.sql.Date.class, new SqlDateSerializer());
    kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
    kryo.register(Path.class, new PathSerializer());
    kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
    removeField(kryo, Operator.class, "colExprMap");
    removeField(kryo, ColumnInfo.class, "objectInspector");
    removeField(kryo, MapWork.class, "opParseCtxMap");
    removeField(kryo, MapWork.class, "joinTree");
    return kryo;
  }
};
/**
 * Registers a field serializer for the type that omits the named field.
 *
 * @param kryo Kryo instance to register the serializer with
 * @param type class whose field is dropped
 * @param fieldName name of the field to leave out
 */
@SuppressWarnings("rawtypes")
protected static void removeField(Kryo kryo, Class type, String fieldName) {
  FieldSerializer withoutField = new FieldSerializer(kryo, type);
  withoutField.removeField(fieldName);
  kryo.register(type, withoutField);
}
/**
 * Per-thread Kryo instance used when cloning query plans. It registers the
 * CommonToken serializer in addition to the sql Date, Timestamp and Path ones, and
 * does not drop any fields.
 */
private static ThreadLocal<Kryo> cloningQueryPlanKryo = new ThreadLocal<Kryo>() {
  @Override
  protected synchronized Kryo initialValue() {
    Kryo kryo = new Kryo();
    kryo.setClassLoader(Thread.currentThread().getContextClassLoader());
    kryo.register(CommonToken.class, new CommonTokenSerializer());
    kryo.register(java.sql.Date.class, new SqlDateSerializer());
    kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
    kryo.register(Path.class, new PathSerializer());
    kryo.setInstantiatorStrategy(new StdInstantiatorStrategy());
    return kryo;
  }
};
// Default table descriptor, built once when the class is initialized.
public static TableDesc defaultTd;
static {
// By default we expect ^A separated strings.
// This tableDesc does not provide column names. We should always use
// PlanUtils.getDefaultTableDesc(String separatorCode, String columns)
// or getBinarySortableTableDesc(List fieldSchemas) when
// we know the column names.
// The separator is passed as the decimal string form of ctrlaCode.
defaultTd = PlanUtils.getDefaultTableDesc("" + Utilities.ctrlaCode);
}
// Character codes used when reading and writing delimited data.
// Carriage return (13).
public static final int carriageReturnCode = 13;
// Line feed (10); readColumn treats it as the terminator.
public static final int newLineCode = 10;
// Horizontal tab (9).
public static final int tabCode = 9;
// Ctrl-A (1), the default field separator used for defaultTd.
public static final int ctrlaCode = 1;
public static final String INDENT = " ";
// Note: When DDL supports specifying what string to represent null,
// we should specify "NULL" to represent null in the temp table, and then
// we can make the following translation deprecated.
// Stored form of null: a backslash followed by N.
public static String nullStringStorage = "\\N";
// Printed form of null.
public static String nullStringOutput = "NULL";
// Shared random source; getTaskId uses it when no Hadoop task id is configured.
public static Random randGen = new Random();
/**
 * Gets the task id if we are running as a Hadoop job. Gets a random non-negative number
 * otherwise.
 *
 * @param hconf
 *          job configuration; may be null, in which case a random id is returned
 * @return the task and attempt portion of {@code mapred.task.id}, or a random non-negative
 *         integer rendered as a string when that property is missing or empty
 */
public static String getTaskId(Configuration hconf) {
  String taskid = (hconf == null) ? null : hconf.get("mapred.task.id");
  if ((taskid == null) || taskid.equals("")) {
    // Math.abs(nextInt()) stays negative for Integer.MIN_VALUE; a bounded draw never is.
    return String.valueOf(randGen.nextInt(Integer.MAX_VALUE));
  }
  /*
   * extract the task and attempt id from the hadoop taskid. in version 17 the leading component
   * was 'task_'. thereafter the leading component is 'attempt_'. in 17 - hadoop also seems to
   * have used _map_ and _reduce_ to denote map/reduce task types
   */
  return taskid.replaceAll(".*_[mr]_", "").replaceAll(".*_(map|reduce)_", "");
}
/**
 * Builds a map from a flat argument list of alternating keys and values.
 *
 * @param olist
 *          key1, value1, key2, value2, ...
 * @return a new HashMap holding the given pairs
 */
public static HashMap makeMap(Object... olist) {
  HashMap pairs = new HashMap();
  int idx = 0;
  while (idx < olist.length) {
    Object key = olist[idx];
    Object value = olist[idx + 1];
    pairs.put(key, value);
    idx += 2;
  }
  return pairs;
}
/**
 * Builds a Properties object from a flat argument list of alternating names and values.
 *
 * @param olist
 *          name1, value1, name2, value2, ...
 * @return a new Properties holding the given pairs
 */
public static Properties makeProperties(String... olist) {
  Properties props = new Properties();
  int idx = 0;
  while (idx < olist.length) {
    String name = olist[idx];
    String value = olist[idx + 1];
    props.setProperty(name, value);
    idx += 2;
  }
  return props;
}
/**
 * Collects the arguments, in order, into a new ArrayList.
 *
 * @param olist
 *          elements of the list
 * @return a new ArrayList containing the arguments
 */
public static ArrayList makeList(Object... olist) {
  return new ArrayList(Arrays.asList(olist));
}
/**
 * Thread that copies an input stream to a print stream line by line. When a type is given,
 * every line is prefixed with the type followed by ">".
 */
public static class StreamPrinter extends Thread {
  InputStream is;
  String type;
  PrintStream os;

  public StreamPrinter(InputStream is, String type, PrintStream os) {
    this.is = is;
    this.type = type;
    this.os = os;
  }

  @Override
  public void run() {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new InputStreamReader(is));
      // The prefix is decided once, before any line is read.
      final String prefix = (type == null) ? null : type + ">";
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        os.println(prefix == null ? line : prefix + line);
      }
      reader.close();
      // Already closed; nothing is left for the finally block to close.
      reader = null;
    } catch (IOException ioe) {
      ioe.printStackTrace();
    } finally {
      IOUtils.closeStream(reader);
    }
  }
}
/**
 * Builds a TableDesc from a table's input/output format classes and its metadata.
 * The object returned by {@code tbl.getMetadata()} is updated in place with the
 * deserializer class name before being handed to the TableDesc.
 *
 * @param tbl
 *          table to describe
 * @return the table descriptor
 */
public static TableDesc getTableDesc(Table tbl) {
  Properties props = tbl.getMetadata();
  String serdeClassName = tbl.getDeserializer().getClass().getName();
  props.put(serdeConstants.SERIALIZATION_LIB, serdeClassName);
  return new TableDesc(tbl.getInputFormatClass(), tbl.getOutputFormatClass(), props);
}
/**
 * Builds a sequence-file TableDesc that uses LazySimpleSerDe with ^A as the
 * serialization format.
 *
 * @param cols
 *          comma-delimited column names
 * @param colTypes
 *          comma-delimited column types
 * @return the table descriptor
 */
public static TableDesc getTableDesc(String cols, String colTypes) {
  Properties props = Utilities.makeProperties(
      serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode,
      serdeConstants.LIST_COLUMNS, cols,
      serdeConstants.LIST_COLUMN_TYPES, colTypes,
      serdeConstants.SERIALIZATION_LIB, LazySimpleSerDe.class.getName());
  return new TableDesc(SequenceFileInputFormat.class, HiveSequenceFileOutputFormat.class, props);
}
/**
 * Wraps a partition in a new PartitionDesc.
 *
 * @param part
 *          partition to describe
 * @return the partition descriptor
 * @throws HiveException
 *           declared for the PartitionDesc constructor
 */
public static PartitionDesc getPartitionDesc(Partition part) throws HiveException {
  PartitionDesc desc = new PartitionDesc(part);
  return desc;
}
/**
 * Builds a PartitionDesc for a partition using an already available table descriptor.
 *
 * @param tblDesc
 *          table descriptor to use
 * @param part
 *          partition to describe
 * @return the partition descriptor
 * @throws HiveException
 *           declared for the PartitionDesc constructor
 */
public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part)
    throws HiveException {
  PartitionDesc desc = new PartitionDesc(part, tblDesc);
  return desc;
}
// Recursively renders the operator tree rooted at op, one operator per line, each
// nesting level prefixed with a longer indent. Returns "" for a null operator.
private static String getOpTreeSkel_helper(Operator> op, String indent) {
if (op == null) {
return "";
}
StringBuilder sb = new StringBuilder();
sb.append(indent);
sb.append(op.toString());
sb.append("\n");
// Children, when present, are rendered below their parent with the indent extended.
if (op.getChildOperators() != null) {
for (Object child : op.getChildOperators()) {
sb.append(getOpTreeSkel_helper((Operator>) child, indent + " "));
}
}
return sb.toString();
}
// Returns a textual skeleton of the operator tree rooted at op, starting with no indent.
public static String getOpTreeSkel(Operator> op) {
return getOpTreeSkel_helper(op, "");
}
/**
 * Tells whether a value returned by {@code InputStream.read()} is a whitespace character.
 * The end-of-stream marker (-1) is never whitespace.
 */
private static boolean isWhitespace(int c) {
  return c != -1 && Character.isWhitespace((char) c);
}
/**
 * Compares two streams byte by byte, optionally skipping whitespace on both sides.
 * The streams are not closed here.
 *
 * @param is1
 *          first stream, may be null
 * @param is2
 *          second stream, may be null
 * @param ignoreWhitespace
 *          when true, whitespace bytes are skipped before each comparison
 * @return true when both references are the same (including both null) or both streams yield
 *         the same sequence of compared bytes; false otherwise, including when a
 *         FileNotFoundException is caught
 * @throws IOException
 *           on any other read failure
 */
public static boolean contentsEqual(InputStream is1, InputStream is2, boolean ignoreWhitespace)
    throws IOException {
  try {
    // Identity covers the "both null" case as well.
    if (is1 == is2) {
      return true;
    }
    if (is1 == null || is2 == null) {
      return false;
    }
    int left;
    int right;
    do {
      left = is1.read();
      while (ignoreWhitespace && isWhitespace(left)) {
        left = is1.read();
      }
      right = is2.read();
      while (ignoreWhitespace && isWhitespace(right)) {
        right = is2.read();
      }
      if (left == -1 && right == -1) {
        return true;
      }
    } while (left == right);
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  }
  return false;
}
/**
 * Shortens a string by keeping its beginning and its last few words, e.g. converts
 * "From src insert blah blah" to "From src insert ... blah".
 *
 * @param str
 *          string to shorten; it is trimmed first
 * @param max
 *          length above which the trimmed string is abbreviated
 * @return the trimmed string when its length is at most {@code max}, otherwise an
 *         abbreviated prefix followed by a suffix taken from the end of the string
 */
public static String abbreviate(String str, int max) {
  String trimmed = str.trim();
  if (trimmed.length() <= max) {
    return trimmed;
  }
  // The suffix gets at most 20 characters, and no more than half of what is left after "...".
  int suffixBudget = Math.min(20, (max - 3) / 2);
  // Abbreviating the reversed string selects the last few words; reverse back afterwards.
  String reversedTail = WordUtils.abbreviate(StringUtils.reverse(trimmed), 0, suffixBudget, "");
  String suffix = StringUtils.reverse(reversedTail);
  // The prefix takes whatever length remains.
  String prefix = StringUtils.abbreviate(trimmed, max - suffix.length());
  return prefix + suffix;
}
// Empty-string constant.
public static final String NSTR = "";
/**
 * StreamStatus: result of {@code readColumn}.
 *
 */
public static enum StreamStatus {
// EOF: the input ended before a newline was seen.
// TERMINATED: a newline was read.
EOF, TERMINATED
}
/**
 * Copies bytes from {@code in} to {@code out} until a newline or the end of the input.
 * The newline itself is not written. On Windows a CR directly followed by LF is dropped as
 * well; a CR followed by anything else, including end of input, is written as data.
 *
 * @param in
 *          input to read bytes from
 * @param out
 *          stream receiving the bytes that were read
 * @return {@code TERMINATED} when a newline was read, {@code EOF} when the input ended first
 * @throws IOException
 *           on read or write failure other than reaching end of input
 */
public static StreamStatus readColumn(DataInput in, OutputStream out) throws IOException {
  boolean foundCrChar = false;
  while (true) {
    int b;
    try {
      b = in.readByte();
    } catch (EOFException e) {
      // A CR held back waiting for a possible LF is plain data once the input has ended.
      if (foundCrChar) {
        out.write(Utilities.carriageReturnCode);
      }
      return StreamStatus.EOF;
    }
    // Default new line characters on windows are "CRLF" so detect if there are any windows
    // native newline characters and handle them.
    if (Shell.WINDOWS) {
      // if the CR is not followed by the LF on windows then add it back to the stream and
      // proceed with next characters in the input stream.
      if (foundCrChar && b != Utilities.newLineCode) {
        out.write(Utilities.carriageReturnCode);
        foundCrChar = false;
      }
      if (b == Utilities.carriageReturnCode) {
        // Hold the CR back until the next byte shows whether it is part of CRLF.
        foundCrChar = true;
        continue;
      }
    }
    if (b == Utilities.newLineCode) {
      return StreamStatus.TERMINATED;
    }
    out.write(b);
  }
  // Unreachable
}
/**
 * Wraps an output stream in a compressed output stream when the job configuration asks for
 * compressed output; the codec is taken from the job configuration as well.
 *
 * @param jc
 *          Job Configuration
 * @param out
 *          Output Stream to be converted into compressed output stream
 * @return the compressed output stream, or {@code out} itself when compression is off
 * @throws IOException
 *           declared for the three-argument overload this delegates to
 */
public static OutputStream createCompressedStream(JobConf jc, OutputStream out)
    throws IOException {
  return createCompressedStream(jc, out, FileOutputFormat.getCompressOutput(jc));
}
/**
* Convert an output stream to a compressed output stream based on the codec configured in the
* Job Configuration. The caller specifies directly whether the stream is compressed or not.
*
* @param jc
* Job Configuration
* @param out
* Output Stream to be converted into compressed output stream
* @param isCompressed
* whether the output stream needs to be compressed or not
* @return compressed output stream, or {@code out} itself when isCompressed is false
*/
public static OutputStream createCompressedStream(JobConf jc, OutputStream out,
boolean isCompressed) throws IOException {
if (isCompressed) {
// DefaultCodec is the fallback passed to getOutputCompressorClass.
Class extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
DefaultCodec.class);
CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jc);
return codec.createOutputStream(out);
} else {
return (out);
}
}
/**
 * Based on compression option and configured output codec - get extension for output file.
 * This is only required for text files - not sequencefiles. Delegates to the three-argument
 * overload with a {@code HiveIgnoreKeyTextOutputFormat}.
 *
 * @param jc
 *          Job Configuration
 * @param isCompressed
 *          Whether the output file is compressed or not
 * @return the required file extension (example: .gz)
 * @deprecated Use {@link #getFileExtension(JobConf, boolean, HiveOutputFormat)}
 */
@Deprecated
public static String getFileExtension(JobConf jc, boolean isCompressed) {
  HiveIgnoreKeyTextOutputFormat textFormat = new HiveIgnoreKeyTextOutputFormat();
  return getFileExtension(jc, isCompressed, textFormat);
}
/**
* Based on compression option, output format, and configured output codec -
* get extension for output file. Text files require an extension, whereas
* others, like sequence files, do not.
*
* The property {@code hive.output.file.extension} is used to determine
* the extension - if set, it will override other logic for choosing an
* extension.
*
* @param jc
* Job Configuration
* @param isCompressed
* Whether the output file is compressed or not
* @param hiveOutputFormat
* The output format, used to detect if the format is text
* @return the required file extension (example: .gz); the empty string when none applies
*/
public static String getFileExtension(JobConf jc, boolean isCompressed,
HiveOutputFormat, ?> hiveOutputFormat) {
// An explicitly configured extension wins over everything else.
String extension = HiveConf.getVar(jc, HiveConf.ConfVars.OUTPUT_FILE_EXTENSION);
if (!StringUtils.isEmpty(extension)) {
return extension;
}
// Only compressed text output gets the codec's default extension.
if ((hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) && isCompressed) {
Class extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
DefaultCodec.class);
CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jc);
return codec.getDefaultExtension();
}
return "";
}
/**
* Create a sequencefile output stream based on job configuration. Whether the output is
* compressed is read from the job configuration.
*
* @param jc
* Job configuration
* @param fs
* File System to create file in
* @param file
* Path to be created
* @param keyClass
* Java Class for key
* @param valClass
* Java Class for value
* @return output stream over the created sequencefile
*/
public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
Class> keyClass, Class> valClass) throws IOException {
boolean isCompressed = FileOutputFormat.getCompressOutput(jc);
return createSequenceWriter(jc, fs, file, keyClass, valClass, isCompressed);
}
/**
* Create a sequencefile output stream based on job configuration. Uses the user supplied
* compression flag (rather than obtaining it from the Job Configuration).
*
* @param jc
* Job configuration
* @param fs
* File System to create file in
* @param file
* Path to be created
* @param keyClass
* Java Class for key
* @param valClass
* Java Class for value
* @param isCompressed
* whether the sequencefile should be compressed
* @return output stream over the created sequencefile
*/
public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
Class> keyClass, Class> valClass, boolean isCompressed) throws IOException {
// Without compression the writer gets CompressionType.NONE and a null codec.
CompressionCodec codec = null;
CompressionType compressionType = CompressionType.NONE;
Class codecClass = null;
if (isCompressed) {
// Compression type and codec both come from the job configuration; DefaultCodec is the fallback.
compressionType = SequenceFileOutputFormat.getOutputCompressionType(jc);
codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, jc);
}
return (SequenceFile.createWriter(fs, jc, file, keyClass, valClass, compressionType, codec));
}
/**
* Create a RCFile output stream based on job configuration. Uses the user supplied compression
* flag (rather than obtaining it from the Job Configuration).
*
* @param jc
* Job configuration
* @param fs
* File System to create file in
* @param file
* Path to be created
* @param isCompressed
* whether the rcfile should be compressed
* @return output stream over the created rcfile
*/
public static RCFile.Writer createRCFileWriter(JobConf jc, FileSystem fs, Path file,
boolean isCompressed) throws IOException {
// The codec stays null when compression is off.
CompressionCodec codec = null;
Class> codecClass = null;
if (isCompressed) {
codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, jc);
}
// NOTE(review): the fourth argument is passed as null - presumably an optional progress
// callback; confirm against the RCFile.Writer constructor.
return new RCFile.Writer(fs, jc, file, null, codec);
}
/**
 * Resolves a file name to a fully qualified path string. Shamelessly cloned from
 * GenericOptionsParser.
 *
 * @param newFile
 *          file name; the local file system is used when it carries no scheme
 * @param conf
 *          configuration used to look up the file system
 * @return the qualified path, with a {@code file:/} prefix normalized to {@code file:///},
 *         or null when the file does not exist
 * @throws IOException
 *           on file system access failure
 */
public static String realFile(String newFile, Configuration conf) throws IOException {
  Path path = new Path(newFile);
  boolean hasScheme = path.toUri().getScheme() != null;
  FileSystem fs = hasScheme ? path.getFileSystem(conf) : FileSystem.getLocal(conf);
  if (!fs.exists(path)) {
    return null;
  }
  String qualified = path.makeQualified(fs).toString();
  // For compatibility with hadoop 0.17, change file:/a/b/c to file:///a/b/c
  boolean singleSlash = StringUtils.startsWith(qualified, "file:/")
      && !StringUtils.startsWith(qualified, "file:///");
  if (singleSlash) {
    qualified = "file:///" + qualified.substring("file:/".length());
  }
  return qualified;
}
/**
 * Appends to {@code src} every element of {@code dest} that {@code src} does not already
 * contain.
 *
 * @param src
 *          list that is extended in place
 * @param dest
 *          list whose missing elements are appended to {@code src}
 * @return {@code src} after the merge; {@code src} when {@code dest} is null, otherwise
 *         {@code dest} when {@code src} is null
 */
public static List mergeUniqElems(List src, List dest) {
  if (dest == null) {
    return src;
  }
  if (src == null) {
    return dest;
  }
  for (Object candidate : dest) {
    if (!src.contains(candidate)) {
      src.add(candidate);
    }
  }
  return src;
}
// Name prefix that toTempPath puts in front of the last path component.
private static final String tmpPrefix = "_tmp.";
// Name prefix that toTaskTempPath puts in front of the last path component.
private static final String taskTmpPrefix = "_task_tmp.";
/**
 * Given a path, convert to a task-temporary path by prefixing its last component with
 * "_task_tmp.". A path whose name already starts with that prefix is returned as is.
 */
public static Path toTaskTempPath(Path orig) {
  String name = orig.getName();
  if (name.startsWith(taskTmpPrefix)) {
    return orig;
  }
  return new Path(orig.getParent(), taskTmpPrefix + orig.getName());
}
/**
 * Given a path, convert to a temporary path by prefixing its last component with "_tmp.".
 * A path whose name already starts with that prefix is returned as is.
 */
public static Path toTempPath(Path orig) {
  String name = orig.getName();
  if (name.startsWith(tmpPrefix)) {
    return orig;
  }
  return new Path(orig.getParent(), tmpPrefix + orig.getName());
}
/**
 * Given a path string, convert it to a temporary path.
 *
 * @param orig
 *          path in string form
 * @return the temporary path, as computed by {@code toTempPath(Path)}
 */
public static Path toTempPath(String orig) {
  Path origPath = new Path(orig);
  return toTempPath(origPath);
}
/**
 * Detect if the supplied file is a temporary path.
 *
 * @param file
 *          file status to inspect
 * @return true when the file name starts with "_task" or with the hive temporary prefix
 */
public static boolean isTempPath(FileStatus file) {
  String name = file.getPath().getName();
  // in addition to detecting hive temporary files, we also check hadoop
  // temporary folders that used to show up in older releases
  if (name.startsWith("_task")) {
    return true;
  }
  return name.startsWith(tmpPrefix);
}
/**
 * Rename src to dst with a single file system rename call.
 *
 * @param fs
 *          the FileSystem where src and dst are on.
 * @param src
 *          the path to rename
 * @param dst
 *          the new path
 * @throws IOException
 *           when the file system call itself fails
 * @throws HiveException
 *           when the file system reports that the rename did not succeed
 */
public static void rename(FileSystem fs, Path src, Path dst) throws IOException, HiveException {
  boolean renamed = fs.rename(src, dst);
  if (!renamed) {
    throw new HiveException("Unable to move: " + src + " to: " + dst);
  }
}
/**
 * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
 * existing file with the same name, the new file's name will be appended with "_1", "_2", etc.
 * Sub-directories of src are merged into dst recursively.
 *
 * @param fs
 *          the FileSystem where src and dst are on.
 * @param src
 *          the src directory
 * @param dst
 *          the target directory
 * @throws IOException
 *           when a file system call fails
 * @throws HiveException
 *           when the file system reports that a rename did not succeed
 */
public static void renameOrMoveFiles(FileSystem fs, Path src, Path dst) throws IOException,
    HiveException {
  if (!fs.exists(dst)) {
    if (!fs.rename(src, dst)) {
      throw new HiveException("Unable to move: " + src + " to: " + dst);
    }
    return;
  }
  // dst exists: move file by file
  FileStatus[] files = fs.listStatus(src);
  for (FileStatus file : files) {
    Path srcFilePath = file.getPath();
    String fileName = srcFilePath.getName();
    Path dstFilePath = new Path(dst, fileName);
    if (file.isDir()) {
      renameOrMoveFiles(fs, srcFilePath, dstFilePath);
      continue;
    }
    if (fs.exists(dstFilePath)) {
      // Probe "_1", "_2", ... until an unused name is found.
      int suffix = 0;
      do {
        suffix++;
        dstFilePath = new Path(dst, fileName + "_" + suffix);
      } while (fs.exists(dstFilePath));
    }
    if (!fs.rename(srcFilePath, dstFilePath)) {
      // Report the file that actually failed, not the enclosing directories.
      throw new HiveException("Unable to move: " + srcFilePath + " to: " + dstFilePath);
    }
  }
}
/**
* The first group will contain the task id. The second group is the optional attempt number
* and the third group is the optional extension. The file name looks like: "0_0" or "0_0.gz".
* There may be a leading prefix (tmp_). Since getTaskId() can return an integer only - this
* should match a pure integer as well. {1,6} is used to limit matching for attempts #'s
* 0-999999.
*/
private static final Pattern FILE_NAME_TO_TASK_ID_REGEX =
Pattern.compile("^.*?([0-9]+)(_[0-9]{1,6})?(\\..*)?$");
/**
* This returns the prefix part + taskID for bucket join for partitioned table. The first group
* holds an optional parenthesized prefix followed by the digits of the task id.
*/
private static final Pattern FILE_NAME_PREFIXED_TASK_ID_REGEX =
Pattern.compile("^.*?((\\(.*\\))?[0-9]+)(_[0-9]{1,6})?(\\..*)?$");
/**
* This breaks a prefixed bucket number into the prefix (group 1, optional) and the taskID
* (group 2).
*/
private static final Pattern PREFIXED_TASK_ID_REGEX =
Pattern.compile("^(.*?\\(.*\\))?([0-9]+)$");
/**
 * Get the task id from the filename. It is assumed that the filename is derived from the output
 * of getTaskId.
 *
 * @param filename
 *          filename to extract taskid from
 * @return the task id, or the last path component when the name does not match the expected
 *         pattern
 */
public static String getTaskIdFromFilename(String filename) {
  String taskId = getIdFromFilename(filename, FILE_NAME_TO_TASK_ID_REGEX);
  return taskId;
}
/**
 * Get the part-spec + task id from the filename. It is assumed that the filename is derived
 * from the output of getTaskId.
 *
 * @param filename
 *          filename to extract taskid from
 * @return the prefixed task id, or the last path component when the name does not match the
 *         expected pattern
 */
public static String getPrefixedTaskIdFromFilename(String filename) {
  String prefixedTaskId = getIdFromFilename(filename, FILE_NAME_PREFIXED_TASK_ID_REGEX);
  return prefixedTaskId;
}
/**
 * Extracts an id from the last component of a file name using the given pattern.
 *
 * @param filename
 *          file name, possibly with leading directories
 * @param pattern
 *          pattern whose first group captures the id
 * @return the first group of the match, or the whole last path component (with a warning
 *         logged) when the pattern does not match
 */
private static String getIdFromFilename(String filename, Pattern pattern) {
  int dirEnd = filename.lastIndexOf(Path.SEPARATOR);
  String taskId = (dirEnd == -1) ? filename : filename.substring(dirEnd + 1);
  Matcher m = pattern.matcher(taskId);
  if (m.matches()) {
    taskId = m.group(1);
  } else {
    // A space now separates "component" from the id, which used to be glued together.
    LOG.warn("Unable to get task id from file name: " + filename + ". Using last component "
        + taskId + " as task id.");
  }
  LOG.debug("TaskId for " + filename + " = " + taskId);
  return taskId;
}
/**
 * Returns the part of {@code dirName} after the last path separator, or {@code dirName}
 * itself when it contains no separator.
 */
public static String getFileNameFromDirName(String dirName) {
  // lastIndexOf yields -1 when there is no separator, so the substring then starts at 0.
  return dirName.substring(dirName.lastIndexOf(Path.SEPARATOR) + 1);
}
/**
 * Replace the task id from the filename. It is assumed that the filename is derived from the
 * output of getTaskId.
 *
 * @param filename
 *          filename to replace taskid "0_0" or "0_0.gz" by 33 to "33_0" or "33_0.gz"
 * @param bucketNum
 *          bucket number to put in place of the task id
 * @return the filename with the task id replaced
 */
public static String replaceTaskIdFromFilename(String filename, int bucketNum) {
  String fileId = String.valueOf(bucketNum);
  return replaceTaskIdFromFilename(filename, fileId);
}
/**
 * Replace the task id in the filename by the given file id, which is padded with leading
 * zeros to the length of the task id it replaces.
 *
 * @param filename
 *          filename whose task id is replaced
 * @param fileId
 *          replacement id, optionally prefixed
 * @return the filename with the task id replaced
 */
public static String replaceTaskIdFromFilename(String filename, String fileId) {
  String oldTaskId = getTaskIdFromFilename(filename);
  String paddedTaskId = replaceTaskId(oldTaskId, fileId);
  return replaceTaskIdFromFilename(filename, oldTaskId, paddedTaskId);
}
/**
 * Integer convenience overload: pads the bucket number to the length of {@code taskId}.
 */
private static String replaceTaskId(String taskId, int bucketNum) {
  String strBucketNum = String.valueOf(bucketNum);
  return replaceTaskId(taskId, strBucketNum);
}
/**
 * Returns strBucketNum with enough 0's prefixing the task ID portion of the String to make it
 * equal in length to taskId.
 *
 * @param taskId - the taskId used as a template for length
 * @param strBucketNum - the bucket number of the output, may or may not be prefixed
 * @return the optional prefix followed by the zero-padded bucket number; when strBucketNum
 *         cannot be parsed, the whole string is padded instead
 */
private static String replaceTaskId(String taskId, String strBucketNum) {
  Matcher m = PREFIXED_TASK_ID_REGEX.matcher(strBucketNum);
  if (m.matches()) {
    String prefix = m.group(1);
    String paddedBucketNum = adjustBucketNumLen(m.group(2), taskId);
    return (prefix == null ? "" : prefix) + paddedBucketNum;
  }
  LOG.warn("Unable to determine bucket number from file ID: " + strBucketNum + ". Using " +
      "file ID as bucket number.");
  return adjustBucketNumLen(strBucketNum, taskId);
}
/**
 * Adds 0's to the beginning of bucketNum until bucketNum and taskId are the same length.
 * A bucketNum that is already at least as long as taskId is returned unchanged.
 *
 * @param bucketNum - the bucket number, should not be prefixed
 * @param taskId - the taskId used as a template for length
 * @return the zero-padded bucket number
 */
private static String adjustBucketNumLen(String bucketNum, String taskId) {
  int padding = taskId.length() - bucketNum.length();
  StringBuilder padded = new StringBuilder();
  for (int i = 0; i < padding; i++) {
    padded.append('0');
  }
  return padded.append(bucketNum).toString();
}
/**
 * Replace the oldTaskId appearing in the filename by the newTaskId. The string oldTaskId could
 * appear multiple times, we should only replace the last one.
 *
 * NOTE(review): oldTaskId is handed to String.split and String.replaceAll, so it is
 * interpreted as a regular expression; split also drops trailing empty strings, so a filename
 * that ends with oldTaskId takes the replaceAll branch or loses its last piece. Confirm that
 * callers only pass plain task ids that do not recur in the file name.
 *
 * @param filename
 *          file name containing the old task id
 * @param oldTaskId
 *          task id to replace
 * @param newTaskId
 *          replacement task id
 * @return the file name with the task id replaced
 */
private static String replaceTaskIdFromFilename(String filename, String oldTaskId,
    String newTaskId) {
  String[] pieces = filename.split(oldTaskId);
  if (pieces.length <= 1) {
    return filename.replaceAll(oldTaskId, newTaskId);
  }
  int last = pieces.length - 1;
  StringBuilder rebuilt = new StringBuilder();
  // Re-join every piece but the last with the old id, so earlier occurrences survive.
  for (int i = 0; i < last; i++) {
    if (i > 0) {
      rebuilt.append(oldTaskId);
    }
    rebuilt.append(pieces[i]);
  }
  rebuilt.append(newTaskId).append(pieces[last]);
  return rebuilt.toString();
}
/**
 * Lists the statuses under a path.
 *
 * @param path
 *          path to list
 * @param fs
 *          file system the path lives on
 * @return whatever {@code fs.listStatus(path)} returns, or null when it throws
 *         FileNotFoundException
 * @throws IOException
 *           on any other listing failure
 */
public static FileStatus[] listStatusIfExists(Path path, FileSystem fs) throws IOException {
  FileStatus[] statuses;
  try {
    statuses = fs.listStatus(path);
  } catch (FileNotFoundException e) {
    // FS in hadoop 2.0 throws FNF instead of returning null
    statuses = null;
  }
  return statuses;
}
/**
 * Finalizes the output written under the temporary directory of {@code specPath}. On success
 * the temporary directory, if present, is cleaned of temp/duplicate files, missing buckets
 * are created, and its content is moved to {@code specPath}. On failure the temporary
 * directory is deleted. The task-temporary directory is deleted in both cases.
 *
 * @param specPath
 *          final output path
 * @param hconf
 *          configuration used to obtain the file system
 * @param success
 *          whether the job succeeded
 * @param log
 *          logger for the move message
 * @param dpCtx
 *          dynamic partition context, passed on to the temp/duplicate cleanup
 * @param conf
 *          file sink descriptor used when creating empty buckets
 * @param reporter
 *          reporter passed on when creating empty buckets
 * @throws IOException
 *           on file system failure
 * @throws HiveException
 *           when files cannot be moved or empty buckets cannot be created
 */
public static void mvFileToFinalPath(Path specPath, Configuration hconf,
    boolean success, Log log, DynamicPartitionCtx dpCtx, FileSinkDesc conf,
    Reporter reporter) throws IOException,
    HiveException {
  FileSystem fs = specPath.getFileSystem(hconf);
  Path tmpPath = Utilities.toTempPath(specPath);
  Path taskTmpPath = Utilities.toTaskTempPath(specPath);
  if (!success) {
    fs.delete(tmpPath, true);
  } else if (fs.exists(tmpPath)) {
    // remove any tmp file or double-committed output files
    ArrayList emptyBuckets =
        Utilities.removeTempOrDuplicateFiles(fs, tmpPath, dpCtx);
    // create empty buckets if necessary
    if (emptyBuckets.size() > 0) {
      createEmptyBuckets(hconf, emptyBuckets, conf, reporter);
    }
    // move to the file destination
    log.info("Moving tmp dir: " + tmpPath + " to: " + specPath);
    Utilities.renameOrMoveFiles(fs, tmpPath, specPath);
  }
  fs.delete(taskTmpPath, true);
}
/**
* Create an empty file for each of the given bucket paths, using the output format and
* serializer of the FileSink's table.
*
* @param hconf
* configuration copied into the JobConf handed to the record writers
* @param paths A list of empty buckets to create
* @param conf The definition of the FileSink.
* @param reporter The mapreduce reporter object
* @throws HiveException
* when the serializer or output format cannot be instantiated or initialized
* @throws IOException
*/
private static void createEmptyBuckets(Configuration hconf, ArrayList paths,
FileSinkDesc conf, Reporter reporter)
throws HiveException, IOException {
JobConf jc;
// NOTE(review): both branches build the JobConf the same way, so the instanceof check
// currently has no effect.
if (hconf instanceof JobConf) {
jc = new JobConf(hconf);
} else {
// test code path
jc = new JobConf(hconf);
}
HiveOutputFormat, ?> hiveOutputFormat = null;
Class extends Writable> outputClass = null;
boolean isCompressed = conf.getCompressed();
TableDesc tableInfo = conf.getTableInfo();
try {
// The table's deserializer class is instantiated and used as the serializer.
Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
serializer.initialize(null, tableInfo.getProperties());
outputClass = serializer.getSerializedClass();
hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance();
} catch (SerDeException e) {
throw new HiveException(e);
} catch (InstantiationException e) {
throw new HiveException(e);
} catch (IllegalAccessException e) {
throw new HiveException(e);
}
// Opening a record writer and closing it right away leaves an empty file behind.
for (String p : paths) {
Path path = new Path(p);
RecordWriter writer = HiveFileFormatUtils.getRecordWriter(
jc, hiveOutputFormat, outputClass, isCompressed,
tableInfo.getProperties(), path, reporter);
writer.close(false);
LOG.info("created empty bucket for enforcing bucketing at " + path);
}
}
/**
 * Remove all temporary files and duplicate (double-committed) files from a given directory.
 * Equivalent to the three-argument overload without a dynamic partition context.
 *
 * @param fs
 *          file system the directory lives on
 * @param path
 *          directory to clean
 * @throws IOException
 *           when a file cannot be listed or deleted
 */
public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws IOException {
  final DynamicPartitionCtx noDynamicPartitions = null;
  removeTempOrDuplicateFiles(fs, path, noDynamicPartitions);
}
/**
 * Remove all temporary files and duplicate (double-committed) files from a given directory.
 * With a dynamic partition context, every partition directory found below {@code path} is
 * cleaned; empty partition directories are deleted, and for bucketed output the buckets that
 * are missing from a partition are reported.
 *
 * @param fs
 *          file system the directory lives on
 * @param path
 *          directory to clean; null yields a null result
 * @param dpCtx
 *          dynamic partition context, or null to clean {@code path} itself
 * @return a list of path names corresponding to should-be-created empty buckets.
 * @throws IOException
 *           when a file cannot be listed or deleted
 */
public static ArrayList removeTempOrDuplicateFiles(FileSystem fs, Path path,
    DynamicPartitionCtx dpCtx) throws IOException {
  if (path == null) {
    return null;
  }
  ArrayList result = new ArrayList();
  if (dpCtx != null) {
    FileStatus parts[] = HiveStatsUtils.getFileStatusRecurse(path, dpCtx.getNumDPCols(), fs);
    HashMap taskIDToFile = null;
    for (int i = 0; i < parts.length; ++i) {
      assert parts[i].isDir() : "dynamic partition " + parts[i].getPath()
          + " is not a direcgtory";
      FileStatus[] items = fs.listStatus(parts[i].getPath());
      // remove empty directory since DP insert should not generate empty partitions.
      // empty directories could be generated by crashed Task/ScriptOperator
      if (items.length == 0) {
        if (!fs.delete(parts[i].getPath(), true)) {
          LOG.error("Cannot delete empty directory " + parts[i].getPath());
          throw new IOException("Cannot delete empty directory " + parts[i].getPath());
        }
        // The partition is gone: there is no file to derive bucket names from.
        continue;
      }
      taskIDToFile = removeTempOrDuplicateFiles(items, fs);
      // if the table is bucketed and enforce bucketing, we should check and generate all buckets.
      // A directory that held only temporary files leaves the map empty; without a surviving
      // file there is no template name, and iterator().next() would throw.
      if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null && !taskIDToFile.isEmpty()) {
        // get the missing buckets and generate empty buckets
        String taskID1 = taskIDToFile.keySet().iterator().next();
        Path bucketPath = taskIDToFile.values().iterator().next().getPath();
        for (int j = 0; j < dpCtx.getNumBuckets(); ++j) {
          String taskID2 = replaceTaskId(taskID1, j);
          if (!taskIDToFile.containsKey(taskID2)) {
            // create empty bucket, file name should be derived from taskID2
            String path2 = replaceTaskIdFromFilename(bucketPath.toUri().getPath().toString(), j);
            result.add(path2);
          }
        }
      }
    }
  } else {
    FileStatus[] items = fs.listStatus(path);
    removeTempOrDuplicateFiles(items, fs);
  }
  return result;
}
/**
 * Deletes temporary files and, among files that share a prefixed task id, all but the
 * largest one.
 *
 * @param items
 *          files to examine
 * @param fs
 *          file system used for deletion
 * @return map from prefixed task id to the file kept for it, or null when either argument is
 *         null
 * @throws IOException
 *           when a temporary or duplicate file cannot be deleted
 */
public static HashMap removeTempOrDuplicateFiles(FileStatus[] items,
    FileSystem fs) throws IOException {
  if (items == null || fs == null) {
    return null;
  }
  HashMap taskIdToFile = new HashMap();
  for (FileStatus one : items) {
    if (isTempPath(one)) {
      if (!fs.delete(one.getPath(), true)) {
        throw new IOException("Unable to delete tmp file: " + one.getPath());
      }
      continue;
    }
    String taskId = getPrefixedTaskIdFromFilename(one.getPath().getName());
    FileStatus otherFile = taskIdToFile.get(taskId);
    if (otherFile == null) {
      taskIdToFile.put(taskId, one);
      continue;
    }
    // Compare the file sizes of all the attempt files for the same task, the largest win
    // any attempt files could contain partial results (due to task failures or
    // speculative runs), but the largest should be the correct one since the result
    // of a successful run should never be smaller than a failed/speculative run.
    FileStatus toDelete;
    FileStatus toKeep;
    if (otherFile.getLen() >= one.getLen()) {
      toDelete = one;
      toKeep = otherFile;
    } else {
      toDelete = otherFile;
      toKeep = one;
      taskIdToFile.put(taskId, one);
    }
    // Lengths are captured before the delete so the log reports the pre-delete sizes.
    long deletedLen = toDelete.getLen();
    long keptLen = toKeep.getLen();
    if (!fs.delete(toDelete.getPath(), true)) {
      throw new IOException("Unable to delete duplicate file: " + toDelete.getPath()
          + ". Existing file: " + toKeep.getPath());
    }
    LOG.warn("Duplicate taskid file removed: " + toDelete.getPath() + " with length "
        + deletedLen + ". Existing file: " + toKeep.getPath() + " with length "
        + keptLen);
  }
  return taskIdToFile;
}
/**
 * Formats an exception as {@code className(message)}.
 *
 * @param e
 *          exception to describe
 * @return the exception's class name followed by its message in parentheses
 */
public static String getNameMessage(Exception e) {
  StringBuilder text = new StringBuilder(e.getClass().getName());
  text.append('(').append(e.getMessage()).append(')');
  return text.toString();
}
/**
 * Returns the session's resources of the given type as a comma-separated list of qualified
 * file names.
 *
 * @param conf
 *          configuration used to resolve each file
 * @param t
 *          resource type to list
 * @return the comma-joined file names, or the empty string when there is no session or the
 *         session has no resource list for this type
 * @throws RuntimeException
 *           when a file cannot be resolved because of an IOException
 */
public static String getResourceFiles(Configuration conf, SessionState.ResourceType t) {
  // fill in local files to be added to the task environment
  SessionState ss = SessionState.get();
  Set files = (ss == null) ? null : ss.list_resource(t, null);
  if (files == null) {
    return "";
  }
  List realFiles = new ArrayList(files.size());
  for (String one : files) {
    try {
      realFiles.add(realFile(one, conf));
    } catch (IOException e) {
      // A space now separates the file name from "due to".
      throw new RuntimeException("Cannot validate file " + one + " due to exception: "
          + e.getMessage(), e);
    }
  }
  return StringUtils.join(realFiles, ",");
}
/**
 * Add new elements to the classpath. The result is a new URLClassLoader that has
 * {@code cloader} as its parent and carries the parent's URLs plus the new paths that were
 * not already among them.
 *
 * @param cloader
 *          current class loader; it is cast to URLClassLoader
 * @param newPaths
 *          Array of classpath elements
 * @return the new class loader
 */
public static ClassLoader addToClassPath(ClassLoader cloader, String[] newPaths) throws Exception {
  URLClassLoader loader = (URLClassLoader) cloader;
  // start from a mutable copy of the current classpath components
  List curPath = new ArrayList(Arrays.asList(loader.getURLs()));
  for (String onestr : newPaths) {
    String localPath = onestr;
    // special processing for hadoop-17. file:// needs to be removed
    if (StringUtils.indexOf(onestr, "file://") == 0) {
      localPath = StringUtils.substring(onestr, 7);
    }
    URL oneurl = new File(localPath).toURL();
    if (!curPath.contains(oneurl)) {
      curPath.add(oneurl);
    }
  }
  return new URLClassLoader(curPath.toArray(new URL[0]), loader);
}
/**
 * Remove elements from the classpath. A new URLClassLoader holding the remaining URLs of the
 * current thread's context class loader is installed as the thread's context class loader and
 * as the class loader of the session configuration.
 *
 * @param pathsToRemove
 *          Array of classpath elements
 */
public static void removeFromClassPath(String[] pathsToRemove) throws Exception {
  Thread curThread = Thread.currentThread();
  URLClassLoader oldLoader = (URLClassLoader) curThread.getContextClassLoader();
  Set remaining = new HashSet(Arrays.asList(oldLoader.getURLs()));
  for (String onestr : pathsToRemove) {
    String localPath = onestr;
    // special processing for hadoop-17. file:// needs to be removed
    if (StringUtils.indexOf(onestr, "file://") == 0) {
      localPath = StringUtils.substring(onestr, 7);
    }
    remaining.remove(new File(localPath).toURL());
  }
  URLClassLoader newLoader = new URLClassLoader(remaining.toArray(new URL[0]));
  curThread.setContextClassLoader(newLoader);
  SessionState.get().getConf().setClassLoader(newLoader);
}
/**
 * Renders a range of bytes as text: each byte becomes "x" followed by its unsigned decimal
 * value (0-255).
 *
 * @param array
 *          source bytes
 * @param start
 *          index of the first byte to render
 * @param length
 *          number of bytes to render
 * @return the rendered string, empty when {@code length} is not positive
 */
public static String formatBinaryString(byte[] array, int start, int length) {
  StringBuilder text = new StringBuilder();
  for (int offset = 0; offset < length; offset++) {
    // Masking with 0xFF maps negative byte values onto 128-255.
    int unsigned = array[start + offset] & 0xFF;
    text.append('x').append(unsigned);
  }
  return text.toString();
}
/**
 * Collects the column of every sort order, in order.
 *
 * @param sortCols
 *          sort orders
 * @return a new list holding the value of {@code getCol()} for each entry
 */
public static List getColumnNamesFromSortCols(List sortCols) {
  List columnNames = new ArrayList(sortCols.size());
  for (Order sortOrder : sortCols) {
    columnNames.add(sortOrder.getCol());
  }
  return columnNames;
}
/**
 * Collects the name of every field schema, in order.
 *
 * @param partCols
 *          field schemas
 * @return a new list holding the value of {@code getName()} for each entry
 */
public static List getColumnNamesFromFieldSchema(List partCols) {
  List columnNames = new ArrayList(partCols.size());
  for (FieldSchema field : partCols) {
    columnNames.add(field.getName());
  }
  return columnNames;
}
/**
 * Collects the internal name of every column info, in order.
 *
 * @param colInfos
 *          column infos
 * @return a new list holding the value of {@code getInternalName()} for each entry
 */
public static List getInternalColumnNamesFromSignature(List colInfos) {
  List internalNames = new ArrayList(colInfos.size());
  for (ColumnInfo info : colInfos) {
    internalNames.add(info.getInternalName());
  }
  return internalNames;
}
/**
 * Reads the comma-separated column list property and returns its non-blank entries.
 *
 * @param props
 *          table properties
 * @return the column names in declaration order; empty when the property is not set
 */
public static List getColumnNames(Properties props) {
  List names = new ArrayList();
  String colNames = props.getProperty(serdeConstants.LIST_COLUMNS);
  if (colNames == null) {
    // A missing property means no columns rather than a NullPointerException.
    return names;
  }
  // split() never returns null or null elements, so only blank entries need filtering.
  for (String col : colNames.trim().split(",")) {
    if (!col.trim().equals("")) {
      names.add(col);
    }
  }
  return names;
}
/**
 * Reads the comma-separated column type property and returns its non-blank entries.
 *
 * NOTE(review): a plain comma split also splits inside a type string that itself contains a
 * comma - confirm that callers never store such types in this property.
 *
 * @param props
 *          table properties
 * @return the column types in declaration order; empty when the property is not set
 */
public static List getColumnTypes(Properties props) {
  List names = new ArrayList();
  String colNames = props.getProperty(serdeConstants.LIST_COLUMN_TYPES);
  if (colNames == null) {
    // A missing property means no column types rather than a NullPointerException.
    return names;
  }
  // split() never returns null or null elements, so only blank entries need filtering.
  for (String col : colNames.trim().split(",")) {
    if (!col.trim().equals("")) {
      names.add(col);
    }
  }
  return names;
}
/**
 * Extract db and table name from dbtable string, where db and table are separated by ".".
 * If there is no db name part, the current session's database is used.
 *
 * @param dbtable
 *          table name, optionally qualified with a database name
 * @return String array with two elements, first is db name, second is table name; both
 *         elements are null when {@code dbtable} is null
 * @throws HiveException
 *           when splitting on "." yields neither one nor two parts
 */
public static String[] getDbTableName(String dbtable) throws HiveException{
  if (dbtable == null) {
    return new String[2];
  }
  String[] names = dbtable.split("\\.");
  if (names.length == 2) {
    return names;
  }
  if (names.length == 1) {
    return new String [] {SessionState.get().getCurrentDatabase(), dbtable};
  }
  throw new HiveException(ErrorMsg.INVALID_TABLE_NAME, dbtable);
}
/**
 * Verifies that every name in {@code checkCols} matches, ignoring case, some name in
 * {@code colNames}.
 *
 * @param colNames
 *          known column names
 * @param checkCols
 *          column names to validate
 * @throws SemanticException
 *           for the first name in {@code checkCols} that has no match
 */
public static void validateColumnNames(List colNames, List checkCols)
    throws SemanticException {
  Iterator requested = checkCols.iterator();
  while (requested.hasNext()) {
    String toCheck = requested.next();
    boolean missing = true;
    // Scan the known names only until a match turns up.
    for (Iterator known = colNames.iterator(); missing && known.hasNext();) {
      String colName = known.next();
      missing = !toCheck.equalsIgnoreCase(colName);
    }
    if (missing) {
      throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg());
    }
  }
}
/**
 * Gets the default notification interval to send progress updates to the tracker. Useful for
 * operators that may not output data for a while.
 *
 * @param hconf
 *          job configuration
 * @return the interval in milliseconds: half of {@code mapred.tasktracker.expiry.interval}
 *         when that property is set, 5 minutes otherwise
 * @throws NumberFormatException
 *           when the property is set but is not a decodable integer
 */
public static int getDefaultNotificationInterval(Configuration hconf) {
  String expiryInterval = hconf.get("mapred.tasktracker.expiry.interval");
  if (expiryInterval == null) {
    // Integer.decode(null) throws, so the unset case has to be handled before decoding.
    // 5 minutes
    return 5 * 60 * 1000;
  }
  return Integer.decode(expiryInterval).intValue() / 2;
}
/**
 * Copies the storage handler properties configured for a table descriptor to a runtime job
 * configuration. The bucket count table property, when present, is copied as well.
 *
 * @param tbl
 *          table descriptor from which to read
 *
 * @param job
 *          configuration which receives configured properties
 */
public static void copyTableJobPropertiesToConf(TableDesc tbl, JobConf job) {
  // copy the bucket count
  String bucketCount = tbl.getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
  if (bucketCount != null) {
    job.set(hive_metastoreConstants.BUCKET_COUNT, bucketCount);
  }
  Map<String, String> jobProperties = tbl.getJobProperties();
  if (jobProperties != null) {
    for (String key : jobProperties.keySet()) {
      job.set(key, jobProperties.get(key));
    }
  }
}
private static final Object INPUT_SUMMARY_LOCK = new Object();
/**
* Calculate the total size of input files.
*
* @param ctx
* the hadoop job context
* @param work
* map reduce job plan
* @param filter
* filter to apply to the input paths before calculating size
* @return the summary of all the input paths.
* @throws IOException
*/
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter)
throws IOException {
PerfLogger perfLogger = PerfLogger.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
long[] summary = {0, 0, 0};
final List pathNeedProcess = new ArrayList();
// Since multiple threads could call this method concurrently, locking
// this method will avoid number of threads out of control.
synchronized (INPUT_SUMMARY_LOCK) {
// For each input path, calculate the total size.
for (String path : work.getPathToAliases().keySet()) {
Path p = new Path(path);
if (filter != null && !filter.accept(p)) {
continue;
}
ContentSummary cs = ctx.getCS(path);
if (cs == null) {
if (path == null) {
continue;
}
pathNeedProcess.add(path);
} else {
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
}
}
// Process the case when name node call is needed
final Map resultMap = new ConcurrentHashMap();
ArrayList> results = new ArrayList>();
final ThreadPoolExecutor executor;
int maxThreads = ctx.getConf().getInt("mapred.dfsclient.parallelism.max", 0);
if (pathNeedProcess.size() > 1 && maxThreads > 1) {
int numExecutors = Math.min(pathNeedProcess.size(), maxThreads);
LOG.info("Using " + numExecutors + " threads for getContentSummary");
executor = new ThreadPoolExecutor(numExecutors, numExecutors, 60, TimeUnit.SECONDS,
new LinkedBlockingQueue());
} else {
executor = null;
}
HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
@Override
public void interrupt() {
for (String path : pathNeedProcess) {
try {
new Path(path).getFileSystem(ctx.getConf()).close();
} catch (IOException ignore) {
LOG.debug(ignore);
}
}
if (executor != null) {
executor.shutdownNow();
}
}
});
try {
Configuration conf = ctx.getConf();
JobConf jobConf = new JobConf(conf);
for (String path : pathNeedProcess) {
final Path p = new Path(path);
final String pathStr = path;
// All threads share the same Configuration and JobConf based on the
// assumption that they are thread safe if only read operations are
// executed. It is not stated in Hadoop's javadoc, the sourcce codes
// clearly showed that they made efforts for it and we believe it is
// thread safe. Will revisit this piece of codes if we find the assumption
// is not correct.
final Configuration myConf = conf;
final JobConf myJobConf = jobConf;
final Map> aliasToWork = work.getAliasToWork();
final Map> pathToAlias = work.getPathToAliases();
final PartitionDesc partDesc = work.getPathToPartitionInfo().get(
p.toString());
Runnable r = new Runnable() {
@Override
public void run() {
try {
Class extends InputFormat> inputFormatCls = partDesc
.getInputFileFormatClass();
InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
inputFormatCls, myJobConf);
if (inputFormatObj instanceof ContentSummaryInputFormat) {
ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
return;
}
HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf,
partDesc.getOverlayedProperties().getProperty(
hive_metastoreConstants.META_TABLE_STORAGE));
if (handler instanceof InputEstimator) {
long total = 0;
TableDesc tableDesc = partDesc.getTableDesc();
InputEstimator estimator = (InputEstimator) handler;
for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
JobConf jobConf = new JobConf(myJobConf);
TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
Utilities.setColumnNameList(jobConf, scanOp, true);
Utilities.setColumnTypeList(jobConf, scanOp, true);
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
total += estimator.estimate(myJobConf, scanOp, -1).getTotalLength();
}
resultMap.put(pathStr, new ContentSummary(total, -1, -1));
}
// todo: should nullify summary for non-native tables,
// not to be selected as a mapjoin target
FileSystem fs = p.getFileSystem(myConf);
resultMap.put(pathStr, fs.getContentSummary(p));
} catch (Exception e) {
// We safely ignore this exception for summary data.
// We don't update the cache to protect it from polluting other
// usages. The worst case is that IOException will always be
// retried for another getInputSummary(), which is fine as
// IOException is not considered as a common case.
LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
}
}
};
if (executor == null) {
r.run();
} else {
Future> result = executor.submit(r);
results.add(result);
}
}
if (executor != null) {
for (Future> result : results) {
boolean executorDone = false;
do {
try {
result.get();
executorDone = true;
} catch (InterruptedException e) {
LOG.info("Interrupted when waiting threads: ", e);
Thread.currentThread().interrupt();
break;
} catch (ExecutionException e) {
throw new IOException(e);
}
} while (!executorDone);
}
executor.shutdown();
}
HiveInterruptUtils.checkInterrupted();
for (Map.Entry entry : resultMap.entrySet()) {
ContentSummary cs = entry.getValue();
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
ctx.addCS(entry.getKey(), cs);
LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength()
+ " file count: "
+ cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
return new ContentSummary(summary[0], summary[1], summary[2]);
} finally {
HiveInterruptUtils.remove(interrup);
}
}
}
/**
 * Sums the sizes of the given aliases, excluding none of them.
 *
 * @param aliasToSize known size per alias
 * @param aliases aliases to add up
 * @return the result of sumOfExcept with no exclusions: the total, or -1 if the size of
 *         any alias is unknown
 */
public static long sumOf(Map aliasToSize, Set aliases) {
  return sumOfExcept(aliasToSize, aliases, null);
}
/**
 * Returns the sum of the sizes of all aliases that are not excluded.
 *
 * @param aliasToSize known size per alias
 * @param aliases aliases to add up
 * @param excepts aliases to leave out of the sum; null excludes nothing
 * @return the total, or -1 as soon as a non-excluded alias has no known size
 */
public static long sumOfExcept(Map aliasToSize,
    Set aliases, Set excepts) {
  long sum = 0;
  for (Object alias : aliases) {
    boolean excluded = excepts != null && excepts.contains(alias);
    if (!excluded) {
      Long known = (Long) aliasToSize.get(alias);
      if (known == null) {
        return -1;
      }
      sum += known.longValue();
    }
  }
  return sum;
}
/**
 * Tells whether the given path holds no data, preferring the content summary cached in the
 * context over a file system lookup.
 *
 * With a cached summary the path counts as empty when its length and file count are both 0
 * and it has at most one directory. Without one, the check is delegated to
 * isEmptyPath(job, dirPath).
 *
 * @param job configuration used for the file system lookup
 * @param dirPath path to check
 * @param ctx context holding the cached content summaries
 * @return true if the path is considered empty
 * @throws Exception if the file system lookup fails
 */
public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx)
    throws Exception {
  ContentSummary cs = ctx.getCS(dirPath);
  if (cs == null) {
    LOG.info("Content Summary not cached for " + dirPath);
    return isEmptyPath(job, dirPath);
  }
  // A space now separates the path from "length:"; the two used to run together in the log.
  LOG.info("Content Summary " + dirPath + " length: " + cs.getLength() + " num files: "
      + cs.getFileCount() + " num directories: " + cs.getDirectoryCount());
  return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
}
/**
 * Tells whether the given path is missing or has no entries, by asking the file system.
 *
 * @param job configuration used to resolve the file system of the path
 * @param dirPath path to check
 * @return true if the path does not exist or listing it yields no entries
 * @throws Exception if the file system cannot be reached or listed
 */
public static boolean isEmptyPath(JobConf job, Path dirPath) throws Exception {
  FileSystem fs = dirPath.getFileSystem(job);
  if (!fs.exists(dirPath)) {
    return true;
  }
  FileStatus[] entries = fs.listStatus(dirPath);
  return entries.length == 0;
}
/**
 * Collects the TezTask instances found among the given tasks and, recursively, among their
 * dependent tasks. A task that is reached more than once is listed only once.
 *
 * @param tasks root tasks to search; may be null
 * @return the Tez tasks found; an empty list when tasks is null
 */
public static List getTezTasks(List> tasks) {
List tezTasks = new ArrayList();
if (tasks != null) {
getTezTasks(tasks, tezTasks);
}
return tezTasks;
}
/**
 * Adds each TezTask among the given tasks to tezTasks unless the list already contains it,
 * then repeats the search on the dependent tasks of every task.
 *
 * @param tasks tasks to search
 * @param tezTasks accumulator that receives the Tez tasks found
 */
private static void getTezTasks(List> tasks, List tezTasks) {
for (Task extends Serializable> task : tasks) {
if (task instanceof TezTask && !tezTasks.contains(task)) {
tezTasks.add((TezTask) task);
}
// Dependent tasks are searched whether or not this task is itself a TezTask.
if (task.getDependentTasks() != null) {
getTezTasks(task.getDependentTasks(), tezTasks);
}
}
}
/**
 * Collects the ExecDriver instances found among the given tasks and, recursively, among
 * their dependent tasks. A task that is reached more than once is listed only once.
 *
 * @param tasks root tasks to search; may be null
 * @return the ExecDriver tasks found; an empty list when tasks is null
 */
public static List getMRTasks(List> tasks) {
List mrTasks = new ArrayList();
if (tasks != null) {
getMRTasks(tasks, mrTasks);
}
return mrTasks;
}
/**
 * Adds each ExecDriver among the given tasks to mrTasks unless the list already contains
 * it, then repeats the search on the dependent tasks of every task.
 *
 * @param tasks tasks to search
 * @param mrTasks accumulator that receives the ExecDriver tasks found
 */
private static void getMRTasks(List> tasks, List mrTasks) {
for (Task extends Serializable> task : tasks) {
if (task instanceof ExecDriver && !mrTasks.contains(task)) {
mrTasks.add((ExecDriver) task);
}
// Dependent tasks are searched whether or not this task is itself an ExecDriver.
if (task.getDependentTasks() != null) {
getMRTasks(task.getDependentTasks(), mrTasks);
}
}
}
/**
 * Construct a list of full partition spec from Dynamic Partition Context and the directory names
 * corresponding to these dynamic partitions.
 *
 * @param conf configuration used to resolve the file system of the root path
 * @param dpCtx dynamic partition context supplying the root path, the number of dynamic
 *          partition columns and the partial partition spec
 * @return one full partition spec per directory found under the root path, or null when no
 *         directory was found
 * @throws HiveException wrapping any IOException raised while accessing the file system
 */
public static List> getFullDPSpecs(Configuration conf,
DynamicPartitionCtx dpCtx) throws HiveException {
try {
Path loadPath = dpCtx.getRootPath();
FileSystem fs = loadPath.getFileSystem(conf);
int numDPCols = dpCtx.getNumDPCols();
// NOTE(review): presumably lists the entries numDPCols levels below loadPath - confirm
// against HiveStatsUtils.getFileStatusRecurse.
FileStatus[] status = HiveStatsUtils.getFileStatusRecurse(loadPath, numDPCols, fs);
if (status.length == 0) {
LOG.warn("No partition is generated by dynamic partitioning");
// Callers must handle null; an empty list is not returned in this case.
return null;
}
// partial partition specification
Map partSpec = dpCtx.getPartSpec();
// list of full partition specification
List> fullPartSpecs = new ArrayList>();
// for each dynamically created DP directory, construct a full partition spec
// and load the partition based on that
for (int i = 0; i < status.length; ++i) {
// get the dynamically created directory
Path partPath = status[i].getPath();
// Only evaluated when assertions are enabled (-ea).
assert fs.getFileStatus(partPath).isDir() : "partitions " + partPath
+ " is not a directory !";
// generate a full partition specification
// Start from a copy of the partial spec so each directory gets its own map.
LinkedHashMap fullPartSpec = new LinkedHashMap(partSpec);
Warehouse.makeSpecFromName(fullPartSpec, partPath);
fullPartSpecs.add(fullPartSpec);
}
return fullPartSpecs;
} catch (IOException e) {
throw new HiveException(e);
}
}
/**
 * Returns the stats publisher of the stats factory created for the given job configuration.
 *
 * @param jc job configuration handed to StatsFactory.newFactory
 * @return the publisher of the factory, or null when no factory could be created
 */
public static StatsPublisher getStatsPublisher(JobConf jc) {
  StatsFactory statsFactory = StatsFactory.newFactory(jc);
  if (statsFactory == null) {
    return null;
  }
  return statsFactory.getStatsPublisher();
}
/**
 * Shortens a stats key prefix that exceeds the allowed length.
 *
 * If maxPrefixLength is not negative and statsPrefix is longer than maxPrefixLength, the
 * result is the MD5 digest of statsPrefix followed by the path separator. Otherwise the
 * result is statsPrefix itself, with a path separator appended unless it already ends
 * with one.
 *
 * @param statsPrefix prefix of stats key
 * @param maxPrefixLength max length of stats key; a negative value disables hashing
 * @return the hashed prefix if statsPrefix is longer than the maximum, else statsPrefix,
 *         in both cases terminated by the path separator
 */
public static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLength) {
  // todo: this might return possibly longer prefix than
  // maxPrefixLength (if set) when maxPrefixLength - postfixLength < 17,
  // which would make stat values invalid (especially for 'counter' type)
  boolean tooLong = maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength;
  if (!tooLong) {
    if (statsPrefix.endsWith(Path.SEPARATOR)) {
      return statsPrefix;
    }
    return statsPrefix + Path.SEPARATOR;
  }
  try {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    // NOTE(review): both conversions below use the platform default charset, so the key
    // can differ between JVMs with different defaults, and digest bytes that are not
    // valid in that charset are replaced when decoded - confirm whether this matters
    // before changing the key format.
    md5.update(statsPrefix.getBytes());
    byte[] digest = md5.digest();
    return new String(digest) + Path.SEPARATOR; // 17 byte
  } catch (NoSuchAlgorithmException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Concatenates the given elements, making sure each one is followed by the path separator.
 * Null and empty elements are skipped.
 *
 * @param elements the pieces to join
 * @return the joined string; empty when there is nothing to join
 */
public static String join(String... elements) {
  StringBuilder joined = new StringBuilder();
  for (int i = 0; i < elements.length; i++) {
    String part = elements[i];
    if (part != null && !part.isEmpty()) {
      joined.append(part);
      if (!part.endsWith(Path.SEPARATOR)) {
        joined.append(Path.SEPARATOR);
      }
    }
  }
  return joined.toString();
}
/**
 * Stores the column names of the operator's schema in the job configuration, virtual
 * columns included.
 *
 * @param jobConf configuration that receives the column name list
 * @param op operator whose schema supplies the columns
 */
public static void setColumnNameList(JobConf jobConf, Operator op) {
  setColumnNameList(jobConf, op, false);
}
/**
 * Stores the comma-separated internal column names of the operator's schema in the job
 * configuration under serdeConstants.LIST_COLUMNS. Nothing is set when the operator has
 * no schema.
 *
 * @param jobConf configuration that receives the column name list
 * @param op operator whose schema supplies the columns
 * @param excludeVCs true to leave virtual columns out of the list
 */
public static void setColumnNameList(JobConf jobConf, Operator op, boolean excludeVCs) {
  RowSchema schema = op.getSchema();
  if (schema == null) {
    return;
  }
  StringBuilder names = new StringBuilder();
  for (ColumnInfo column : schema.getSignature()) {
    if (!excludeVCs || !column.getIsVirtualCol()) {
      if (names.length() > 0) {
        names.append(",");
      }
      names.append(column.getInternalName());
    }
  }
  jobConf.set(serdeConstants.LIST_COLUMNS, names.toString());
}
/**
 * Stores the column types of the operator's schema in the job configuration, virtual
 * columns included.
 *
 * @param jobConf configuration that receives the column type list
 * @param op operator whose schema supplies the columns
 */
public static void setColumnTypeList(JobConf jobConf, Operator op) {
  setColumnTypeList(jobConf, op, false);
}
/**
 * Stores the comma-separated column type names of the operator's schema in the job
 * configuration under serdeConstants.LIST_COLUMN_TYPES. Nothing is set when the operator
 * has no schema.
 *
 * @param jobConf configuration that receives the column type list
 * @param op operator whose schema supplies the columns
 * @param excludeVCs true to leave virtual columns out of the list
 */
public static void setColumnTypeList(JobConf jobConf, Operator op, boolean excludeVCs) {
  RowSchema schema = op.getSchema();
  if (schema == null) {
    return;
  }
  StringBuilder types = new StringBuilder();
  for (ColumnInfo column : schema.getSignature()) {
    if (!excludeVCs || !column.getIsVirtualCol()) {
      if (types.length() > 0) {
        types.append(",");
      }
      types.append(column.getTypeName());
    }
  }
  jobConf.set(serdeConstants.LIST_COLUMN_TYPES, types.toString());
}
// File name extension appended to the names built by generatePath and generateFileName.
// NOTE(review): public and non-final, so it can be reassigned from anywhere - confirm
// nothing relies on that before making it a constant.
public static String suffix = ".hashtable";
/**
 * Builds the path "MapJoin-" + dumpFilePrefix + tag + "-" + bigBucketFileName + suffix
 * under basePath.
 *
 * @param basePath parent directory of the file
 * @param dumpFilePrefix prefix placed directly before the tag
 * @param tag tag placed directly after the prefix
 * @param bigBucketFileName file name component placed after the "-" separator
 * @return the resulting path
 */
public static Path generatePath(Path basePath, String dumpFilePrefix,
    Byte tag, String bigBucketFileName) {
  String fileName = "MapJoin-" + dumpFilePrefix + tag + "-" + bigBucketFileName + suffix;
  return new Path(basePath, fileName);
}
/**
 * Builds the file name "MapJoin-" + tag + "-" + bigBucketFileName + suffix.
 *
 * @param tag tag placed after the "MapJoin-" prefix
 * @param bigBucketFileName file name component placed after the "-" separator
 * @return the resulting file name
 */
public static String generateFileName(Byte tag, String bigBucketFileName) {
  return "MapJoin-" + tag + "-" + bigBucketFileName + suffix;
}
/**
 * Builds the path "HashTable-" + id under basePath.
 *
 * @param basePath parent directory
 * @param id identifier appended to the "HashTable-" prefix
 * @return the resulting path
 */
public static Path generateTmpPath(Path basePath, String id) {
  String child = "HashTable-" + id;
  return new Path(basePath, child);
}
/**
 * Builds the path filename + ".tar.gz" under basePath.
 *
 * @param basePath parent directory
 * @param filename file name without the archive extension
 * @return the resulting path
 */
public static Path generateTarPath(Path basePath, String filename) {
  String archiveName = filename + ".tar.gz";
  return new Path(basePath, archiveName);
}
/**
 * Appends the ".tar.gz" extension to the given name.
 *
 * @param name file name without the archive extension
 * @return name followed by ".tar.gz"
 */
public static String generateTarFileName(String name) {
  StringBuilder archiveName = new StringBuilder();
  archiveName.append(name).append(".tar.gz");
  return archiveName.toString();
}
/**
 * Joins the string form of baseURI and filename with the path separator.
 *
 * @param baseURI base location
 * @param filename name appended after the separator
 * @return the combined location as a string
 */
public static String generatePath(Path baseURI, String filename) {
  return baseURI + Path.SEPARATOR + filename;
}
/**
 * Returns the current local date and time formatted as "yyyy-MM-dd HH:mm:ss".
 *
 * @return the formatted timestamp, with the hour on the 24-hour clock
 */
public static String now() {
  // HH is the 24-hour field. The former "hh" is the 12-hour field and, without an am/pm
  // marker, made afternoon times indistinguishable from morning ones.
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  return sdf.format(Calendar.getInstance().getTime());
}
/**
 * Converts a duration from milliseconds to seconds.
 *
 * @param time duration in milliseconds
 * @return the duration divided by 1000, as a double
 */
public static double showTime(long time) {
  return time / 1000.0;
}
/**
 * Gives every input format that implements ReworkMapredInputFormat a chance to rework the
 * plan of a MapRedTask.
 *
 * The check here is not very clean. It first loops over the input formats of all
 * partitions and collects the ones that are ReworkMapredInputFormat implementations in a
 * set. It then goes through that set and calls rework once per input format class.
 *
 * Technically all of this could be avoided if all of Hive's input formats shared a common
 * interface. In today's Hive and Hadoop that is not possible, because many of Hive's input
 * formats live in Hadoop's code and most of Hadoop's input formats just extend the
 * InputFormat interface.
 *
 * @param task the task whose work may be reworked; anything but a MapRedTask is ignored
 * @param reworkMapredWork false to skip the rework altogether
 * @param conf configuration passed to each rework call
 * @throws SemanticException wrapping any IOException raised during the rework
 */
public static void reworkMapRedWork(Task extends Serializable> task,
boolean reworkMapredWork, HiveConf conf) throws SemanticException {
if (reworkMapredWork && (task instanceof MapRedTask)) {
try {
MapredWork mapredWork = ((MapRedTask) task).getWork();
// A set, so each input format class is instantiated and invoked only once.
Set> reworkInputFormats = new HashSet>();
for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
Class extends InputFormat> inputFormatCls = part
.getInputFileFormatClass();
if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
reworkInputFormats.add(inputFormatCls);
}
}
if (reworkInputFormats.size() > 0) {
for (Class extends InputFormat> inputFormatCls : reworkInputFormats) {
// The instance is created with a null configuration.
ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtils
.newInstance(inputFormatCls, null);
inst.rework(conf, mapredWork);
}
}
} catch (IOException e) {
throw new SemanticException(e);
}
}
}
/**
 * A unit of work executed against a prepared statement, as accepted by executeWithRetry.
 * This base implementation does nothing and returns null.
 */
public static class SQLCommand {
// Runs the command on the given statement; this default returns null.
public T run(PreparedStatement stmt) throws SQLException {
return null;
}
}
/**
 * Retry SQL execution with random backoff (same as the one implemented in HDFS-767).
 * This function only retries when the SQL query throws a SQLTransientException (which
 * might be able to succeed with a simple retry). It doesn't retry when the exception
 * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
 * the caller needs to reconnect to the database and restart the whole transaction.
 *
 * If the thread is interrupted while waiting for the next attempt, the interrupt status is
 * restored and the last SQLTransientException is thrown without further retries.
 *
 * @param cmd the SQL command
 * @param stmt the prepared statement of SQL.
 * @param baseWindow The base time window (in milliseconds) before the next retry.
 * see {@link #getRandomWaitTime} for details.
 * @param maxRetries the maximum # of retries when getting a SQLTransientException.
 * @return the value returned by the command
 * @throws SQLException throws SQLRecoverableException or SQLNonTransientException the
 * first time it is caught, or SQLTransientException when the maxRetries has reached or
 * the wait between retries is interrupted.
 */
public static T executeWithRetry(SQLCommand cmd, PreparedStatement stmt,
    int baseWindow, int maxRetries) throws SQLException {
  Random r = new Random();
  // retry with # of maxRetries before throwing exception
  for (int failures = 0; ; failures++) {
    try {
      return cmd.run(stmt);
    } catch (SQLTransientException e) {
      // Other SQLExceptions (SQLNonTransientException / SQLRecoverableException) are not
      // caught here and propagate to the caller unchanged.
      LOG.warn("Failure and retry #" + failures + " with exception " + e.getMessage());
      if (failures >= maxRetries) {
        throw e;
      }
      long waitTime = getRandomWaitTime(baseWindow, failures, r);
      try {
        Thread.sleep(waitTime);
      } catch (InterruptedException iex) {
        // Keep the interrupt visible to the caller and stop retrying; swallowing it
        // would make the remaining retries run back to back on a thread asked to stop.
        Thread.currentThread().interrupt();
        throw e;
      }
    }
  }
}
/**
 * Retry connecting to a database with random backoff (same as the one implemented in HDFS-767).
 * This function only retries when the connection attempt throws a SQLTransientException
 * (which might be able to succeed with a simple retry). It doesn't retry when the exception
 * is a SQLRecoverableException or SQLNonTransientException.
 *
 * If the thread is interrupted while waiting for the next attempt, the interrupt status is
 * restored and the last SQLTransientException is thrown without further retries.
 *
 * @param connectionString the JDBC connection string.
 * @param waitWindow The base time window (in milliseconds) before the next retry.
 * see {@link #getRandomWaitTime} for details.
 * @param maxRetries the maximum # of retries when getting a SQLTransientException.
 * @return the established connection
 * @throws SQLException throws SQLRecoverableException or SQLNonTransientException the
 * first time it is caught, or SQLTransientException when the maxRetries has reached or
 * the wait between retries is interrupted.
 */
public static Connection connectWithRetry(String connectionString,
    int waitWindow, int maxRetries) throws SQLException {
  Random r = new Random();
  // retry with # of maxRetries before throwing exception
  for (int failures = 0; ; failures++) {
    try {
      return DriverManager.getConnection(connectionString);
    } catch (SQLTransientException e) {
      // Other SQLExceptions (SQLNonTransientException / SQLRecoverableException) are not
      // caught here and propagate to the caller unchanged.
      if (failures >= maxRetries) {
        LOG.error("Error during JDBC connection. " + e);
        throw e;
      }
      long waitTime = Utilities.getRandomWaitTime(waitWindow, failures, r);
      try {
        Thread.sleep(waitTime);
      } catch (InterruptedException interrupted) {
        // Keep the interrupt visible to the caller and stop retrying; swallowing it
        // would make the remaining retries run back to back on a thread asked to stop.
        Thread.currentThread().interrupt();
        throw e;
      }
    }
  }
}
/**
 * Retry preparing a SQL statement with random backoff (same as the one implemented in HDFS-767).
 * This function only retries when preparing the statement throws a SQLTransientException
 * (which might be able to succeed with a simple retry). It doesn't retry when the exception
 * is a SQLRecoverableException or SQLNonTransientException. For SQLRecoverableException
 * the caller needs to reconnect to the database and restart the whole transaction.
 *
 * If the thread is interrupted while waiting for the next attempt, the interrupt status is
 * restored and the last SQLTransientException is thrown without further retries.
 *
 * @param conn a JDBC connection.
 * @param stmt the SQL statement to be prepared.
 * @param waitWindow The base time window (in milliseconds) before the next retry.
 * see {@link #getRandomWaitTime} for details.
 * @param maxRetries the maximum # of retries when getting a SQLTransientException.
 * @return the prepared statement
 * @throws SQLException throws SQLRecoverableException or SQLNonTransientException the
 * first time it is caught, or SQLTransientException when the maxRetries has reached or
 * the wait between retries is interrupted.
 */
public static PreparedStatement prepareWithRetry(Connection conn, String stmt,
    int waitWindow, int maxRetries) throws SQLException {
  Random r = new Random();
  // retry with # of maxRetries before throwing exception
  for (int failures = 0; ; failures++) {
    try {
      return conn.prepareStatement(stmt);
    } catch (SQLTransientException e) {
      // Other SQLExceptions (SQLNonTransientException / SQLRecoverableException) are not
      // caught here and propagate to the caller unchanged.
      if (failures >= maxRetries) {
        LOG.error("Error preparing JDBC Statement " + stmt + " :" + e);
        throw e;
      }
      long waitTime = Utilities.getRandomWaitTime(waitWindow, failures, r);
      try {
        Thread.sleep(waitTime);
      } catch (InterruptedException interrupted) {
        // Keep the interrupt visible to the caller and stop retrying; swallowing it
        // would make the remaining retries run back to back on a thread asked to stop.
        Thread.currentThread().interrupt();
        throw e;
      }
    }
  }
}
/**
 * Introducing a random factor to the wait time before another retry.
 * The wait time is dependent on # of failures and a random factor.
 * At the first time of getting an exception, the wait time
 * is a random number between 0..baseWindow msec. If the first retry
 * still fails, we will wait baseWindow msec grace period before the 2nd retry.
 * Also at the second retry, the waiting window is expanded to 2*baseWindow msec
 * alleviating the request rate from the server. Similarly the 3rd retry
 * will wait 2*baseWindow msec. grace period before retry and the waiting window is
 * expanded to 3*baseWindow msec and so on.
 * @param baseWindow the base waiting window.
 * @param failures number of failures so far.
 * @param r a random generator.
 * @return number of milliseconds for the next wait time.
 */
public static long getRandomWaitTime(int baseWindow, int failures, Random r) {
  // Widen to long before multiplying: as int products these overflow for large windows
  // or failure counts and could yield a far too short (or negative) wait.
  long gracePeriod = (long) baseWindow * failures; // grace period for the last round of attempt
  long window = (long) baseWindow * (failures + 1L); // expanding time window for each failure
  return (long) (gracePeriod + window * r.nextDouble());
}
public static final char sqlEscapeChar = '\\';
/**
 * Prefixes every '_', '%' and escape character in the key with the escape character, so
 * that the key is matched literally by the SQL LIKE operator.
 * @param key the string that will be used for the SQL LIKE operator.
 * @return the key with '_', '%' and the escape character escaped.
 */
public static String escapeSqlLike(String key) {
  StringBuilder escaped = new StringBuilder(key.length());
  for (int i = 0; i < key.length(); i++) {
    char ch = key.charAt(i);
    if (ch == '_' || ch == '%' || ch == sqlEscapeChar) {
      escaped.append(sqlEscapeChar);
    }
    escaped.append(ch);
  }
  return escaped.toString();
}
/**
 * Format number of milliseconds to strings
 *
 * A unit is printed only when the duration reaches it: seconds from one second on,
 * minutes from one minute on, and hours together with days from one hour on. The
 * millisecond remainder is always printed.
 *
 * @param msec milliseconds
 * @return a formatted string like "x days y hours z minutes a seconds b msec"
 */
public static String formatMsecToStr(long msec) {
  long totalSeconds = msec / 1000;
  long totalMinutes = totalSeconds / 60;
  long totalHours = totalMinutes / 60;
  StringBuilder text = new StringBuilder();
  if (totalHours > 0) {
    // days and hours always appear together, even when the day count is 0
    text.append(totalHours / 24).append(" days ");
    text.append(totalHours % 24).append(" hours ");
  }
  if (totalMinutes > 0) {
    text.append(totalMinutes % 60).append(" minutes ");
  }
  if (totalSeconds > 0) {
    text.append(totalSeconds % 60).append(" seconds ");
  }
  text.append(msec % 1000).append(" msec");
  return text.toString();
}
/**
 * Estimate the number of reducers needed for this job, based on job input,
 * and configuration parameters.
 *
 * The output of this method should only be used if the output of this
 * MapRedTask is not being used to populate a bucketed table and the user
 * has not specified the number of reducers to use.
 *
 * @param conf configuration supplying the bytes per reducer, the maximum number of reducers
 *          and the power-of-two bucketing flag
 * @param inputSummary summary of the job input
 * @param work map work of the job
 * @param finalMapRed whether this job writes the final data
 * @return the number of reducers.
 */
public static int estimateNumberOfReducers(HiveConf conf, ContentSummary inputSummary,
    MapWork work, boolean finalMapRed) throws IOException {
  long bytesPerReducer = conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
  int maxReducers = conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
  double samplePercentage = getHighestSamplePercentage(work);
  long totalInputFileSize = getTotalInputFileSize(inputSummary, work, samplePercentage);
  // if all inputs are sampled, we should shrink the size of reducers accordingly.
  // The size is reported as "estimated" when sampling made it differ from the raw length.
  boolean estimated = totalInputFileSize != inputSummary.getLength();
  String sizeLabel = estimated ? " estimated totalInputFileSize=" : " totalInputFileSize=";
  LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
      + maxReducers + sizeLabel + totalInputFileSize);
  // If this map reduce job writes final data to a table and bucketing is being inferred,
  // and the user has configured Hive to do this, make sure the number of reducers is a
  // power of two
  boolean powersOfTwo = false;
  if (conf.getBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT_NUM_BUCKETS_POWER_TWO)
      && finalMapRed) {
    powersOfTwo = !work.getBucketedColsByDirectory().isEmpty();
  }
  return estimateReducers(totalInputFileSize, bytesPerReducer, maxReducers, powersOfTwo);
}
/**
 * Computes the number of reducers for the given input size: the input size divided by
 * bytesPerReducer, rounded up, and limited to the range 1..maxReducers.
 *
 * With powersOfTwo set, a result that is not already a power of two is raised to the next
 * power of two, or lowered to the previous one when the next would exceed maxReducers.
 * A maxReducers below 1 is returned unchanged.
 *
 * @param totalInputFileSize total input size in bytes
 * @param bytesPerReducer amount of input each reducer should handle
 * @param maxReducers upper limit for the result
 * @param powersOfTwo whether the result must be a power of two
 * @return the number of reducers
 */
public static int estimateReducers(long totalInputFileSize, long bytesPerReducer,
    int maxReducers, boolean powersOfTwo) {
  // Clamp while still in long arithmetic: narrowing an oversized quotient to int first
  // can wrap to an arbitrary value and silently collapse the estimate.
  long wanted = (totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer;
  int reducers = (int) Math.min((long) maxReducers, Math.max(1L, wanted));
  if (!powersOfTwo || reducers < 1) {
    return reducers;
  }
  int floorPowerTwo = Integer.highestOneBit(reducers);
  if (floorPowerTwo == reducers) {
    // If the original number of reducers was a power of two, use that
    return reducers;
  }
  // long, because the next power of two above 2^30 does not fit in an int
  long nextPowerTwo = (long) floorPowerTwo << 1;
  if (nextPowerTwo > maxReducers) {
    // If the next power of two greater than the original number of reducers is greater
    // than the max number of reducers, use the preceding power of two, which is strictly
    // less than the original number of reducers and hence the max
    return floorPowerTwo;
  }
  // Otherwise use the smallest power of two greater than the original number of reducers
  return (int) nextPowerTwo;
}
/**
 * Computes the total input file size. If block sampling was used it will scale this
 * value by the highest sample percentage (as an estimate for input).
 *
 * @param inputSummary summary supplying the unscaled input length
 * @param work map work that tells whether split sampling is in use
 * @param highestSamplePercentage highest sample percentage; ignored when negative
 * @return estimated total input size for job
 */
public static long getTotalInputFileSize (ContentSummary inputSummary, MapWork work,
    double highestSamplePercentage) {
  long fullSize = inputSummary.getLength();
  boolean sampled = work.getNameToSplitSample() != null
      && !work.getNameToSplitSample().isEmpty();
  // If percentage block sampling wasn't used, we don't need to do any estimation
  if (sampled && highestSamplePercentage >= 0) {
    long scaledSize = (long) (fullSize * highestSamplePercentage / 100D);
    // never report more than the actual input size
    return Math.min(scaledSize, fullSize);
  }
  return fullSize;
}
/**
 * Computes the total number of input files. If block sampling was used it will scale this
 * value by the highest sample percentage (as an estimate for # input files).
 *
 * @param inputSummary summary supplying the unscaled file count
 * @param work map work that tells whether split sampling is in use
 * @param highestSamplePercentage highest sample percentage; ignored when negative
 * @return estimated total number of input files for job
 */
public static long getTotalInputNumFiles (ContentSummary inputSummary, MapWork work,
    double highestSamplePercentage) {
  long fullCount = inputSummary.getFileCount();
  boolean sampled = work.getNameToSplitSample() != null
      && !work.getNameToSplitSample().isEmpty();
  // If percentage block sampling wasn't used, we don't need to do any estimation
  if (sampled && highestSamplePercentage >= 0) {
    long scaledCount = (long) (fullCount * highestSamplePercentage / 100D);
    // never report more than the actual number of files
    return Math.min(scaledCount, fullCount);
  }
  return fullCount;
}
/**
 * Returns the highest sample percentage of any alias in the given MapWork.
 *
 * @param work map work supplying the aliases and their split samples
 * @return the highest percentage among the aliases, 0 when there are no aliases, or -1 as
 *         soon as an alias has no split sample (including when the work has no split
 *         sample map at all)
 */
public static double getHighestSamplePercentage (MapWork work) {
  double highestSamplePercentage = 0;
  for (String alias : work.getAliasToWork().keySet()) {
    // A missing sample map is treated like a missing entry, matching the null guard in
    // getTotalInputFileSize / getTotalInputNumFiles instead of failing with an NPE.
    if (work.getNameToSplitSample() == null
        || !work.getNameToSplitSample().containsKey(alias)) {
      return -1;
    }
    Double rate = work.getNameToSplitSample().get(alias).getPercent();
    if (rate != null && rate > highestSamplePercentage) {
      highestSamplePercentage = rate;
    }
  }
  return highestSamplePercentage;
}
/**
 * Computes the input paths for Tez. On Tez we're not creating dummy files when
 * getting/setting input paths; we let Tez handle the situation. We're also setting the
 * paths in the AM, so we don't want to depend on scratch dir and context: both are passed
 * to getInputPaths as null.
 *
 * @param job JobConf used to run the job
 * @param work MapWork encapsulating the info about the task
 * @return the result of getInputPaths for the given work
 * @throws Exception if getInputPaths fails
 */
public static List getInputPathsTez(JobConf job, MapWork work) throws Exception {
  return getInputPaths(job, work, null, null);
}
/**
 * Computes a list of all input paths needed to compute the given MapWork. All aliases
 * are considered and a merged list of input paths is returned. If any input path points
 * to an empty table or partition a dummy file in the scratch dir is instead created and
 * added to the list. This is needed to avoid special casing the operator pipeline for
 * these cases. No dummy files are created when the execution engine is "tez".
 *
 * @param job JobConf used to run the job
 * @param work MapWork encapsulating the info about the task
 * @param hiveScratchDir The tmp dir used to create dummy files if needed
 * @param ctx Context object
 * @return List of paths to process for the given MapWork
 * @throws Exception
 */
public static List getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx)
throws Exception {
// Numbers the dummy directories created below so each one gets its own location.
int sequenceNumber = 0;
Set pathsProcessed = new HashSet();
List pathsToAdd = new LinkedList();
// AliasToWork contains all the aliases
for (String alias : work.getAliasToWork().keySet()) {
LOG.info("Processing alias " + alias);
// The alias may not have any path
Path path = null;
// Iterate over a copy of the keys: createDummyFileForEmptyPartition replaces entries
// of the path-to-aliases map while this loop is running.
for (String file : new LinkedList(work.getPathToAliases().keySet())) {
List aliases = work.getPathToAliases().get(file);
if (aliases.contains(alias)) {
// Assigned before the duplicate check, so an alias whose paths were all handled
// already still counts as having a path further down.
path = new Path(file);
// Multiple aliases can point to the same path - it should be
// processed only once
if (pathsProcessed.contains(path)) {
continue;
}
pathsProcessed.add(path);
LOG.info("Adding input file " + path);
if (!HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
&& isEmptyPath(job, path, ctx)) {
path = createDummyFileForEmptyPartition(path, job, work,
hiveScratchDir, alias, sequenceNumber++);
}
pathsToAdd.add(path);
}
}
// If the query references non-existent partitions
// We need to add a empty file, it is not acceptable to change the
// operator tree
// Consider the query:
// select * from (select count(1) from T union all select count(1) from
// T2) x;
// If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2
// rows)
if (path == null
&& !HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
path = createDummyFileForEmptyTable(job, work, hiveScratchDir,
alias, sequenceNumber++);
// NOTE(review): the path returned here is added without a null check - confirm
// whether createDummyFileForEmptyTable can return null and how callers cope.
pathsToAdd.add(path);
}
}
return pathsToAdd;
}
/**
 * Creates the directory hiveScratchDir/sequenceNumber and writes a file named "emptyFile"
 * into it using a record writer of the given output format.
 *
 * @param hiveScratchDir parent directory of the new directory
 * @param outFileFormat output format class used to obtain the record writer
 * @param job configuration used for the file system and the record writer
 * @param sequenceNumber name of the new directory
 * @param props table properties handed to the record writer
 * @param dummyRow true to write one row containing the text "empty", false to write no rows
 * @return the qualified path of the new directory (not of the file inside it)
 */
@SuppressWarnings({"rawtypes", "unchecked"})
private static Path createEmptyFile(Path hiveScratchDir,
Class extends HiveOutputFormat> outFileFormat, JobConf job,
int sequenceNumber, Properties props, boolean dummyRow)
throws IOException, InstantiationException, IllegalAccessException {
// create a dummy empty file in a new directory
String newDir = hiveScratchDir + File.separator + sequenceNumber;
Path newPath = new Path(newDir);
FileSystem fs = newPath.getFileSystem(job);
fs.mkdirs(newPath);
// Qualify the path against the file system. The user-configured path might contain the
// default port, which is skipped in the file status. This makes sure that all paths that
// go into PathToPartitionInfo are always in the form reported by the file status.
newPath = fs.makeQualified(newPath);
// The file path is built from the unqualified directory string.
String newFile = newDir + File.separator + "emptyFile";
Path newFilePath = new Path(newFile);
RecordWriter recWriter = outFileFormat.newInstance().getHiveRecordWriter(job, newFilePath,
Text.class, false, props, null);
if (dummyRow) {
// empty files are omitted at CombineHiveInputFormat.
// for meta-data only query, it effectively makes partition columns disappear..
// this could be fixed by other methods, but this seemed to be the easiest (HIVE-2955)
recWriter.write(new Text("empty")); // written via HiveIgnoreKeyTextOutputFormat
}
recWriter.close(false);
return newPath;
}
/**
 * Replaces the input path of an empty partition with a newly created dummy directory and
 * re-keys the path-to-aliases and path-to-partition-info maps of the work to the new path.
 * Paths of non-native tables are returned unchanged and the work is not modified.
 *
 * @param path the empty input path to replace
 * @param job configuration used to create the dummy file
 * @param work map work whose path maps are updated
 * @param hiveScratchDir parent directory of the dummy directory
 * @param alias not used by this method
 * @param sequenceNumber name of the dummy directory under hiveScratchDir
 * @return the path of the dummy directory, or path itself for a non-native table
 */
@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyPartition(Path path, JobConf job, MapWork work,
Path hiveScratchDir, String alias, int sequenceNumber)
throws IOException, InstantiationException, IllegalAccessException {
String strPath = path.toString();
// The input file does not exist, replace it by a empty file
PartitionDesc partDesc = work.getPathToPartitionInfo().get(strPath);
boolean nonNative = partDesc.getTableDesc().isNonNative();
// A dummy row is written only when the partition is read through OneNullRowInputFormat.
boolean oneRow = partDesc.getInputFileFormatClass() == OneNullRowInputFormat.class;
Properties props = partDesc.getProperties();
Class extends HiveOutputFormat> outFileFormat = partDesc.getOutputFileFormatClass();
if (nonNative) {
// if this isn't a hive table we can't create an empty file for it.
return path;
}
Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job,
sequenceNumber, props, oneRow);
LOG.info("Changed input file to " + newPath);
// update the work
String strNewPath = newPath.toString();
// Move both map entries from the old path key to the new one.
LinkedHashMap> pathToAliases = work.getPathToAliases();
pathToAliases.put(strNewPath, pathToAliases.get(strPath));
pathToAliases.remove(strPath);
work.setPathToAliases(pathToAliases);
LinkedHashMap pathToPartitionInfo = work.getPathToPartitionInfo();
pathToPartitionInfo.put(strNewPath, pathToPartitionInfo.get(strPath));
pathToPartitionInfo.remove(strPath);
work.setPathToPartitionInfo(pathToPartitionInfo);
return newPath;
}
@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyTable(JobConf job, MapWork work,
Path hiveScratchDir, String alias, int sequenceNumber)
throws IOException, InstantiationException, IllegalAccessException {
TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
Properties props = tableDesc.getProperties();
boolean nonNative = tableDesc.isNonNative();
Class extends HiveOutputFormat> outFileFormat = tableDesc.getOutputFileFormatClass();
if (nonNative) {
// if this isn't a hive table we can't create an empty file for it.
return null;
}
Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job,
sequenceNumber, props, false);
LOG.info("Changed input file to " + newPath.toString());
// update the work
LinkedHashMap> pathToAliases = work.getPathToAliases();
ArrayList newList = new ArrayList();
newList.add(alias);
pathToAliases.put(newPath.toUri().toString(), newList);
work.setPathToAliases(pathToAliases);
LinkedHashMap pathToPartitionInfo = work.getPathToPartitionInfo();
PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
pathToPartitionInfo.put(newPath.toUri().toString(), pDesc);
work.setPathToPartitionInfo(pathToPartitionInfo);
return newPath;
}
/**
 * Appends all the paths in the provided list to the input paths already configured on the
 * job conf object. Paths that are already set on the job are kept and come first.
 *
 * @param job job configuration whose input paths are extended
 * @param pathsToAdd paths to append, in iteration order
 */
public static void setInputPaths(JobConf job, List<Path> pathsToAdd) {
  Path[] addedPaths = FileInputFormat.getInputPaths(job);
  if (addedPaths == null) {
    addedPaths = new Path[0];
  }
  Path[] combined = new Path[addedPaths.length + pathsToAdd.size()];
  if (combined.length == 0) {
    // Nothing configured and nothing to add: FileInputFormat.setInputPaths reads the
    // first element of the array it is given, so it must not be called with none.
    return;
  }
  System.arraycopy(addedPaths, 0, combined, 0, addedPaths.length);
  int next = addedPaths.length;
  for (Path p : pathsToAdd) {
    combined[next++] = p;
  }
  FileInputFormat.setInputPaths(job, combined);
}
/**
 * Copies the input related settings of the map work into the configuration: the input
 * format (stored under the Tez specific variable when the execution engine is "tez"),
 * the index intermediate file, and the sorted-input flag.
 *
 * @param conf configuration to update
 * @param mWork map work the settings are read from
 */
public static void setInputAttributes(Configuration conf, MapWork mWork) {
  final HiveConf.ConfVars inputFormatVar;
  if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
    inputFormatVar = HiveConf.ConfVars.HIVETEZINPUTFORMAT;
  } else {
    inputFormatVar = HiveConf.ConfVars.HIVEINPUTFORMAT;
  }

  String inputFormat = mWork.getInputformat();
  if (inputFormat != null) {
    HiveConf.setVar(conf, inputFormatVar, inputFormat);
  }

  String indexFile = mWork.getIndexIntermediateFile();
  if (indexFile != null) {
    // the same file is published under both index settings
    conf.set("hive.index.compact.file", indexFile);
    conf.set("hive.index.blockfilter.file", indexFile);
  }

  // Intentionally overwrites anything the user may have put here
  conf.setBoolean("hive.input.format.sorted", mWork.isInputFormatSorted());
}
/**
* Hive uses tmp directories to capture the output of each FileSinkOperator.
* This method creates all necessary tmp directories for FileSinks in the Mapwork.
*
* @param conf Used to get the right FileSystem
* @param mWork Used to find FileSinkOperators
* @throws IOException
*/
public static void createTmpDirs(Configuration conf, MapWork mWork)
throws IOException {
Map> pa = mWork.getPathToAliases();
if (pa != null) {
List> ops =
new ArrayList>();
for (List ls : pa.values()) {
for (String a : ls) {
ops.add(mWork.getAliasToWork().get(a));
}
}
createTmpDirs(conf, ops);
}
}
/**
* Hive uses tmp directories to capture the output of each FileSinkOperator.
* This method creates all necessary tmp directories for FileSinks in the ReduceWork.
*
* @param conf Used to get the right FileSystem
* @param rWork Used to find FileSinkOperators
* @throws IOException
*/
@SuppressWarnings("unchecked")
public static void createTmpDirs(Configuration conf, ReduceWork rWork)
throws IOException {
if (rWork == null) {
return;
}
List> ops
= new LinkedList>();
ops.add(rWork.getReducer());
createTmpDirs(conf, ops);
}
private static void createTmpDirs(Configuration conf,
List> ops) throws IOException {
FsPermission fsPermission = new FsPermission((short)00777);
while (!ops.isEmpty()) {
Operator extends OperatorDesc> op = ops.remove(0);
if (op instanceof FileSinkOperator) {
FileSinkDesc fdesc = ((FileSinkOperator) op).getConf();
Path tempDir = fdesc.getDirName();
if (tempDir != null) {
Path tempPath = Utilities.toTempPath(tempDir);
createDirsWithPermission(conf, tempPath, fsPermission);
}
}
if (op.getChildOperators() != null) {
ops.addAll(op.getChildOperators());
}
}
}
/**
 * Returns true if a plan is both configured for vectorized execution
 * and vectorization is allowed. The plan may be configured for vectorization
 * but vectorization disallowed, e.g. for FetchOperator execution.
 *
 * @param conf configuration holding the vectorization flag and the plan path
 * @return true only when vectorization is enabled, a plan path is set, and the map work
 *         of the plan reports vector mode
 */
public static boolean isVectorMode(Configuration conf) {
  if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
    return false;
  }
  if (Utilities.getPlanPath(conf) == null) {
    return false;
  }
  // the plan is only loaded once both cheaper checks have passed
  return Utilities.getMapRedWork(conf).getMapWork().getVectorMode();
}
/**
 * Removes every entry from {@code gWorkMap}.
 */
public static void clearWorkMap() {
  gWorkMap.clear();
}
/**
 * Create a temp dir in specified baseDir
 * This can go away once hive moves to support only JDK 7
 * and can use Files.createTempDirectory
 * Guava Files.createTempDir() does not take a base dir
 *
 * The directory name is "tmp_" followed by a random number below 100000. Only the last
 * path element is created, so {@code baseDir} itself must already exist.
 *
 * @param baseDir - directory under which new temp dir will be created
 * @return File object for new temp dir
 * @throws IllegalStateException if no directory could be created after 30 attempts
 */
public static File createTempDir(String baseDir) {
  // try creating the temp dir maxAttempts times
  final int maxAttempts = 30;
  for (int attempt = 0; attempt < maxAttempts; attempt++) {
    // pick a random file name
    String tempDirName = "tmp_" + ((int) (100000 * Math.random()));
    // return if dir could successfully be created with that file name
    File tempDir = new File(baseDir, tempDirName);
    if (tempDir.mkdir()) {
      return tempDir;
    }
  }
  throw new IllegalStateException("Failed to create a temp dir under "
      + baseDir + " Giving up after " + maxAttempts + " attempts");
}
/**
 * Advances the record reader past the header lines of a table file.
 *
 * @param currRecReader
 *          Record reader to advance.
 *
 * @param headerCount
 *          Number of header lines to skip; nothing is read when it is zero or negative.
 *
 * @param key
 *          Key object the reader fills for each skipped record.
 *
 * @param value
 *          Value object the reader fills for each skipped record.
 *
 * @return true if all header lines were skipped (zero or more records may remain),
 *         false if the reader ran out of records before that.
 */
public static boolean skipHeader(RecordReader currRecReader,
    int headerCount, WritableComparable key, Writable value) throws IOException {
  for (int remaining = headerCount; remaining > 0; remaining--) {
    boolean hasRecord = currRecReader.next(key, value);
    if (!hasRecord) {
      return false;
    }
  }
  return true;
}
/**
 * Reads the header line count from the properties of a table.
 *
 * @param table
 *          Table description for target table.
 *
 * @return the value of the header count property, or 0 when the property is not set
 * @throws IOException if the property value is not an integer
 */
public static int getHeaderCount(TableDesc table) throws IOException {
  String configured = table.getProperties().getProperty(serdeConstants.HEADER_COUNT, "0");
  try {
    return Integer.parseInt(configured);
  } catch (NumberFormatException nfe) {
    // Header line number must be set as an integer.
    throw new IOException(nfe);
  }
}
/**
 * Reads the footer line count from the properties of a table and checks it against the
 * configured maximum.
 *
 * @param table
 *          Table description for target table.
 *
 * @param job
 *          Job configuration for current job.
 *
 * @return the value of the footer count property, or 0 when the property is not set
 * @throws IOException if the value is not an integer or exceeds hive.file.max.footer
 */
public static int getFooterCount(TableDesc table, JobConf job) throws IOException {
  String configured = table.getProperties().getProperty(serdeConstants.FOOTER_COUNT, "0");
  try {
    int footerCount = Integer.parseInt(configured);
    // the limit is read inside the try so a malformed limit is reported the same way
    int maxFooter = HiveConf.getIntVar(job, HiveConf.ConfVars.HIVE_FILE_MAX_FOOTER);
    if (footerCount > maxFooter) {
      throw new IOException("footer number exceeds the limit defined in hive.file.max.footer");
    }
    return footerCount;
  } catch (NumberFormatException nfe) {
    // Footer line number must be set as an integer.
    throw new IOException(nfe);
  }
}
/**
 * Creates a directory, deciding from the current session whether the parent directories
 * must receive the permission as well.
 *
 * Without a session the directory is created non-recursively with {@code fsPermission}.
 * With a session the permission is always replaced by 0777, and the recursive mode is used
 * when the session is a hive server query and doAs is enabled in {@code conf}.
 *
 * @param conf the configuration used to derive the filesystem to create the path
 * @param mkdir the path to be created
 * @param fsPermission permission for the directory; replaced by 0777 when a session exists
 * @return true if successfully created the directory else false
 * @throws IOException if hdfs experiences any error conditions
 */
public static boolean createDirsWithPermission(Configuration conf, Path mkdir,
    FsPermission fsPermission) throws IOException {
  SessionState session = SessionState.get();
  if (session == null) {
    return createDirsWithPermission(conf, mkdir, fsPermission, false);
  }
  boolean recursive = session.isHiveServerQuery()
      && conf.getBoolean(HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS.varname,
          HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS.defaultBoolVal);
  // The scratch directory currently is /tmp/hive-hive. With doAs enabled, the first user
  // to create it would own it and subsequent users could not access it, hence the wide
  // open permission. The right fix is to have scratch dir per user.
  FsPermission openPermission = new FsPermission((short)00777);
  // if we made it so far without exception we are good!
  return createDirsWithPermission(conf, mkdir, openPermission, recursive);
}
/**
 * Optionally restores the "fs.permissions.umask-mode" setting and then closes the
 * file system.
 *
 * @param conf configuration whose umask setting is restored
 * @param unsetUmask when false the configuration is left untouched
 * @param origUmask value to restore; when null the setting is removed instead
 * @param fs file system to close
 * @throws IOException if closing the file system fails
 */
private static void resetConfAndCloseFS (Configuration conf, boolean unsetUmask,
    String origUmask, FileSystem fs) throws IOException {
  final String umaskKey = "fs.permissions.umask-mode";
  if (unsetUmask && origUmask != null) {
    conf.set(umaskKey, origUmask);
  } else if (unsetUmask) {
    conf.unset(umaskKey);
  }
  fs.close();
}
/**
 * Creates a directory with the given permission on a non-cached file system instance.
 *
 * In recursive mode the "fs.permissions.umask-mode" setting of {@code conf} is temporarily
 * set to "000" so that parent directories created along the way get the full permission.
 * The previous setting is restored and the file system is closed before this method
 * returns, whether it completes normally or with an exception.
 *
 * @param conf the configuration used to derive the filesystem to create the path
 * @param mkdirPath the path to be created
 * @param fsPermission permission for the created directory
 * @param recursive whether the umask is cleared while the directory is created
 * @return the result of {@code FileSystem.mkdirs}
 * @throws IOException if the file system cannot be obtained or the directory cannot be
 *         created; a failure to close the file system afterwards is only logged
 */
public static boolean createDirsWithPermission(Configuration conf, Path mkdirPath,
    FsPermission fsPermission, boolean recursive) throws IOException {
  String origUmask = null;
  LOG.debug("Create dirs " + mkdirPath + " with permission " + fsPermission + " recursive " +
      recursive);
  if (recursive) {
    origUmask = conf.get("fs.permissions.umask-mode");
    // this umask is required because by default the hdfs mask is 022 resulting in
    // all parents getting the fsPermission & !(022) permission instead of fsPermission
    conf.set("fs.permissions.umask-mode", "000");
  }
  FileSystem fs = null;
  try {
    fs = ShimLoader.getHadoopShims().getNonCachedFileSystem(mkdirPath.toUri(), conf);
    // a failure here propagates to the caller instead of being reported as "false"
    return fs.mkdirs(mkdirPath, fsPermission);
  } finally {
    if (fs == null) {
      // the file system could not be obtained: only the umask needs to be put back
      if (recursive) {
        if (origUmask != null) {
          conf.set("fs.permissions.umask-mode", origUmask);
        } else {
          conf.unset("fs.permissions.umask-mode");
        }
      }
    } else {
      try {
        resetConfAndCloseFS(conf, recursive, origUmask, fs);
      } catch (IOException closeFailure) {
        // best effort: the outcome of mkdirs is what matters to the caller
        LOG.warn("Failed to close file system for " + mkdirPath, closeFailure);
      }
    }
  }
}
}