Downloading a project consumes considerable server resources. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
You can buy this project and download or modify it as often as you want.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.DataOperationType;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.TransactionalValidationListener;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.shims.HadoopShims;
import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hive.common.util.Ref;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
/**
* Utilities that are shared by all of the ACID input and output formats. They
* are used by the compactor and cleaner and thus must be format agnostic.
*/
public class AcidUtils {
// Key that is put into the conf when an acid operation is being planned.
public static final String CONF_ACID_KEY = "hive.doing.acid";
// Name prefix of base directories.
public static final String BASE_PREFIX = "base_";
/** Accepts every path whose last component starts with {@link #BASE_PREFIX}. */
public static final PathFilter baseFileFilter = new PathFilter() {
  @Override
  public boolean accept(Path path) {
    final String name = path.getName();
    return name.startsWith(BASE_PREFIX);
  }
};
// Name prefix of (insert) delta directories.
public static final String DELTA_PREFIX = "delta_";
// Name prefix of delete delta directories.
public static final String DELETE_DELTA_PREFIX = "delete_delta_";
// Name suffix of the side files that accompany bucket files.
public static final String DELTA_SIDE_FILE_SUFFIX = "_flush_length";
/** Accepts every path whose last component starts with {@link #DELTA_PREFIX}. */
public static final PathFilter deltaFileFilter = new PathFilter() {
  @Override
  public boolean accept(Path path) {
    final String name = path.getName();
    return name.startsWith(DELTA_PREFIX);
  }
};
/** Accepts every path whose last component starts with {@link #DELETE_DELTA_PREFIX}. */
public static final PathFilter deleteEventDeltaDirFilter = new PathFilter() {
  @Override
  public boolean accept(Path path) {
    final String name = path.getName();
    return name.startsWith(DELETE_DELTA_PREFIX);
  }
};
// Name prefix of bucket files.
public static final String BUCKET_PREFIX = "bucket_";
/**
 * Accepts bucket files: the name must start with {@link #BUCKET_PREFIX} and must not
 * end with {@link #DELTA_SIDE_FILE_SUFFIX}.
 */
public static final PathFilter bucketFileFilter = new PathFilter() {
  @Override
  public boolean accept(Path path) {
    final String name = path.getName();
    if (!name.startsWith(BUCKET_PREFIX)) {
      return false;
    }
    // Side files share the bucket prefix but are not bucket data.
    return !name.endsWith(DELTA_SIDE_FILE_SUFFIX);
  }
};
// Zero-padded format of the bucket number in "bucket_" file names (see createBucketFile).
public static final String BUCKET_DIGITS = "%05d";
// Zero-padded format of the bucket number in old-style file names (see createFilename).
public static final String LEGACY_FILE_BUCKET_DIGITS = "%06d";
// Zero-padded format of transaction ids in base and delta directory names.
public static final String DELTA_DIGITS = "%07d";
/**
* 10K statements per tx. Probably overkill ... since that many delta files
* would not be good for performance
*/
public static final String STATEMENT_DIGITS = "%04d";
/**
* This must be in sync with {@link #STATEMENT_DIGITS}
*/
public static final int MAX_STATEMENTS_PER_TXN = 10000;
// Five decimal digits anchored at the end of the input.
public static final Pattern BUCKET_DIGIT_PATTERN = Pattern.compile("[0-9]{5}$");
// Six decimal digits anchored at the start of the input.
public static final Pattern LEGACY_BUCKET_DIGIT_PATTERN = Pattern.compile("^[0-9]{6}");
/** Accepts paths whose last component fully matches the original-file name pattern. */
public static final PathFilter originalBucketFilter = new PathFilter() {
  @Override
  public boolean accept(Path path) {
    final String name = path.getName();
    return ORIGINAL_PATTERN.matcher(name).matches();
  }
};
// Private constructor: this class only exposes static members.
private AcidUtils() {
// NOT USED
}
// Logger for this class.
private static final Logger LOG = LoggerFactory.getLogger(AcidUtils.class);
// Name pattern "<digits>_<digits>" of original files (eg. 000000_0).
private static final Pattern ORIGINAL_PATTERN =
Pattern.compile("[0-9]+_[0-9]+");
/** Rejects paths whose last component starts with "_" or "."; accepts everything else. */
public static final PathFilter hiddenFileFilter = new PathFilter() {
  @Override
  public boolean accept(Path p) {
    final String fileName = p.getName();
    final boolean hidden = fileName.startsWith("_") || fileName.startsWith(".");
    return !hidden;
  }
};
// Hadoop shim layer used for file listings that carry file ids.
private static final HadoopShims SHIMS = ShimLoader.getHadoopShims();
/**
 * Build the path of a bucket file inside the given directory.
 * @param subdir the directory that holds the bucket file
 * @param bucket the bucket number, formatted with {@link #BUCKET_DIGITS}
 * @return {@code subdir/bucket_NNNNN}
 */
public static Path createBucketFile(Path subdir, int bucket) {
  final String fileName = BUCKET_PREFIX + String.format(BUCKET_DIGITS, bucket);
  return new Path(subdir, fileName);
}
/**
 * Delta directory name without a statement id; this is the format used prior to Hive 1.3.x.
 * @param min the minimum transaction id
 * @param max the maximum transaction id
 * @return {@code delta_<min>_<max>} with both ids formatted by {@link #DELTA_DIGITS}
 */
public static String deltaSubdir(long min, long max) {
  final StringBuilder name = new StringBuilder(DELTA_PREFIX);
  name.append(String.format(DELTA_DIGITS, min));
  name.append("_");
  name.append(String.format(DELTA_DIGITS, max));
  return name.toString();
}
/**
 * Delta directory name that carries a statement id; each write statement in a
 * transaction creates its own delta dir.
 * @param min the minimum transaction id
 * @param max the maximum transaction id
 * @param statementId the statement id, formatted with {@link #STATEMENT_DIGITS}
 * @return {@code delta_<min>_<max>_<statementId>}
 * @since 1.3.x
 */
public static String deltaSubdir(long min, long max, int statementId) {
  final String txnRangeName = deltaSubdir(min, max);
  final String statementSuffix = String.format(STATEMENT_DIGITS, statementId);
  return txnRangeName + "_" + statementSuffix;
}
/**
 * Delete delta directory name without a statement id; this is the format used prior
 * to Hive 2.2.x.
 * @param min the minimum transaction id
 * @param max the maximum transaction id
 * @return {@code delete_delta_<min>_<max>} with both ids formatted by {@link #DELTA_DIGITS}
 */
@VisibleForTesting
static String deleteDeltaSubdir(long min, long max) {
  final StringBuilder name = new StringBuilder(DELETE_DELTA_PREFIX);
  name.append(String.format(DELTA_DIGITS, min));
  name.append("_");
  name.append(String.format(DELTA_DIGITS, max));
  return name.toString();
}
/**
 * Delete delta directory name that carries a statement id. Each write statement in a
 * transaction creates its own delete delta dir when the split-update acid operational
 * property is turned on.
 * @param min the minimum transaction id
 * @param max the maximum transaction id
 * @param statementId the statement id, formatted with {@link #STATEMENT_DIGITS}
 * @return {@code delete_delta_<min>_<max>_<statementId>}
 * @since 2.2.x
 */
@VisibleForTesting
static String deleteDeltaSubdir(long min, long max, int statementId) {
  final String txnRangeName = deleteDeltaSubdir(min, max);
  final String statementSuffix = String.format(STATEMENT_DIGITS, statementId);
  return txnRangeName + "_" + statementSuffix;
}
/**
 * Name of the base directory for the given transaction id.
 * @param txnId the transaction id, formatted with {@link #DELTA_DIGITS}
 * @return {@code base_<txnId>}
 */
public static String baseDir(long txnId) {
  final String paddedTxnId = String.format(DELTA_DIGITS, txnId);
  return BASE_PREFIX + paddedTxnId;
}
/**
 * Compute the path of the file that stores a bucket for the given write options.
 * <ul>
 *   <li>old style: {@code directory/<6-digit bucket>_0}</li>
 *   <li>writing a base: {@code directory/base_<maxTxn>/bucket_<bucket>}</li>
 *   <li>otherwise: a delta or delete delta directory, with a statement id suffix
 *       unless the statement id is -1</li>
 * </ul>
 * @param directory the partition directory
 * @param options the options for writing the bucket
 * @return the filename that should store the bucket
 */
public static Path createFilename(Path directory,
                                  AcidOutputFormat.Options options) {
  if (options.getOldStyle()) {
    final String legacyName =
        String.format(LEGACY_FILE_BUCKET_DIGITS, options.getBucket()) + "_0";
    return new Path(directory, legacyName);
  }
  final String subdir;
  if (options.isWritingBase()) {
    subdir = BASE_PREFIX + String.format(DELTA_DIGITS, options.getMaximumTransactionId());
  } else {
    final int statementId = options.getStatementId();
    final boolean deleteDelta = options.isWritingDeleteDelta();
    final long minTxn = options.getMinimumTransactionId();
    final long maxTxn = options.getMaximumTransactionId();
    if (statementId == -1) {
      // When minor compaction runs, the per statement delta files inside a single
      // transaction are collapsed, so no statementId is needed in the name.
      if (deleteDelta) {
        subdir = deleteDeltaSubdir(minTxn, maxTxn);
      } else {
        subdir = deltaSubdir(minTxn, maxTxn);
      }
    } else if (deleteDelta) {
      subdir = deleteDeltaSubdir(minTxn, maxTxn, statementId);
    } else {
      subdir = deltaSubdir(minTxn, maxTxn, statementId);
    }
  }
  return createBucketFile(new Path(directory, subdir), options.getBucket());
}
/**
 * Extract the transaction id encoded in a base directory name.
 * @param path the base directory; only its last component is examined
 * @return the maximum transaction id that is included
 * @throws IllegalArgumentException if the name does not start with {@link #BASE_PREFIX}
 */
static long parseBase(Path path) {
  final String name = path.getName();
  if (!name.startsWith(BASE_PREFIX)) {
    throw new IllegalArgumentException(name + " does not start with " +
        BASE_PREFIX);
  }
  final String txnPart = name.substring(BASE_PREFIX.length());
  return Long.parseLong(txnPart);
}
/**
 * Reconstruct, from a bucket file path, the options that would have created the file.
 * Three name shapes are recognised: an original file name ({@code <digits>_<digits>}),
 * a {@code bucket_} file (whose parent directory decides between base, delta and
 * delete delta), and anything else, which is reported as old style with bucket -1.
 * @param bucketFile the path to a bucket file
 * @param conf the configuration
 * @return the options used to create that filename
 */
public static AcidOutputFormat.Options
    parseBaseOrDeltaBucketFilename(Path bucketFile,
                                   Configuration conf) {
  final AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf);
  final String name = bucketFile.getName();

  if (ORIGINAL_PATTERN.matcher(name).matches()) {
    // Original file: the bucket number is the part before the first '_'.
    final int originalBucket = Integer.parseInt(name.substring(0, name.indexOf('_')));
    options
        .setOldStyle(true)
        .minimumTransactionId(0)
        .maximumTransactionId(0)
        .bucket(originalBucket)
        .writingBase(true);
    return options;
  }

  if (!name.startsWith(BUCKET_PREFIX)) {
    // Unrecognised name.
    options.setOldStyle(true).bucket(-1).minimumTransactionId(0)
        .maximumTransactionId(0);
    return options;
  }

  // bucket_ file: the bucket number is everything after the first '_'.
  final int bucket = Integer.parseInt(name.substring(name.indexOf('_') + 1));
  final Path parentDir = bucketFile.getParent();
  final String parentName = parentDir.getName();
  if (parentName.startsWith(BASE_PREFIX)) {
    options
        .setOldStyle(false)
        .minimumTransactionId(0)
        .maximumTransactionId(parseBase(parentDir))
        .bucket(bucket)
        .writingBase(true);
  } else if (parentName.startsWith(DELTA_PREFIX)) {
    final ParsedDelta insertDelta = parsedDelta(parentDir, DELTA_PREFIX);
    options
        .setOldStyle(false)
        .minimumTransactionId(insertDelta.minTransaction)
        .maximumTransactionId(insertDelta.maxTransaction)
        .bucket(bucket);
  } else if (parentName.startsWith(DELETE_DELTA_PREFIX)) {
    final ParsedDelta deleteDelta = parsedDelta(parentDir, DELETE_DELTA_PREFIX);
    options
        .setOldStyle(false)
        .minimumTransactionId(deleteDelta.minTransaction)
        .maximumTransactionId(deleteDelta.maxTransaction)
        .bucket(bucket);
  }
  // Any other parent directory leaves the freshly constructed options untouched.
  return options;
}
/**
* Kinds of operation distinguished by this class. See
* {@link #toDataOperationType(Operation)} for the mapping to the metastore type.
*/
public enum Operation {
NOT_ACID, INSERT, UPDATE, DELETE;
}
/**
 * Map an {@link Operation} to the metastore's {@link DataOperationType}.
 * Logically this belongs in {@link Operation}, but that would make the exec jar (on the
 * cluster) depend on the metastore package, which is not allowed. Call this only from
 * the client side, where metastore.* classes are present. Violations are not caught by
 * unit tests since they have all the jars loaded.
 * @param op the operation to convert
 * @return the matching metastore operation type; {@code NOT_ACID} maps to {@code UNSET}
 * @throws IllegalArgumentException for an operation that has no mapping
 */
public static DataOperationType toDataOperationType(Operation op) {
  final DataOperationType mapped;
  switch (op) {
    case NOT_ACID:
      mapped = DataOperationType.UNSET;
      break;
    case INSERT:
      mapped = DataOperationType.INSERT;
      break;
    case UPDATE:
      mapped = DataOperationType.UPDATE;
      break;
    case DELETE:
      mapped = DataOperationType.DELETE;
      break;
    default:
      throw new IllegalArgumentException("Unexpected Operation: " + op);
  }
  return mapped;
}
/**
* The kinds of file that can play the role of a base file; see
* {@link AcidBaseFileInfo} for the wrapper that pairs a file with its kind.
*/
public enum AcidBaseFileType {
COMPACTED_BASE, // a regular base file generated through major compaction
ORIGINAL_BASE, // a non-acid schema file for tables that got converted to acid
INSERT_DELTA; // a delta file with only insert events that can be treated as base for split-update
}
/**
 * Pairs a base file with the kind of base file it is.
 * Orc splits can be generated on three kinds of base files: an original file (non-acid
 * converted files), a regular base file (created by major compaction) or an insert delta
 * (which can be treated as a base when split-update is enabled for acid).
 */
public static class AcidBaseFileInfo {
  private final HdfsFileStatusWithId statusWithId;
  private final AcidBaseFileType baseFileType;

  /**
   * @param fileId the file, possibly with its file id
   * @param acidBaseFileType the kind of base file this is
   */
  public AcidBaseFileInfo(HdfsFileStatusWithId fileId, AcidBaseFileType acidBaseFileType) {
    statusWithId = fileId;
    baseFileType = acidBaseFileType;
  }

  /** @return true if the file is a base generated through major compaction */
  public boolean isCompactedBase() {
    return AcidBaseFileType.COMPACTED_BASE == baseFileType;
  }

  /** @return true if the file is an original (non-acid schema) file */
  public boolean isOriginal() {
    return AcidBaseFileType.ORIGINAL_BASE == baseFileType;
  }

  /** @return true if the file is an insert delta */
  public boolean isInsertDelta() {
    return AcidBaseFileType.INSERT_DELTA == baseFileType;
  }

  /** @return the wrapped file status */
  public HdfsFileStatusWithId getHdfsFileStatusWithId() {
    return statusWithId;
  }
}
/**
 * Bit-encoded set of ACID operational properties. Each property occupies one bit of an
 * int ({@link #SPLIT_UPDATE_BIT}, {@link #HASH_BASED_MERGE_BIT}) and has a string name
 * used in the '|'-separated textual encoding.
 */
public static class AcidOperationalProperties {
  private int description = 0x00;
  public static final int SPLIT_UPDATE_BIT = 0x01;
  public static final String SPLIT_UPDATE_STRING = "split_update";
  public static final int HASH_BASED_MERGE_BIT = 0x02;
  public static final String HASH_BASED_MERGE_STRING = "hash_merge";
  public static final String DEFAULT_VALUE_STRING = TransactionalValidationListener.DEFAULT_TRANSACTIONAL_PROPERTY;
  public static final String LEGACY_VALUE_STRING = TransactionalValidationListener.LEGACY_TRANSACTIONAL_PROPERTY;

  // Instances are obtained through the static factory methods below.
  private AcidOperationalProperties() {
  }

  /**
   * Returns an acidOperationalProperties object that represents ACID behavior for legacy tables
   * that were created before ACID type system using operational properties was put in place.
   * @return the acidOperationalProperties object
   */
  public static AcidOperationalProperties getLegacy() {
    AcidOperationalProperties obj = new AcidOperationalProperties();
    // In legacy mode, none of these properties are turned on.
    return obj;
  }

  /**
   * Returns an acidOperationalProperties object that represents default ACID behavior for tables
   * that do not explicitly specify/override the default behavior: split update on,
   * hash-based merge off.
   * @return the acidOperationalProperties object.
   */
  public static AcidOperationalProperties getDefault() {
    AcidOperationalProperties obj = new AcidOperationalProperties();
    obj.setSplitUpdate(true);
    obj.setHashBasedMerge(false);
    return obj;
  }

  /**
   * Returns an acidOperationalProperties object that is represented by an encoded string.
   * {@code null} and {@link #LEGACY_VALUE_STRING} yield the legacy properties,
   * {@link #DEFAULT_VALUE_STRING} yields the default properties (both compared ignoring
   * case). Any other value is split on '|'; every non-blank element must, after trimming
   * surrounding whitespace, be one of the property names.
   * @param propertiesStr an encoded string representing the acidOperationalProperties.
   * @return the acidOperationalProperties object.
   * @throws IllegalArgumentException if an element is not a known property name
   */
  public static AcidOperationalProperties parseString(String propertiesStr) {
    if (propertiesStr == null) {
      return AcidOperationalProperties.getLegacy();
    }
    if (propertiesStr.equalsIgnoreCase(DEFAULT_VALUE_STRING)) {
      return AcidOperationalProperties.getDefault();
    }
    if (propertiesStr.equalsIgnoreCase(LEGACY_VALUE_STRING)) {
      return AcidOperationalProperties.getLegacy();
    }
    AcidOperationalProperties obj = new AcidOperationalProperties();
    for (String rawOption : propertiesStr.split("\\|")) {
      // Match on the trimmed text: the blank check already tolerates whitespace, so
      // "split_update | hash_merge" must not be rejected for its padding.
      final String option = rawOption.trim();
      if (option.isEmpty()) {
        continue; // ignore empty strings
      }
      switch (option) {
        case SPLIT_UPDATE_STRING:
          obj.setSplitUpdate(true);
          break;
        case HASH_BASED_MERGE_STRING:
          obj.setHashBasedMerge(true);
          break;
        default:
          throw new IllegalArgumentException(
              "Unexpected value " + option + " for ACID operational properties!");
      }
    }
    return obj;
  }

  /**
   * Returns an acidOperationalProperties object that is represented by an encoded 32-bit integer.
   * Bits other than the two known property bits are ignored.
   * @param properties an encoded 32-bit representing the acidOperationalProperties.
   * @return the acidOperationalProperties object.
   */
  public static AcidOperationalProperties parseInt(int properties) {
    AcidOperationalProperties obj = new AcidOperationalProperties();
    if ((properties & SPLIT_UPDATE_BIT) > 0) {
      obj.setSplitUpdate(true);
    }
    if ((properties & HASH_BASED_MERGE_BIT) > 0) {
      obj.setHashBasedMerge(true);
    }
    return obj;
  }

  /**
   * Sets the split update property for ACID operations based on the boolean argument.
   * When split update is turned on, an update ACID event is interpreted as a combination of
   * delete event followed by an update event.
   * @param isSplitUpdate a boolean property that turns on split update when true.
   * @return the acidOperationalProperties object.
   */
  public AcidOperationalProperties setSplitUpdate(boolean isSplitUpdate) {
    description = (isSplitUpdate
        ? (description | SPLIT_UPDATE_BIT) : (description & ~SPLIT_UPDATE_BIT));
    return this;
  }

  /**
   * Sets the hash-based merge property for ACID operations that combines delta files using
   * GRACE hash join based approach, when turned on. (Currently unimplemented!)
   * @param isHashBasedMerge a boolean property that turns on hash-based merge when true.
   * @return the acidOperationalProperties object.
   */
  public AcidOperationalProperties setHashBasedMerge(boolean isHashBasedMerge) {
    description = (isHashBasedMerge
        ? (description | HASH_BASED_MERGE_BIT) : (description & ~HASH_BASED_MERGE_BIT));
    return this;
  }

  /** @return true if the split update bit is set */
  public boolean isSplitUpdate() {
    return (description & SPLIT_UPDATE_BIT) > 0;
  }

  /** @return true if the hash-based merge bit is set */
  public boolean isHashBasedMerge() {
    return (description & HASH_BASED_MERGE_BIT) > 0;
  }

  /** @return the bit-encoded representation accepted by {@link #parseInt(int)} */
  public int toInt() {
    return description;
  }

  /**
   * @return the '|'-prefixed names of the properties that are set, in the form accepted
   * by {@link #parseString(String)}; the empty string when none is set
   */
  @Override
  public String toString() {
    StringBuilder str = new StringBuilder();
    if (isSplitUpdate()) {
      str.append("|").append(SPLIT_UPDATE_STRING);
    }
    if (isHashBasedMerge()) {
      str.append("|").append(HASH_BASED_MERGE_STRING);
    }
    return str.toString();
  }
}
/**
 * The ACID state of a partition directory: the base to read, the original files, the
 * delta directories that are current, and everything that is obsolete.
 */
public static interface Directory {
  /**
   * Get the base directory.
   * @return the base directory to read
   */
  Path getBaseDirectory();

  /**
   * Get the list of original files. Not {@code null}.
   * @return the list of original files (eg. 000000_0)
   */
  List<HdfsFileStatusWithId> getOriginalFiles();

  /**
   * Get the list of base and delta directories that are valid and not
   * obsolete. Not {@code null}. List must be sorted in a specific way.
   * See {@link ParsedDelta#compareTo(ParsedDelta)} for details.
   * @return the minimal list of current directories
   */
  List<ParsedDelta> getCurrentDirectories();

  /**
   * Get the list of obsolete directories. After filtering out bases and
   * deltas that are not selected by the valid transaction list, return the
   * list of original files, bases, and deltas that have been replaced by
   * more up to date ones. Not {@code null}.
   * @return the list of obsolete files and directories
   */
  List<FileStatus> getObsolete();
}
/**
 * The information encoded in the name of a delta (or delete delta) directory: the
 * transaction id range, the optional statement id and the kind of delta, together with
 * the directory's file status when one is known.
 */
public static class ParsedDelta implements Comparable<ParsedDelta> {
  private final long minTransaction;
  private final long maxTransaction;
  // May be null: parsedDelta(Path, String) creates instances without a file status.
  private final FileStatus path;
  //-1 is for internal (getAcidState()) purposes and means the delta dir
  //had no statement ID
  private final int statementId;
  private final boolean isDeleteDelta; // records whether delta dir is of type 'delete_delta_x_y...'

  /**
   * for pre 1.3.x delta files
   */
  ParsedDelta(long min, long max, FileStatus path, boolean isDeleteDelta) {
    this(min, max, path, -1, isDeleteDelta);
  }

  ParsedDelta(long min, long max, FileStatus path, int statementId, boolean isDeleteDelta) {
    this.minTransaction = min;
    this.maxTransaction = max;
    this.path = path;
    this.statementId = statementId;
    this.isDeleteDelta = isDeleteDelta;
  }

  public long getMinTransaction() {
    return minTransaction;
  }

  public long getMaxTransaction() {
    return maxTransaction;
  }

  /**
   * @return the path of the delta directory
   * @throws NullPointerException if this instance was created without a file status
   */
  public Path getPath() {
    return path.getPath();
  }

  /**
   * @return the statement id, with the internal "no statement id" marker (-1) reported as 0
   */
  public int getStatementId() {
    return statementId == -1 ? 0 : statementId;
  }

  public boolean isDeleteDelta() {
    return isDeleteDelta;
  }

  /**
   * Compactions (Major/Minor) merge deltas/bases but delete of old files
   * happens in a different process; thus it's possible to have bases/deltas with
   * overlapping txnId boundaries. The sort order helps figure out the "best" set of files
   * to use to get data.
   * This sorts "wider" delta before "narrower" i.e. delta_5_20 sorts before delta_5_10 (and delta_11_20)
   */
  @Override
  public int compareTo(ParsedDelta parsedDelta) {
    if (minTransaction != parsedDelta.minTransaction) {
      // Ascending by minimum transaction id.
      return minTransaction < parsedDelta.minTransaction ? -1 : 1;
    }
    if (maxTransaction != parsedDelta.maxTransaction) {
      // Descending by maximum transaction id, so the wider range comes first.
      return maxTransaction < parsedDelta.maxTransaction ? 1 : -1;
    }
    if (statementId != parsedDelta.statementId) {
      /*
       * We want deltas after minor compaction (w/o statementId) to sort
       * earlier so that getAcidState() considers compacted files (into larger ones) obsolete
       * Before compaction, include deltas with all statementIds for a given txnId
       * in a Directory
       */
      return statementId < parsedDelta.statementId ? -1 : 1;
    }
    // Same range and statement id (e.g. delta_x_y vs delete_delta_x_y): order by path.
    return path.compareTo(parsedDelta.path);
  }
}
/**
 * Convert a list of deltas to a list of delta directories.
 * @param deltas the list of deltas out of a Directory object.
 * @return a list of delta directory paths that need to be read, in the order of
 *         {@code deltas}
 */
public static Path[] getPaths(List<ParsedDelta> deltas) {
  Path[] result = new Path[deltas.size()];
  for (int i = 0; i < result.length; ++i) {
    result[i] = deltas.get(i).getPath();
  }
  return result;
}
/**
 * Convert the list of deltas into an equivalent list of begin/end
 * transaction id pairs. Assumes {@code deltas} is sorted. Consecutive deltas with the
 * same transaction id range are folded into one entry whose statement id list grows
 * by one id per delta.
 * @param deltas the deltas to convert
 * @return the list of transaction ids to serialize
 */
public static List<AcidInputFormat.DeltaMetaData> serializeDeltas(List<ParsedDelta> deltas) {
  List<AcidInputFormat.DeltaMetaData> result = new ArrayList<>(deltas.size());
  AcidInputFormat.DeltaMetaData last = null;
  for (ParsedDelta parsedDelta : deltas) {
    if (last != null && last.getMinTxnId() == parsedDelta.getMinTransaction()
        && last.getMaxTxnId() == parsedDelta.getMaxTransaction()) {
      // Same range as the previous entry: just record one more statement id.
      // Note getStatementId() reports the internal "no statement id" marker as 0.
      last.getStmtIds().add(parsedDelta.getStatementId());
      continue;
    }
    last = new AcidInputFormat.DeltaMetaData(parsedDelta.getMinTransaction(),
        parsedDelta.getMaxTransaction(), new ArrayList<Integer>());
    result.add(last);
    // A delta without a statement id starts an entry with an empty statement id list.
    if (parsedDelta.statementId >= 0) {
      last.getStmtIds().add(parsedDelta.getStatementId());
    }
  }
  return result;
}
/**
 * Convert the list of begin/end transaction id pairs to a list of delta
 * directories. Note that there may be multiple delta files for the exact same txn range starting
 * with 1.3.x;
 * see {@link org.apache.hadoop.hive.ql.io.AcidUtils#deltaSubdir(long, long, int)}
 * @param root the root directory
 * @param deltas list of begin/end transaction id pairs
 * @return the list of delta paths: one per statement id, or a single path without
 *         statement id for an entry whose statement id list is empty
 */
public static Path[] deserializeDeltas(Path root,
    final List<AcidInputFormat.DeltaMetaData> deltas) throws IOException {
  List<Path> results = new ArrayList<Path>(deltas.size());
  for (AcidInputFormat.DeltaMetaData dmd : deltas) {
    if (dmd.getStmtIds().isEmpty()) {
      results.add(new Path(root, deltaSubdir(dmd.getMinTxnId(), dmd.getMaxTxnId())));
      continue;
    }
    for (Integer stmtId : dmd.getStmtIds()) {
      results.add(new Path(root, deltaSubdir(dmd.getMinTxnId(), dmd.getMaxTxnId(), stmtId)));
    }
  }
  return results.toArray(new Path[results.size()]);
}
/**
 * Convert the list of begin/end transaction id pairs to a list of delete delta
 * directories. Note that there may be multiple delete_delta files for the exact same txn range starting
 * with 2.2.x;
 * see {@link org.apache.hadoop.hive.ql.io.AcidUtils#deleteDeltaSubdir(long, long, int)}
 * @param root the root directory
 * @param deleteDeltas list of begin/end transaction id pairs
 * @return the list of delete delta paths: one per statement id, or a single path without
 *         statement id for an entry whose statement id list is empty
 */
public static Path[] deserializeDeleteDeltas(Path root,
    final List<AcidInputFormat.DeltaMetaData> deleteDeltas) throws IOException {
  List<Path> results = new ArrayList<Path>(deleteDeltas.size());
  for (AcidInputFormat.DeltaMetaData dmd : deleteDeltas) {
    if (dmd.getStmtIds().isEmpty()) {
      results.add(new Path(root, deleteDeltaSubdir(dmd.getMinTxnId(), dmd.getMaxTxnId())));
      continue;
    }
    for (Integer stmtId : dmd.getStmtIds()) {
      results.add(new Path(root, deleteDeltaSubdir(dmd.getMinTxnId(), dmd.getMaxTxnId(), stmtId)));
    }
  }
  return results.toArray(new Path[results.size()]);
}
/**
 * Parse a delta directory name, choosing the prefix from the name itself.
 * @param deltaDir the delta or delete delta directory
 * @return the parsed delta (without a file status)
 */
public static ParsedDelta parsedDelta(Path deltaDir) {
  // Anything that is not a delete delta is parsed with the plain delta prefix.
  final String prefix = deltaDir.getName().startsWith(DELETE_DELTA_PREFIX)
      ? DELETE_DELTA_PREFIX
      : DELTA_PREFIX;
  return parsedDelta(deltaDir, prefix);
}
/**
 * Parse the name of a delta directory and attach its file status to the result.
 * @param path the status of the delta directory
 * @param deltaPrefix the prefix the directory name is expected to start with
 * @return the parsed delta, carrying {@code path}
 */
private static ParsedDelta parseDelta(FileStatus path, String deltaPrefix) {
  final ParsedDelta nameOnly = parsedDelta(path.getPath(), deltaPrefix);
  final boolean deleteDelta = deltaPrefix.equals(DELETE_DELTA_PREFIX);
  final long min = nameOnly.getMinTransaction();
  final long max = nameOnly.getMaxTransaction();
  // Copy the raw statement id so that the "no statement id" marker (-1) is kept.
  return new ParsedDelta(min, max, path, nameOnly.statementId, deleteDelta);
}
/**
 * Parse a directory name of the form {@code <prefix><min>_<max>} or
 * {@code <prefix><min>_<max>_<statementId>}.
 * @param deltaDir the delta directory; only its last component is examined
 * @param deltaPrefix the prefix the name must start with
 * @return the parsed delta; it carries no file status
 * @throws IllegalArgumentException if the name does not start with {@code deltaPrefix}
 */
public static ParsedDelta parsedDelta(Path deltaDir, String deltaPrefix) {
  final String dirName = deltaDir.getName();
  final boolean deleteDelta = deltaPrefix.equals(DELETE_DELTA_PREFIX);
  if (!dirName.startsWith(deltaPrefix)) {
    throw new IllegalArgumentException(deltaDir + " does not start with " +
        deltaPrefix);
  }
  final String txnPart = dirName.substring(deltaPrefix.length());
  final int firstSep = txnPart.indexOf('_');
  final int secondSep = txnPart.indexOf('_', firstSep + 1);//may be -1 if no statementId
  final long min = Long.parseLong(txnPart.substring(0, firstSep));
  if (secondSep == -1) {
    final long maxNoStmt = Long.parseLong(txnPart.substring(firstSep + 1));
    return new ParsedDelta(min, maxNoStmt, null, deleteDelta);
  }
  final long max = Long.parseLong(txnPart.substring(firstSep + 1, secondSep));
  final int statementId = Integer.parseInt(txnPart.substring(secondSep + 1));
  return new ParsedDelta(min, max, null, statementId, deleteDelta);
}
/**
 * Is the given directory in ACID format? It is when it has at least one child
 * directory whose name starts with the base, delta or delete delta prefix.
 * @param directory the partition directory to check
 * @param conf the query configuration
 * @return true, if it is an ACID directory
 * @throws IOException propagated from the file system calls
 */
public static boolean isAcid(Path directory,
                             Configuration conf) throws IOException {
  final FileSystem fs = directory.getFileSystem(conf);
  final FileStatus[] children = fs.listStatus(directory);
  for (FileStatus child : children) {
    final String name = child.getPath().getName();
    final boolean acidName = name.startsWith(BASE_PREFIX)
        || name.startsWith(DELTA_PREFIX)
        || name.startsWith(DELETE_DELTA_PREFIX);
    if (acidName && child.isDir()) {
      return true;
    }
  }
  return false;
}
/**
 * Convenience overload that neither uses file ids nor ignores empty files.
 * @param directory the partition directory to analyze
 * @param conf the configuration
 * @param txnList the list of transactions that we are reading
 * @return the state of the directory
 * @throws IOException propagated from the full overload
 */
@VisibleForTesting
public static Directory getAcidState(Path directory,
                                     Configuration conf,
                                     ValidTxnList txnList
                                     ) throws IOException {
  final boolean useFileIds = false;
  final boolean ignoreEmptyFiles = false;
  return getAcidState(directory, conf, txnList, useFileIds, ignoreEmptyFiles);
}
/** State class for getChildState; cannot modify 2 things in a method. */
private static class TxnBase {
// Status of the base directory currently selected; null while none has been accepted.
private FileStatus status;
// Transaction id of the selected base directory; stays 0 while none is selected.
private long txn = 0;
// Smallest base transaction id seen so far; kept for error reporting.
private long oldestBaseTxnId = Long.MAX_VALUE;
// Path of the base directory with the smallest transaction id seen so far.
private Path oldestBase = null;
}
/**
 * Get the ACID state of the given directory. It finds the minimal set of
 * base and diff directories. Note that because major compactions don't
 * preserve the history, we can't use a base directory that includes a
 * transaction id that we must exclude.
 * @param directory the partition directory to analyze
 * @param conf the configuration
 * @param txnList the list of transactions that we are reading
 * @param useFileIds whether to list the directory through the API that returns file ids
 * @param ignoreEmptyFiles whether zero-length original files are left out
 * @return the state of the directory
 * @throws IOException propagated from the overload taking a {@code Ref}
 */
public static Directory getAcidState(Path directory,
                                     Configuration conf,
                                     ValidTxnList txnList,
                                     boolean useFileIds,
                                     boolean ignoreEmptyFiles
                                     ) throws IOException {
  // Wrap the flag so that the callee can use its in/out reference parameter.
  final Ref<Boolean> useFileIdsRef = Ref.from(useFileIds);
  return getAcidState(directory, conf, txnList, useFileIdsRef, ignoreEmptyFiles);
}
/**
 * Get the ACID state of the given directory: the best usable base, the deltas that
 * must be read on top of it, the original files, and everything that is obsolete.
 * @param directory the partition directory to analyze
 * @param conf the configuration
 * @param txnList the list of transactions that we are reading
 * @param useFileIds in/out flag. When its value is {@code true} or {@code null} the
 *        listing API that returns file ids is tried first, falling back to the regular
 *        API on any failure. When the value is {@code null} it is set to {@code true}
 *        after a successful listing and to {@code false} after an
 *        {@link UnsupportedOperationException}.
 * @param ignoreEmptyFiles whether zero-length original files are left out
 * @return the state of the directory
 * @throws IOException if base directories exist but none is valid for {@code txnList},
 *         or propagated from the file system calls
 */
public static Directory getAcidState(Path directory,
                                     Configuration conf,
                                     ValidTxnList txnList,
                                     Ref<Boolean> useFileIds,
                                     boolean ignoreEmptyFiles
                                     ) throws IOException {
  FileSystem fs = directory.getFileSystem(conf);
  // The following 'deltas' includes all kinds of delta files including insert & delete deltas.
  final List<ParsedDelta> deltas = new ArrayList<ParsedDelta>();
  List<ParsedDelta> working = new ArrayList<ParsedDelta>();
  List<FileStatus> originalDirectories = new ArrayList<FileStatus>();
  final List<FileStatus> obsolete = new ArrayList<FileStatus>();
  List<HdfsFileStatusWithId> childrenWithId = null;
  Boolean val = useFileIds.value;
  if (val == null || val) {
    try {
      childrenWithId = SHIMS.listLocatedHdfsStatus(fs, directory, hiddenFileFilter);
      if (val == null) {
        useFileIds.value = true;
      }
    } catch (Throwable t) {
      // Best effort: fall back to the regular listing below, but keep the stack trace.
      LOG.error("Failed to get files with ID; using regular API: " + t.getMessage(), t);
      if (val == null && t instanceof UnsupportedOperationException) {
        useFileIds.value = false;
      }
    }
  }
  TxnBase bestBase = new TxnBase();
  final List<HdfsFileStatusWithId> original = new ArrayList<>();
  if (childrenWithId != null) {
    for (HdfsFileStatusWithId child : childrenWithId) {
      getChildState(child.getFileStatus(), child, txnList, working,
          originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles);
    }
  } else {
    List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, directory, hiddenFileFilter);
    for (FileStatus child : children) {
      getChildState(
          child, null, txnList, working, originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles);
    }
  }
  // If we have a base, the original files are obsolete.
  if (bestBase.status != null) {
    // Add original files to obsolete list if any
    for (HdfsFileStatusWithId fswid : original) {
      obsolete.add(fswid.getFileStatus());
    }
    // Add original directories to obsolete list if any
    obsolete.addAll(originalDirectories);
    // remove the entries so we don't get confused later and think we should
    // use them.
    original.clear();
    originalDirectories.clear();
  } else {
    // Okay, we're going to need these originals. Recurse through them and figure out what we
    // really need.
    for (FileStatus origDir : originalDirectories) {
      findOriginals(fs, origDir, original, useFileIds);
    }
  }
  Collections.sort(working);
  //so now, 'working' should be sorted like delta_5_20 delta_5_10 delta_11_20 delta_51_60 for example
  //and we want to end up with the best set containing all relevant data: delta_5_20 delta_51_60,
  //subject to list of 'exceptions' in 'txnList' (not show in above example).
  long current = bestBase.txn;
  int lastStmtId = -1;
  ParsedDelta prev = null;
  for (ParsedDelta next : working) {
    if (next.maxTransaction > current) {
      // are any of the new transactions ones that we care about?
      if (txnList.isTxnRangeValid(current + 1, next.maxTransaction) !=
          ValidTxnList.RangeResponse.NONE) {
        deltas.add(next);
        current = next.maxTransaction;
        lastStmtId = next.statementId;
        prev = next;
      }
    }
    else if (next.maxTransaction == current && lastStmtId >= 0) {
      //make sure to get all deltas within a single transaction; multi-statement txn
      //generate multiple delta files with the same txnId range
      //of course, if maxTransaction has already been minor compacted, all per statement deltas are obsolete
      deltas.add(next);
      prev = next;
    }
    else if (prev != null && next.maxTransaction == prev.maxTransaction
        && next.minTransaction == prev.minTransaction
        && next.statementId == prev.statementId) {
      // The 'next' parsedDelta may have everything equal to the 'prev' parsedDelta, except
      // the path. This may happen when we have split update and we have two types of delta
      // directories- 'delta_x_y' and 'delete_delta_x_y' for the SAME txn range.
      // Also note that any delete_deltas in between a given delta_x_y range would be made
      // obsolete. For example, a delta_30_50 would make delete_delta_40_40 obsolete.
      // This is valid because minor compaction always compacts the normal deltas and the delete
      // deltas for the same range. That is, if we had 3 directories, delta_30_30,
      // delete_delta_40_40 and delta_50_50, then running minor compaction would produce
      // delta_30_50 and delete_delta_30_50.
      deltas.add(next);
      prev = next;
    }
    else {
      obsolete.add(next.path);
    }
  }
  if (bestBase.oldestBase != null && bestBase.status == null) {
    /*
     * If here, it means there was a base_x (> 1 perhaps) but none were suitable for given
     * txnList. Note that 'original' files are logically a base_Long.MIN_VALUE and thus
     * cannot have any data for an open txn. We could check deltas has files to cover
     * [1,n] w/o gaps but this would almost never happen...*/
    long[] exceptions = txnList.getInvalidTransactions();
    String minOpenTxn = exceptions != null && exceptions.length > 0 ?
        Long.toString(exceptions[0]) : "x";
    throw new IOException(ErrorMsg.ACID_NOT_ENOUGH_HISTORY.format(
        Long.toString(txnList.getHighWatermark()),
        minOpenTxn, bestBase.oldestBase.toString()));
  }
  final Path base = bestBase.status == null ? null : bestBase.status.getPath();
  LOG.debug("in directory {} base = {} deltas = {}",
      directory.toUri().toString(), base, deltas.size());
  return new Directory() {
    @Override
    public Path getBaseDirectory() {
      return base;
    }
    @Override
    public List<HdfsFileStatusWithId> getOriginalFiles() {
      return original;
    }
    @Override
    public List<ParsedDelta> getCurrentDirectories() {
      return deltas;
    }
    @Override
    public List<FileStatus> getObsolete() {
      return obsolete;
    }
  };
}
/**
 * Decide whether a base can be used for the given reader snapshot.
 * We can only use a 'base' if it doesn't have an open txn (from specific reader's point
 * of view). A 'base' with open txn in its range doesn't have 'enough history' info to
 * produce a correct snapshot for this reader.
 * Note that such base is NOT obsolete. Obsolete files are those that are "covered" by
 * other files within the snapshot.
 * @param baseTxnId the transaction id parsed from the base directory name
 * @param txnList the reader's transaction list
 * @return true if the base may be used
 */
private static boolean isValidBase(long baseTxnId, ValidTxnList txnList) {
  // A base with id Long.MIN_VALUE is created by the 1st compaction in case of non-acid
  // to acid table conversion. By definition there are no open txns with id < 1.
  final boolean fromConversion = baseTxnId == Long.MIN_VALUE;
  return fromConversion || txnList.isValidBase(baseTxnId);
}
/**
 * Classify one child of the partition directory and record it in the matching
 * collection.
 * @param child the status of the child
 * @param childWithId the same child together with its file id, or null when unavailable
 * @param txnList the list of transactions that we are reading
 * @param working receives delta and delete delta directories whose txn range is not
 *        entirely invalid
 * @param originalDirectories receives directories that are neither base nor delta
 * @param original receives plain files (empty ones only if not ignored)
 * @param obsolete receives base directories superseded by a better one
 * @param bestBase tracks the best valid base and the oldest base seen so far
 * @param ignoreEmptyFiles whether zero-length files are left out of {@code original}
 * @throws IOException declared for callers; none of the visible statements throw it directly
 */
private static void getChildState(FileStatus child, HdfsFileStatusWithId childWithId,
    ValidTxnList txnList, List<ParsedDelta> working, List<FileStatus> originalDirectories,
    List<HdfsFileStatusWithId> original, List<FileStatus> obsolete, TxnBase bestBase,
    boolean ignoreEmptyFiles) throws IOException {
  Path p = child.getPath();
  String fn = p.getName();
  if (fn.startsWith(BASE_PREFIX) && child.isDir()) {
    long txn = parseBase(p);
    if (bestBase.oldestBaseTxnId > txn) {
      //keep track for error reporting
      bestBase.oldestBase = p;
      bestBase.oldestBaseTxnId = txn;
    }
    if (bestBase.status == null) {
      // First candidate: take it only if it is valid for this reader.
      if (isValidBase(txn, txnList)) {
        bestBase.status = child;
        bestBase.txn = txn;
      }
    } else if (bestBase.txn < txn) {
      // Newer candidate: it replaces the current best only if it is valid; an invalid
      // newer base is neither used nor marked obsolete.
      if (isValidBase(txn, txnList)) {
        obsolete.add(bestBase.status);
        bestBase.status = child;
        bestBase.txn = txn;
      }
    } else {
      // Not newer than the current best base.
      obsolete.add(child);
    }
  } else if ((fn.startsWith(DELTA_PREFIX) || fn.startsWith(DELETE_DELTA_PREFIX))
      && child.isDir()) {
    String deltaPrefix =
        (fn.startsWith(DELTA_PREFIX)) ? DELTA_PREFIX : DELETE_DELTA_PREFIX;
    ParsedDelta delta = parseDelta(child, deltaPrefix);
    if (txnList.isTxnRangeValid(delta.minTransaction,
        delta.maxTransaction) !=
        ValidTxnList.RangeResponse.NONE) {
      working.add(delta);
    }
  } else if (child.isDir()) {
    // This is just the directory. We need to recurse and find the actual files. But don't
    // do this until we have determined there is no base. This saves time. Plus,
    // it is possible that the cleaner is running and removing these original files,
    // in which case recursing through them could cause us to get an error.
    originalDirectories.add(child);
  } else if (!ignoreEmptyFiles || child.getLen() != 0) {
    original.add(createOriginalObj(childWithId, child));
  }
}
/**
 * Chooses the object that represents an 'original' file.
 *
 * @param childWithId the file status carrying a file id, or null when none is available
 * @param child the plain file status, used only when childWithId is null
 * @return childWithId when it is not null, otherwise a wrapper around child that reports
 *         no file id
 */
public static HdfsFileStatusWithId createOriginalObj(
    HdfsFileStatusWithId childWithId, FileStatus child) {
  if (childWithId == null) {
    return new HdfsFileStatusWithoutId(child);
  }
  return childWithId;
}
/**
 * HdfsFileStatusWithId implementation backed by a plain FileStatus for which no file id is
 * known; getFileId() always returns null.
 */
private static class HdfsFileStatusWithoutId implements HdfsFileStatusWithId {
  /** The wrapped status, handed back unchanged by getFileStatus(). */
  private final FileStatus fs;

  public HdfsFileStatusWithoutId(FileStatus wrapped) {
    this.fs = wrapped;
  }

  /** Always null: this wrapper carries no file id. */
  @Override
  public Long getFileId() {
    return null;
  }

  @Override
  public FileStatus getFileStatus() {
    return fs;
  }
}
/**
 * Find the original files (non-ACID layout) recursively under the partition directory.
 * The listing is first attempted through the shim call that also returns file ids; the regular
 * listing is used when that call is disabled, fails, or returns null.
 * @param fs the file system
 * @param stat the directory to scan
 * @param original the list that receives the original files
 * @param useFileIds tri-state switch for the file-id listing: false skips it, true uses it,
 *                   null means undecided, in which case the outcome of the first attempt is
 *                   stored here (true on success, false on UnsupportedOperationException)
 * @throws IOException
 */
private static void findOriginals(FileSystem fs, FileStatus stat,
    List original, Ref useFileIds) throws IOException {
  assert stat.isDir();
  final Path dir = stat.getPath();
  Boolean idListingMode = useFileIds.value;
  final boolean undecided = idListingMode == null;
  List entriesWithId = null;
  if (undecided || idListingMode) {
    try {
      entriesWithId = SHIMS.listLocatedHdfsStatus(fs, dir, hiddenFileFilter);
      if (undecided) {
        // First successful attempt: remember that the file-id listing works.
        useFileIds.value = true;
      }
    } catch (Throwable t) {
      LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
      if (undecided && t instanceof UnsupportedOperationException) {
        // The API is not supported at all: do not try it again.
        useFileIds.value = false;
      }
    }
  }
  if (entriesWithId == null) {
    // Fallback: regular listing; files are wrapped so that they report no file id.
    List plainEntries = HdfsUtils.listLocatedStatus(fs, dir, hiddenFileFilter);
    for (FileStatus entry : plainEntries) {
      if (!entry.isDir()) {
        original.add(createOriginalObj(null, entry));
      } else {
        findOriginals(fs, entry, original, useFileIds);
      }
    }
    return;
  }
  for (HdfsFileStatusWithId entry : entriesWithId) {
    if (!entry.getFileStatus().isDir()) {
      original.add(entry);
    } else {
      findOriginals(fs, entry.getFileStatus(), original, useFileIds);
    }
  }
}
/**
 * Tells whether the given table properties mark the table as transactional.
 * The property is looked up under its declared key first and, if absent, under the upper-cased
 * key; the value found is compared to "true" ignoring case.
 *
 * @param props the properties to inspect
 * @return true only if the transactional property is present and equals "true" (any case)
 */
public static boolean isTablePropertyTransactional(Properties props) {
  final String key = hive_metastoreConstants.TABLE_IS_TRANSACTIONAL;
  String resultStr = props.getProperty(key);
  if (resultStr == null) {
    // Locale.ROOT keeps the upper-cased key independent of the JVM default locale; with a
    // Turkish default locale a plain toUpperCase() would map 'i' to a dotted capital I.
    resultStr = props.getProperty(key.toUpperCase(java.util.Locale.ROOT));
  }
  // Constant-first comparison is null-safe.
  return "true".equalsIgnoreCase(resultStr);
}
/**
 * Tells whether the given parameter map marks the table as transactional.
 * The entry is looked up under its declared key first and, if absent, under the upper-cased
 * key; the value found is compared to "true" ignoring case.
 *
 * @param parameters the string-to-string parameters to inspect
 * @return true only if the transactional entry is present and equals "true" (any case)
 */
public static boolean isTablePropertyTransactional(Map<String, String> parameters) {
  final String key = hive_metastoreConstants.TABLE_IS_TRANSACTIONAL;
  String resultStr = parameters.get(key);
  if (resultStr == null) {
    // Locale.ROOT keeps the upper-cased key independent of the JVM default locale; with a
    // Turkish default locale a plain toUpperCase() would map 'i' to a dotted capital I.
    resultStr = parameters.get(key.toUpperCase(java.util.Locale.ROOT));
  }
  // Constant-first comparison is null-safe.
  return "true".equalsIgnoreCase(resultStr);
}
/**
 * Tells whether the given configuration marks the table as transactional.
 * The setting is looked up under its declared key first and, if absent, under the upper-cased
 * key; the value found is compared to "true" ignoring case.
 *
 * @param conf the configuration to inspect
 * @return true only if the transactional setting is present and equals "true" (any case)
 */
public static boolean isTablePropertyTransactional(Configuration conf) {
  final String key = hive_metastoreConstants.TABLE_IS_TRANSACTIONAL;
  String resultStr = conf.get(key);
  if (resultStr == null) {
    // Locale.ROOT keeps the upper-cased key independent of the JVM default locale; with a
    // Turkish default locale a plain toUpperCase() would map 'i' to a dotted capital I.
    resultStr = conf.get(key.toUpperCase(java.util.Locale.ROOT));
  }
  // Constant-first comparison is null-safe.
  return "true".equalsIgnoreCase(resultStr);
}
/**
 * Records in the given parameter map whether the table scan is over an ACID table.
 * The flag is stored as the string form of the boolean under the name of
 * ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.
 *
 * @param parameters mutable string-to-string map that receives the flag
 * @param isAcidTable value of the flag
 */
public static void setTransactionalTableScan(
    Map<String, String> parameters, boolean isAcidTable) {
  // The map is typed so that the put is checked by the compiler instead of being a raw call.
  parameters.put(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, Boolean.toString(isAcidTable));
}
/**
 * Records in the given configuration whether the table scan is over an ACID table.
 *
 * @param conf mutable configuration that receives the flag
 * @param isAcidTable value stored for ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN
 */
public static void setTransactionalTableScan(Configuration conf, boolean isAcidTable) {
  final ConfVars scanFlag = ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN;
  HiveConf.setBoolVar(conf, scanFlag, isAcidTable);
}
/** Checks if a table is a valid ACID table.
 * Note, users are responsible for using the correct TxnManager. We do not look at
 * SessionState.get().getTxnMgr().supportsAcid() here
 * The transactional property is read under its declared key first and, if absent, under the
 * upper-cased key; its value is compared to "true" ignoring case.
 * @param table table, may be null
 * @return true if table is a legit ACID table, false otherwise (including a null table)
 */
public static boolean isAcidTable(Table table) {
  if (table == null) {
    return false;
  }
  final String key = hive_metastoreConstants.TABLE_IS_TRANSACTIONAL;
  String tableIsTransactional = table.getProperty(key);
  if (tableIsTransactional == null) {
    // Locale.ROOT keeps the upper-cased key independent of the JVM default locale; with a
    // Turkish default locale a plain toUpperCase() would map 'i' to a dotted capital I.
    tableIsTransactional = table.getProperty(key.toUpperCase(java.util.Locale.ROOT));
  }
  // Constant-first comparison is null-safe.
  return "true".equalsIgnoreCase(tableIsTransactional);
}
/**
 * Stores the integer form of the acidOperationalProperties in the configuration object
 * argument. Nothing is written when no properties object is supplied.
 * @param conf Mutable configuration object
 * @param properties An acidOperationalProperties object to initialize from; may be null.
 */
public static void setAcidOperationalProperties(Configuration conf,
    AcidOperationalProperties properties) {
  if (properties == null) {
    return;
  }
  final int encoded = properties.toInt();
  HiveConf.setIntVar(conf, ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES, encoded);
}
/**
 * Sets the acidOperationalProperties in the map object argument.
 * The string form of the properties is stored under the name of
 * ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES; nothing is written when properties is null.
 * @param parameters Mutable string-to-string map object
 * @param properties An acidOperationalProperties object to initialize from; may be null.
 */
public static void setAcidOperationalProperties(
    Map<String, String> parameters, AcidOperationalProperties properties) {
  if (properties != null) {
    // The map is typed so that the put is checked by the compiler instead of being a raw call.
    parameters.put(ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname, properties.toString());
  }
}
/**
 * Looks up the acidOperationalProperties declared on a table.
 * @param table A table object
 * @return the properties parsed from the table's transactional-properties setting, or the
 *         legacy type when the table does not define that setting.
 */
public static AcidOperationalProperties getAcidOperationalProperties(Table table) {
  final String declared =
      table.getProperty(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES);
  return declared == null
      ? AcidOperationalProperties.getLegacy()
      : AcidOperationalProperties.parseString(declared);
}
/**
 * Reads the acidOperationalProperties stored in a configuration.
 * @param conf A configuration object
 * @return the acidOperationalProperties object parsed from the integer value of
 *         ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES in the configuration.
 */
public static AcidOperationalProperties getAcidOperationalProperties(Configuration conf) {
  // When the conf defines no transactional properties the value read here should be zero,
  // which parseInt() is expected to turn into the legacy type.
  final int encoded = HiveConf.getIntVar(conf, ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES);
  return AcidOperationalProperties.parseInt(encoded);
}
/**
 * Reads the acidOperationalProperties described by a set of properties.
 * @param props A properties object
 * @return the properties parsed from the transactional-properties entry, or the legacy type
 *         when the entry is not defined.
 */
public static AcidOperationalProperties getAcidOperationalProperties(Properties props) {
  final String declared =
      props.getProperty(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES);
  if (declared != null) {
    return AcidOperationalProperties.parseString(declared);
  }
  // No transactional properties defined: fall back to the legacy type.
  return AcidOperationalProperties.getLegacy();
}
/**
 * Returns the acidOperationalProperties for a given map.
 * @param parameters A string-to-string parameters object
 * @return the properties parsed from the transactional-properties entry of the map, or the
 *         legacy type when the map does not define that entry.
 */
public static AcidOperationalProperties getAcidOperationalProperties(
    Map<String, String> parameters) {
  // The map is typed so that the lookup yields a String; a raw Map would yield Object here.
  String resultStr = parameters.get(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES);
  if (resultStr == null) {
    // If the parameters does not define any transactional properties, we return a legacy type.
    return AcidOperationalProperties.getLegacy();
  }
  return AcidOperationalProperties.parseString(resultStr);
}
}