org.apache.hadoop.hive.ql.io.AcidUtils Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.shims.HadoopShims;
import org.apache.hadoop.hive.shims.ShimLoader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
/**
* Utilities that are shared by all of the ACID input and output formats. They
* are used by the compactor and cleaner and thus must be format agnostic.
*/
public class AcidUtils {
// This key will be put in the conf file when planning an acid operation
public static final String CONF_ACID_KEY = "hive.doing.acid";
public static final String BASE_PREFIX = "base_";
public static final String DELTA_PREFIX = "delta_";
public static final PathFilter deltaFileFilter = new PathFilter() {
@Override
public boolean accept(Path path) {
return path.getName().startsWith(DELTA_PREFIX);
}
};
public static final String BUCKET_PREFIX = "bucket_";
public static final PathFilter bucketFileFilter = new PathFilter() {
@Override
public boolean accept(Path path) {
return path.getName().startsWith(BUCKET_PREFIX);
}
};
public static final String BUCKET_DIGITS = "%05d";
public static final String DELTA_DIGITS = "%07d";
public static final Pattern BUCKET_DIGIT_PATTERN = Pattern.compile("[0-9]{5}$");
public static final Pattern LEGACY_BUCKET_DIGIT_PATTERN = Pattern.compile("^[0-9]{5}");
public static final PathFilter originalBucketFilter = new PathFilter() {
@Override
public boolean accept(Path path) {
return ORIGINAL_PATTERN.matcher(path.getName()).matches();
}
};
private AcidUtils() {
// NOT USED
}
private static final Log LOG = LogFactory.getLog(AcidUtils.class.getName());
private static final Pattern ORIGINAL_PATTERN =
Pattern.compile("[0-9]+_[0-9]+");
public static final PathFilter hiddenFileFilter = new PathFilter(){
public boolean accept(Path p){
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".");
}
};
private static final HadoopShims SHIMS = ShimLoader.getHadoopShims();
/**
* Create the bucket filename.
* @param subdir the subdirectory for the bucket.
* @param bucket the bucket number
* @return the filename
*/
public static Path createBucketFile(Path subdir, int bucket) {
return new Path(subdir,
BUCKET_PREFIX + String.format(BUCKET_DIGITS, bucket));
}
private static String deltaSubdir(long min, long max) {
return DELTA_PREFIX + String.format(DELTA_DIGITS, min) + "_" +
String.format(DELTA_DIGITS, max);
}
/**
* Create a filename for a bucket file.
* @param directory the partition directory
* @param options the options for writing the bucket
* @return the filename that should store the bucket
*/
public static Path createFilename(Path directory,
AcidOutputFormat.Options options) {
String subdir;
if (options.getOldStyle()) {
return new Path(directory, String.format(BUCKET_DIGITS,
options.getBucket()) + "_0");
} else if (options.isWritingBase()) {
subdir = BASE_PREFIX + String.format(DELTA_DIGITS,
options.getMaximumTransactionId());
} else {
subdir = deltaSubdir(options.getMinimumTransactionId(),
options.getMaximumTransactionId());
}
return createBucketFile(new Path(directory, subdir), options.getBucket());
}
/**
* Get the transaction id from a base directory name.
* @param path the base directory name
* @return the maximum transaction id that is included
*/
static long parseBase(Path path) {
String filename = path.getName();
if (filename.startsWith(BASE_PREFIX)) {
return Long.parseLong(filename.substring(BASE_PREFIX.length()));
}
throw new IllegalArgumentException(filename + " does not start with " +
BASE_PREFIX);
}
/**
* Parse a bucket filename back into the options that would have created
* the file.
* @param bucketFile the path to a bucket file
* @param conf the configuration
* @return the options used to create that filename
*/
public static AcidOutputFormat.Options
parseBaseBucketFilename(Path bucketFile,
Configuration conf) {
AcidOutputFormat.Options result = new AcidOutputFormat.Options(conf);
String filename = bucketFile.getName();
result.writingBase(true);
if (ORIGINAL_PATTERN.matcher(filename).matches()) {
int bucket =
Integer.parseInt(filename.substring(0, filename.indexOf('_')));
result
.setOldStyle(true)
.minimumTransactionId(0)
.maximumTransactionId(0)
.bucket(bucket);
} else if (filename.startsWith(BUCKET_PREFIX)) {
int bucket =
Integer.parseInt(filename.substring(filename.indexOf('_') + 1));
result
.setOldStyle(false)
.minimumTransactionId(0)
.maximumTransactionId(parseBase(bucketFile.getParent()))
.bucket(bucket);
} else {
result.setOldStyle(true).bucket(-1).minimumTransactionId(0)
.maximumTransactionId(0);
}
return result;
}
public enum Operation { NOT_ACID, INSERT, UPDATE, DELETE }
public static interface Directory {
/**
* Get the base directory.
* @return the base directory to read
*/
Path getBaseDirectory();
/**
* Get the list of original files.
* @return the list of original files (eg. 000000_0)
*/
List getOriginalFiles();
/**
* Get the list of base and delta directories that are valid and not
* obsolete.
* @return the minimal list of current directories
*/
List getCurrentDirectories();
/**
* Get the list of obsolete directories. After filtering out bases and
* deltas that are not selected by the valid transaction list, return the
* list of original files, bases, and deltas that have been replaced by
* more up to date ones.
*/
List getObsolete();
}
public static class ParsedDelta implements Comparable {
final long minTransaction;
final long maxTransaction;
final FileStatus path;
ParsedDelta(long min, long max, FileStatus path) {
this.minTransaction = min;
this.maxTransaction = max;
this.path = path;
}
public long getMinTransaction() {
return minTransaction;
}
public long getMaxTransaction() {
return maxTransaction;
}
public Path getPath() {
return path.getPath();
}
@Override
public int compareTo(ParsedDelta parsedDelta) {
if (minTransaction != parsedDelta.minTransaction) {
if (minTransaction < parsedDelta.minTransaction) {
return -1;
} else {
return 1;
}
} else if (maxTransaction != parsedDelta.maxTransaction) {
if (maxTransaction < parsedDelta.maxTransaction) {
return 1;
} else {
return -1;
}
} else {
return path.compareTo(parsedDelta.path);
}
}
}
/**
* Convert a list of deltas to a list of delta directories.
* @param deltas the list of deltas out of a Directory object.
* @return a list of delta directory paths that need to be read
*/
public static Path[] getPaths(List deltas) {
Path[] result = new Path[deltas.size()];
for(int i=0; i < result.length; ++i) {
result[i] = deltas.get(i).getPath();
}
return result;
}
/**
* Convert the list of deltas into an equivalent list of begin/end
* transaction id pairs.
* @param deltas
* @return the list of transaction ids to serialize
*/
public static List serializeDeltas(List deltas) {
List result = new ArrayList(deltas.size() * 2);
for(ParsedDelta delta: deltas) {
result.add(delta.minTransaction);
result.add(delta.maxTransaction);
}
return result;
}
/**
* Convert the list of begin/end transaction id pairs to a list of delta
* directories.
* @param root the root directory
* @param deltas list of begin/end transaction id pairs
* @return the list of delta paths
*/
public static Path[] deserializeDeltas(Path root, List deltas) {
int deltaSize = deltas.size() / 2;
Path[] result = new Path[deltaSize];
for(int i = 0; i < deltaSize; ++i) {
result[i] = new Path(root, deltaSubdir(deltas.get(i * 2),
deltas.get(i * 2 + 1)));
}
return result;
}
static ParsedDelta parseDelta(FileStatus path) {
String filename = path.getPath().getName();
if (filename.startsWith(DELTA_PREFIX)) {
String rest = filename.substring(DELTA_PREFIX.length());
int split = rest.indexOf('_');
long min = Long.parseLong(rest.substring(0, split));
long max = Long.parseLong(rest.substring(split + 1));
return new ParsedDelta(min, max, path);
}
throw new IllegalArgumentException(path + " does not start with " +
DELTA_PREFIX);
}
/**
* Is the given directory in ACID format?
* @param directory the partition directory to check
* @param conf the query configuration
* @return true, if it is an ACID directory
* @throws IOException
*/
public static boolean isAcid(Path directory,
Configuration conf) throws IOException {
FileSystem fs = directory.getFileSystem(conf);
for(FileStatus file: fs.listStatus(directory)) {
String filename = file.getPath().getName();
if (filename.startsWith(BASE_PREFIX) ||
filename.startsWith(DELTA_PREFIX)) {
if (file.isDir()) {
return true;
}
}
}
return false;
}
/**
* Get the ACID state of the given directory. It finds the minimal set of
* base and diff directories. Note that because major compactions don't
* preserve the history, we can't use a base directory that includes a
* transaction id that we must exclude.
* @param directory the partition directory to analyze
* @param conf the configuration
* @param txnList the list of transactions that we are reading
* @return the state of the directory
* @throws IOException
*/
public static Directory getAcidState(Path directory,
Configuration conf,
ValidTxnList txnList
) throws IOException {
FileSystem fs = directory.getFileSystem(conf);
FileStatus bestBase = null;
long bestBaseTxn = 0;
final List deltas = new ArrayList();
List working = new ArrayList();
List originalDirectories = new ArrayList();
final List obsolete = new ArrayList();
List children = SHIMS.listLocatedStatus(fs, directory,
hiddenFileFilter);
for(FileStatus child: children) {
Path p = child.getPath();
String fn = p.getName();
if (fn.startsWith(BASE_PREFIX) && child.isDir()) {
long txn = parseBase(p);
if (bestBase == null) {
bestBase = child;
bestBaseTxn = txn;
} else if (bestBaseTxn < txn) {
obsolete.add(bestBase);
bestBase = child;
bestBaseTxn = txn;
} else {
obsolete.add(child);
}
} else if (fn.startsWith(DELTA_PREFIX) && child.isDir()) {
ParsedDelta delta = parseDelta(child);
if (txnList.isTxnRangeCommitted(delta.minTransaction,
delta.maxTransaction) !=
ValidTxnList.RangeResponse.NONE) {
working.add(delta);
}
} else {
// This is just the directory. We need to recurse and find the actual files. But don't
// do this until we have determined there is no base. This saves time. Plus,
// it is possible that the cleaner is running and removing these original files,
// in which case recursing through them could cause us to get an error.
originalDirectories.add(child);
}
}
final List original = new ArrayList();
// if we have a base, the original files are obsolete.
if (bestBase != null) {
// remove the entries so we don't get confused later and think we should
// use them.
original.clear();
} else {
// Okay, we're going to need these originals. Recurse through them and figure out what we
// really need.
for (FileStatus origDir : originalDirectories) {
findOriginals(fs, origDir, original);
}
}
Collections.sort(working);
long current = bestBaseTxn;
for(ParsedDelta next: working) {
if (next.maxTransaction > current) {
// are any of the new transactions ones that we care about?
if (txnList.isTxnRangeCommitted(current+1, next.maxTransaction) !=
ValidTxnList.RangeResponse.NONE) {
deltas.add(next);
current = next.maxTransaction;
}
} else {
obsolete.add(next.path);
}
}
final Path base = bestBase == null ? null : bestBase.getPath();
LOG.debug("in directory " + directory.toUri().toString() + " base = " + base + " deltas = " +
deltas.size());
return new Directory(){
@Override
public Path getBaseDirectory() {
return base;
}
@Override
public List getOriginalFiles() {
return original;
}
@Override
public List getCurrentDirectories() {
return deltas;
}
@Override
public List getObsolete() {
return obsolete;
}
};
}
/**
* Find the original files (non-ACID layout) recursively under the partition
* directory.
* @param fs the file system
* @param stat the file/directory to add
* @param original the list of original files
* @throws IOException
*/
private static void findOriginals(FileSystem fs, FileStatus stat,
List original
) throws IOException {
if (stat.isDir()) {
for(FileStatus child: SHIMS.listLocatedStatus(fs, stat.getPath(), hiddenFileFilter)) {
findOriginals(fs, child, original);
}
} else {
original.add(stat);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy