// org.apache.hadoop.hive.ql.metadata.Partition (Maven / Gradle / Ivy)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.metadata;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputFormat;
/**
* A Hive Table Partition: is a fundamental storage unit within a Table.
*
* Please note that the ql code should always go through methods of this class to access the
* metadata, instead of directly accessing org.apache.hadoop.hive.metastore.api.Partition.
* This helps to isolate the metastore code and the ql code.
*/
public class Partition implements Serializable {
@SuppressWarnings("nls")
private static final Logger LOG = LoggerFactory
.getLogger("hive.ql.metadata.Partition");
private Table table;
private org.apache.hadoop.hive.metastore.api.Partition tPartition;
/**
* These fields are cached. The information comes from tPartition.
*/
private Deserializer deserializer;
private Class extends OutputFormat> outputFormatClass;
private Class extends InputFormat> inputFormatClass;
/**
* @return The values of the partition
* @see org.apache.hadoop.hive.metastore.api.Partition#getValues()
*/
public List getValues() {
return tPartition.getValues();
}
/**
* Used only for serialization.
*/
public Partition() {
}
/**
* create an empty partition.
* SemanticAnalyzer code requires that an empty partition when the table is not partitioned.
*/
public Partition(Table tbl) throws HiveException {
org.apache.hadoop.hive.metastore.api.Partition tPart =
new org.apache.hadoop.hive.metastore.api.Partition();
if (!tbl.isView()) {
tPart.setSd(tbl.getTTable().getSd().deepCopy());
}
initialize(tbl, tPart);
}
public Partition(Table tbl, org.apache.hadoop.hive.metastore.api.Partition tp)
throws HiveException {
initialize(tbl, tp);
}
/**
* Create partition object with the given info.
*
* @param tbl
* Table the partition will be in.
* @param partSpec
* Partition specifications.
* @param location
* Location of the partition, relative to the table.
* @throws HiveException
* Thrown if we could not create the partition.
*/
public Partition(Table tbl, Map partSpec, Path location) throws HiveException {
initialize(tbl, createMetaPartitionObject(tbl, partSpec, location));
}
public static org.apache.hadoop.hive.metastore.api.Partition createMetaPartitionObject(
Table tbl, Map partSpec, Path location) throws HiveException {
List pvals = new ArrayList();
for (FieldSchema field : tbl.getPartCols()) {
String val = partSpec.get(field.getName());
if (val == null || val.isEmpty()) {
throw new HiveException("partition spec is invalid; field "
+ field.getName() + " does not exist or is empty");
}
pvals.add(val);
}
org.apache.hadoop.hive.metastore.api.Partition tpart =
new org.apache.hadoop.hive.metastore.api.Partition();
tpart.setDbName(tbl.getDbName());
tpart.setTableName(tbl.getTableName());
tpart.setValues(pvals);
if (!tbl.isView()) {
tpart.setSd(tbl.getSd().deepCopy());
tpart.getSd().setLocation((location != null) ? location.toString() : null);
}
return tpart;
}
/**
* Initializes this object with the given variables
*
* @param table
* Table the partition belongs to
* @param tPartition
* Thrift Partition object
* @throws HiveException
* Thrown if we cannot initialize the partition
*/
protected void initialize(Table table,
org.apache.hadoop.hive.metastore.api.Partition tPartition) throws HiveException {
this.table = table;
setTPartition(tPartition);
if (table.isView()) {
return;
}
if (table.isPartitioned()) {
try {
if (tPartition.getSd().getLocation() == null) {
// set default if location is not set and this is a physical
// table partition (not a view partition)
if (table.getDataLocation() != null) {
Path partPath = new Path(table.getDataLocation(), Warehouse.makePartName(table.getPartCols(), tPartition.getValues()));
tPartition.getSd().setLocation(partPath.toString());
}
}
// set default if columns are not set
if (tPartition.getSd().getCols() == null) {
if (table.getCols() != null) {
tPartition.getSd().setCols(table.getCols());
}
}
} catch (MetaException e) {
throw new HiveException("Invalid partition for table " + table.getTableName(),
e);
}
}
// Note that we do not set up fields like inputFormatClass, outputFormatClass
// and deserializer because the Partition needs to be accessed from across
// the metastore side as well, which will result in attempting to load
// the class associated with them, which might not be available, and
// the main reason to instantiate them would be to pre-cache them for
// performance. Since those fields are null/cache-check by their accessors
// anyway, that's not a concern.
}
public String getName() {
try {
return Warehouse.makePartName(table.getPartCols(), tPartition.getValues());
} catch (MetaException e) {
throw new RuntimeException(e);
}
}
public Path[] getPath() {
Path[] ret = new Path[]{getDataLocation()};
return ret;
}
public Path getPartitionPath() {
return getDataLocation();
}
public Path getDataLocation() {
if (table.isPartitioned()) {
return new Path(tPartition.getSd().getLocation());
} else {
return new Path(table.getTTable().getSd().getLocation());
}
}
final public Deserializer getDeserializer() {
if (deserializer == null) {
try {
deserializer = MetaStoreUtils.getDeserializer(SessionState.getSessionConf(),
tPartition, table.getTTable());
} catch (MetaException e) {
throw new RuntimeException(e);
}
}
return deserializer;
}
public Properties getSchema() {
return MetaStoreUtils.getSchema(tPartition, table.getTTable());
}
public Properties getMetadataFromPartitionSchema() {
return MetaStoreUtils.getPartitionMetadata(tPartition, table.getTTable());
}
public Properties getSchemaFromTableSchema(Properties tblSchema) {
return MetaStoreUtils.getPartSchemaFromTableSchema(tPartition.getSd(), table.getTTable().getSd(),
tPartition.getParameters(), table.getDbName(), table.getTableName(), table.getPartitionKeys(),
tblSchema);
}
/**
* @param inputFormatClass
*/
public void setInputFormatClass(Class extends InputFormat> inputFormatClass) {
this.inputFormatClass = inputFormatClass;
tPartition.getSd().setInputFormat(inputFormatClass.getName());
}
/**
* @param outputFormatClass
*/
public void setOutputFormatClass(Class extends HiveOutputFormat> outputFormatClass) {
this.outputFormatClass = outputFormatClass;
tPartition.getSd().setOutputFormat(HiveFileFormatUtils
.getOutputFormatSubstitute(outputFormatClass).getName());
}
final public Class extends InputFormat> getInputFormatClass()
throws HiveException {
if (inputFormatClass == null) {
// sd can be null for views
String clsName = tPartition.getSd() == null ? null : tPartition.getSd().getInputFormat();
if (clsName == null) {
return inputFormatClass = table.getInputFormatClass();
}
try {
inputFormatClass = ((Class extends InputFormat>) Class.forName(clsName, true,
Utilities.getSessionSpecifiedClassLoader()));
} catch (ClassNotFoundException e) {
throw new HiveException("Class not found: " + clsName, e);
}
}
return inputFormatClass;
}
final public Class extends OutputFormat> getOutputFormatClass()
throws HiveException {
if (outputFormatClass == null) {
// sd can be null for views
String clsName = tPartition.getSd() == null ? null : tPartition.getSd().getOutputFormat();
if (clsName == null) {
return outputFormatClass = table.getOutputFormatClass();
}
try {
Class> c = Class.forName(clsName, true, Utilities.getSessionSpecifiedClassLoader());
// Replace FileOutputFormat for backward compatibility
outputFormatClass = HiveFileFormatUtils.getOutputFormatSubstitute(c);
} catch (ClassNotFoundException e) {
throw new HiveException("Class not found: " + clsName, e);
}
}
return outputFormatClass;
}
public int getBucketCount() {
return tPartition.getSd().getNumBuckets();
/*
* TODO: Keeping this code around for later use when we will support
* sampling on tables which are not created with CLUSTERED INTO clause
*
* // read from table meta data int numBuckets = this.table.getNumBuckets();
* if (numBuckets == -1) { // table meta data does not have bucket
* information // check if file system has multiple buckets(files) in this
* partition String pathPattern = this.partPath.toString() + "/*"; try {
* FileSystem fs = FileSystem.get(this.table.getDataLocation(),
* Hive.get().getConf()); FileStatus srcs[] = fs.globStatus(new
* Path(pathPattern), FileUtils.HIDDEN_FILES_PATH_FILTER); numBuckets = srcs.length; } catch (Exception e) {
* throw new RuntimeException("Cannot get bucket count for table " +
* this.table.getName(), e); } } return numBuckets;
*/
}
public void setBucketCount(int newBucketNum) {
tPartition.getSd().setNumBuckets(newBucketNum);
}
public List getBucketCols() {
return tPartition.getSd().getBucketCols();
}
public List getSortCols() {
return tPartition.getSd().getSortCols();
}
public List getSortColNames() {
return Utilities.getColumnNamesFromSortCols(getSortCols());
}
/**
* get all paths for this partition in a sorted manner
*/
@SuppressWarnings("nls")
public FileStatus[] getSortedPaths() {
try {
// Previously, this got the filesystem of the Table, which could be
// different from the filesystem of the partition.
FileSystem fs = getDataLocation().getFileSystem(SessionState.getSessionConf());
String pathPattern = getDataLocation().toString();
if (getBucketCount() > 0) {
pathPattern = pathPattern + "/*";
}
LOG.info("Path pattern = " + pathPattern);
FileStatus srcs[] = fs.globStatus(new Path(pathPattern), FileUtils.HIDDEN_FILES_PATH_FILTER);
Arrays.sort(srcs);
for (FileStatus src : srcs) {
LOG.info("Got file: " + src.getPath());
}
if (srcs.length == 0) {
return null;
}
return srcs;
} catch (Exception e) {
throw new RuntimeException("Cannot get path ", e);
}
}
/**
* mapping from bucket number to bucket path
*/
// TODO: add test case and clean it up
@SuppressWarnings("nls")
public Path getBucketPath(int bucketNum) {
FileStatus srcs[] = getSortedPaths();
if (srcs == null) {
return null;
}
return srcs[bucketNum].getPath();
}
@SuppressWarnings("nls")
public Path[] getPath(Sample s) throws HiveException {
if (s == null) {
return getPath();
} else {
int bcount = getBucketCount();
if (bcount == 0) {
return getPath();
}
Dimension d = s.getSampleDimension();
if (!d.getDimensionId().equals(table.getBucketingDimensionId())) {
// if the bucket dimension is not the same as the sampling dimension
// we must scan all the data
return getPath();
}
int scount = s.getSampleFraction();
ArrayList ret = new ArrayList();
if (bcount == scount) {
ret.add(getBucketPath(s.getSampleNum() - 1));
} else if (bcount < scount) {
if ((scount / bcount) * bcount != scount) {
throw new HiveException("Sample Count" + scount
+ " is not a multiple of bucket count " + bcount + " for table "
+ table.getTableName());
}
// undersampling a bucket
ret.add(getBucketPath((s.getSampleNum() - 1) % bcount));
} else if (bcount > scount) {
if ((bcount / scount) * scount != bcount) {
throw new HiveException("Sample Count" + scount
+ " is not a divisor of bucket count " + bcount + " for table "
+ table.getTableName());
}
// sampling multiple buckets
for (int i = 0; i < bcount / scount; i++) {
ret.add(getBucketPath(i * scount + (s.getSampleNum() - 1)));
}
}
return (ret.toArray(new Path[ret.size()]));
}
}
public LinkedHashMap getSpec() {
return table.createSpec(tPartition);
}
@SuppressWarnings("nls")
@Override
public String toString() {
String pn = "Invalid Partition";
try {
pn = Warehouse.makePartName(getSpec(), false);
} catch (MetaException e) {
// ignore as we most probably in an exception path already otherwise this
// error wouldn't occur
}
return table.toString() + "(" + pn + ")";
}
public Table getTable() {
return table;
}
/**
* Should be only used by serialization.
*/
public void setTable(Table table) {
this.table = table;
}
/**
* Should be only used by serialization.
*/
public org.apache.hadoop.hive.metastore.api.Partition getTPartition() {
return tPartition;
}
/**
* Should be only used by serialization.
*/
public void setTPartition(
org.apache.hadoop.hive.metastore.api.Partition partition) {
StringInternUtils.internStringsInList(partition.getValues());
tPartition = partition;
}
public Map getParameters() {
return tPartition.getParameters();
}
public List getCols() {
return getColsInternal(false);
}
public List getColsForMetastore() {
return getColsInternal(true);
}
private List getColsInternal(boolean forMs) {
try {
String serializationLib = tPartition.getSd().getSerdeInfo().getSerializationLib();
// Do the lightweight check for general case.
if (Table.hasMetastoreBasedSchema(SessionState.getSessionConf(), serializationLib)) {
return tPartition.getSd().getCols();
} else if (forMs && !Table.shouldStoreFieldsInMetastore(
SessionState.getSessionConf(), serializationLib, table.getParameters())) {
return Hive.getFieldsFromDeserializerForMsStorage(table, getDeserializer());
}
return MetaStoreUtils.getFieldsFromDeserializer(table.getTableName(), getDeserializer());
} catch (Exception e) {
LOG.error("Unable to get cols from serde: " +
tPartition.getSd().getSerdeInfo().getSerializationLib(), e);
}
return new ArrayList();
}
public String getLocation() {
if (tPartition.getSd() == null) {
return null;
} else {
return tPartition.getSd().getLocation();
}
}
public void setLocation(String location) {
tPartition.getSd().setLocation(location);
}
/**
* Set Partition's values
*
* @param partSpec
* Partition specifications.
* @throws HiveException
* Thrown if we could not create the partition.
*/
public void setValues(Map partSpec)
throws HiveException {
List pvals = new ArrayList();
for (FieldSchema field : table.getPartCols()) {
String val = partSpec.get(field.getName());
if (val == null) {
throw new HiveException(
"partition spec is invalid. field.getName() does not exist in input.");
}
pvals.add(val.intern());
}
tPartition.setValues(pvals);
}
/**
* @return include the db name
*/
public String getCompleteName() {
return getTable().getCompleteName() + "@" + getName();
}
public int getLastAccessTime() {
return tPartition.getLastAccessTime();
}
public void setLastAccessTime(int lastAccessTime) {
tPartition.setLastAccessTime(lastAccessTime);
}
public boolean isStoredAsSubDirectories() {
return tPartition.getSd().isStoredAsSubDirectories();
}
public List> getSkewedColValues(){
return tPartition.getSd().getSkewedInfo().getSkewedColValues();
}
public List getSkewedColNames() {
LOG.debug("sd is " + tPartition.getSd().getClass().getName());
return tPartition.getSd().getSkewedInfo().getSkewedColNames();
}
public void setSkewedValueLocationMap(List valList, String dirName)
throws HiveException {
Map, String> mappings = tPartition.getSd().getSkewedInfo()
.getSkewedColValueLocationMaps();
if (null == mappings) {
mappings = new HashMap, String>();
tPartition.getSd().getSkewedInfo().setSkewedColValueLocationMaps(mappings);
}
// Add or update new mapping
mappings.put(valList, dirName);
}
public Map, String> getSkewedColValueLocationMaps() {
return tPartition.getSd().getSkewedInfo().getSkewedColValueLocationMaps();
}
public void checkValidity() throws HiveException {
if (!tPartition.getSd().equals(table.getSd())) {
Table.validateColumns(getCols(), table.getPartCols());
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy