/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive.util;
import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableList;
import com.google.common.primitives.Shorts;
import com.google.common.primitives.SignedBytes;
import io.trino.plugin.hive.HdfsEnvironment;
import io.trino.plugin.hive.HdfsEnvironment.HdfsContext;
import io.trino.plugin.hive.HiveReadOnlyException;
import io.trino.plugin.hive.HiveTimestampPrecision;
import io.trino.plugin.hive.HiveType;
import io.trino.plugin.hive.avro.AvroRecordWriter;
import io.trino.plugin.hive.metastore.Database;
import io.trino.plugin.hive.metastore.Partition;
import io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore;
import io.trino.plugin.hive.metastore.Storage;
import io.trino.plugin.hive.metastore.Table;
import io.trino.plugin.hive.parquet.ParquetRecordWriter;
import io.trino.plugin.hive.rubix.CachingTrinoS3FileSystem;
import io.trino.plugin.hive.s3.TrinoS3FileSystem;
import io.trino.spi.Page;
import io.trino.spi.StandardErrorCode;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.SchemaNotFoundException;
import io.trino.spi.connector.SchemaTableName;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.CharType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.Int128;
import io.trino.spi.type.LongTimestamp;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.Type;
import io.trino.spi.type.VarcharType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FilterFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.viewfs.ViewFileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.ProtectMode;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.joda.time.DateTimeZone;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.io.BaseEncoding.base16;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_SERDE_NOT_FOUND;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_DATA_ERROR;
import static io.trino.plugin.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION;
import static io.trino.plugin.hive.HiveSessionProperties.getTemporaryStagingDirectoryPath;
import static io.trino.plugin.hive.metastore.MetastoreUtil.getProtectMode;
import static io.trino.plugin.hive.metastore.MetastoreUtil.verifyOnline;
import static io.trino.plugin.hive.s3.HiveS3Module.EMR_FS_CLASS_NAME;
import static io.trino.plugin.hive.util.HiveUtil.checkCondition;
import static io.trino.plugin.hive.util.HiveUtil.isArrayType;
import static io.trino.plugin.hive.util.HiveUtil.isMapType;
import static io.trino.plugin.hive.util.HiveUtil.isRowType;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.Chars.padSpaces;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND;
import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_SECOND;
import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND;
import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MICROSECOND;
import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.VarbinaryType.VARBINARY;
import static java.lang.Float.intBitsToFloat;
import static java.lang.Math.floorDiv;
import static java.lang.Math.floorMod;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Collections.unmodifiableList;
import static java.util.Collections.unmodifiableMap;
import static java.util.UUID.randomUUID;
import static java.util.stream.Collectors.toList;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT;
import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE;
import static org.apache.hadoop.hive.metastore.TableType.MATERIALIZED_VIEW;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBinaryObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBooleanObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableByteObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDateObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableHiveCharObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableIntObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableLongObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableShortObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableStringObjectInspector;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableTimestampObjectInspector;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getVarcharTypeInfo;
public final class HiveWriteUtils
{
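    // Utility class: only static members, never instantiated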
private HiveWriteUtils()
{
}
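
    /**
     * Creates a {@link RecordWriter} for the named output format without a text header writer.
     */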
public static RecordWriter createRecordWriter(Path target, JobConf conf, Properties properties, String outputFormatName, ConnectorSession session)
{
return createRecordWriter(target, conf, properties, outputFormatName, session, Optional.empty());
}
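
    /**
     * Creates a {@link RecordWriter} for the named output format. Parquet, text, sequence file and
     * Avro output formats get dedicated writers; any other {@link HiveOutputFormat} is instantiated
     * reflectively from its class name.
     */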
    public static RecordWriter createRecordWriter(Path target, JobConf conf, Properties properties, String outputFormatName, ConnectorSession session, Optional<TextHeaderWriter> textHeaderWriter)
{
try {
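            // COMPRESSRESULT determines whether the created writer compresses its output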
boolean compress = HiveConf.getBoolVar(conf, COMPRESSRESULT);
if (outputFormatName.equals(MapredParquetOutputFormat.class.getName())) {
return ParquetRecordWriter.create(target, conf, properties, session);
}
if (outputFormatName.equals(HiveIgnoreKeyTextOutputFormat.class.getName())) {
return new TextRecordWriter(target, conf, properties, compress, textHeaderWriter);
}
if (outputFormatName.equals(HiveSequenceFileOutputFormat.class.getName())) {
return new SequenceFileRecordWriter(target, conf, Text.class, compress);
}
if (outputFormatName.equals(AvroContainerOutputFormat.class.getName())) {
return new AvroRecordWriter(target, conf, compress, properties);
}
Object writer = Class.forName(outputFormatName).getConstructor().newInstance();
            return ((HiveOutputFormat<?, ?>) writer).getHiveRecordWriter(conf, target, Text.class, compress, properties, Reporter.NULL);
}
catch (IOException | ReflectiveOperationException e) {
throw new TrinoException(HIVE_WRITER_DATA_ERROR, e);
}
}
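
    /**
     * Instantiates and initializes a Hive {@link Serializer} by class name. A missing class is
     * reported as {@code HIVE_SERDE_NOT_FOUND}; construction or initialization failures are
     * reported as {@code HIVE_WRITER_DATA_ERROR}.
     */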
public static Serializer initializeSerializer(Configuration conf, Properties properties, String serializerName)
{
try {
Serializer result = (Serializer) Class.forName(serializerName).getConstructor().newInstance();
result.initialize(conf, properties);
return result;
}
catch (ClassNotFoundException e) {
throw new TrinoException(HIVE_SERDE_NOT_FOUND, "Serializer does not exist: " + serializerName);
}
catch (SerDeException | ReflectiveOperationException e) {
throw new TrinoException(HIVE_WRITER_DATA_ERROR, e);
}
}
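
    /**
     * Returns the Hive {@link ObjectInspector} for the given Trino {@link Type}, recursing into
     * array, map and row types.
     */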
public static ObjectInspector getJavaObjectInspector(Type type)
{
if (type.equals(BOOLEAN)) {
return javaBooleanObjectInspector;
}
if (type.equals(BIGINT)) {
return javaLongObjectInspector;
}
if (type.equals(INTEGER)) {
return javaIntObjectInspector;
}
if (type.equals(SMALLINT)) {
return javaShortObjectInspector;
}
if (type.equals(TINYINT)) {
return javaByteObjectInspector;
}
if (type.equals(REAL)) {
return javaFloatObjectInspector;
}
if (type.equals(DOUBLE)) {
return javaDoubleObjectInspector;
}
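        // Varchar and char values are materialized as Hadoop Text (see getField), so they use writable inspectors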
if (type instanceof VarcharType) {
return writableStringObjectInspector;
}
if (type instanceof CharType) {
return writableHiveCharObjectInspector;
}
if (type.equals(VARBINARY)) {
return javaByteArrayObjectInspector;
}
if (type.equals(DATE)) {
return javaDateObjectInspector;
}
if (type instanceof TimestampType) {
return javaTimestampObjectInspector;
}
if (type instanceof DecimalType) {
DecimalType decimalType = (DecimalType) type;
return getPrimitiveJavaObjectInspector(new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale()));
}
if (isArrayType(type)) {
return ObjectInspectorFactory.getStandardListObjectInspector(getJavaObjectInspector(type.getTypeParameters().get(0)));
}
if (isMapType(type)) {
ObjectInspector keyObjectInspector = getJavaObjectInspector(type.getTypeParameters().get(0));
ObjectInspector valueObjectInspector = getJavaObjectInspector(type.getTypeParameters().get(1));
return ObjectInspectorFactory.getStandardMapObjectInspector(keyObjectInspector, valueObjectInspector);
}
if (isRowType(type)) {
return ObjectInspectorFactory.getStandardStructObjectInspector(
type.getTypeSignature().getParameters().stream()
.map(parameter -> parameter.getNamedTypeSignature().getName().get())
.collect(toImmutableList()),
type.getTypeParameters().stream()
.map(HiveWriteUtils::getJavaObjectInspector)
.collect(toImmutableList()));
}
throw new IllegalArgumentException("unsupported type: " + type);
}
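
    /**
     * Converts the partition columns of one page row into Hive partition value strings. Nulls map
     * to the default dynamic partition value, and every value must consist solely of printable
     * ASCII characters (0x20 - 0x7E).
     */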
    public static List<String> createPartitionValues(List<Type> partitionColumnTypes, Page partitionColumns, int position)
{
        ImmutableList.Builder<String> partitionValues = ImmutableList.builder();
for (int field = 0; field < partitionColumns.getChannelCount(); field++) {
Object value = getField(DateTimeZone.UTC, partitionColumnTypes.get(field), partitionColumns.getBlock(field), position);
if (value == null) {
partitionValues.add(HIVE_DEFAULT_DYNAMIC_PARTITION);
}
else {
String valueString = value.toString();
if (!CharMatcher.inRange((char) 0x20, (char) 0x7E).matchesAllOf(valueString)) {
throw new TrinoException(HIVE_INVALID_PARTITION_VALUE,
"Hive partition keys can only contain printable ASCII characters (0x20 - 0x7E). Invalid value: " +
base16().withSeparator(" ", 2).encode(valueString.getBytes(UTF_8)));
}
partitionValues.add(valueString);
}
}
return partitionValues.build();
}
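
    /**
     * Converts the value at the given block position into the Java or Writable representation
     * expected by the corresponding Hive object inspector, interpreting timestamps in the supplied
     * local time zone.
     */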
public static Object getField(DateTimeZone localZone, Type type, Block block, int position)
{
if (block.isNull(position)) {
return null;
}
if (BOOLEAN.equals(type)) {
return type.getBoolean(block, position);
}
if (BIGINT.equals(type)) {
return type.getLong(block, position);
}
if (INTEGER.equals(type)) {
return toIntExact(type.getLong(block, position));
}
if (SMALLINT.equals(type)) {
return Shorts.checkedCast(type.getLong(block, position));
}
if (TINYINT.equals(type)) {
return SignedBytes.checkedCast(type.getLong(block, position));
}
if (REAL.equals(type)) {
return intBitsToFloat((int) type.getLong(block, position));
}
if (DOUBLE.equals(type)) {
return type.getDouble(block, position);
}
if (type instanceof VarcharType) {
return new Text(type.getSlice(block, position).getBytes());
}
if (type instanceof CharType) {
CharType charType = (CharType) type;
return new Text(padSpaces(type.getSlice(block, position), charType).toStringUtf8());
}
if (VARBINARY.equals(type)) {
return type.getSlice(block, position).getBytes();
}
if (DATE.equals(type)) {
return Date.ofEpochDay(toIntExact(type.getLong(block, position)));
}
if (type instanceof TimestampType) {
return getHiveTimestamp(localZone, (TimestampType) type, block, position);
}
if (type instanceof DecimalType) {
DecimalType decimalType = (DecimalType) type;
return getHiveDecimal(decimalType, block, position);
}
if (type instanceof ArrayType) {
Type elementType = ((ArrayType) type).getElementType();
Block arrayBlock = block.getObject(position, Block.class);
            List<Object> list = new ArrayList<>(arrayBlock.getPositionCount());
            for (int i = 0; i < arrayBlock.getPositionCount(); i++) {
                list.add(getField(localZone, elementType, arrayBlock, i));
            }
            return unmodifiableList(list);
        }
        if (type instanceof MapType) {
            MapType mapType = (MapType) type;
            Block mapBlock = block.getObject(position, Block.class);
            // map blocks interleave keys and values at alternating positions
            Map<Object, Object> map = new HashMap<>();
            for (int i = 0; i < mapBlock.getPositionCount(); i += 2) {
                map.put(
                        getField(localZone, mapType.getKeyType(), mapBlock, i),
                        getField(localZone, mapType.getValueType(), mapBlock, i + 1));
            }
            return unmodifiableMap(map);
        }
        if (type instanceof RowType) {
            List<Type> fieldTypes = type.getTypeParameters();
            Block rowBlock = block.getObject(position, Block.class);
            checkCondition(
                    fieldTypes.size() == rowBlock.getPositionCount(),
                    StandardErrorCode.GENERIC_INTERNAL_ERROR,
                    "Expected row value field count does not match type field count");
            List<Object> row = new ArrayList<>(rowBlock.getPositionCount());
            for (int i = 0; i < rowBlock.getPositionCount(); i++) {
                row.add(getField(localZone, fieldTypes.get(i), rowBlock, i));
            }
            return unmodifiableList(row);
        }
        throw new TrinoException(NOT_SUPPORTED, "unsupported type: " + type);
    }

    private static HiveDecimal getHiveDecimal(DecimalType decimalType, Block block, int position)
    {
        BigInteger unscaledValue;
        if (decimalType.isShort()) {
            unscaledValue = BigInteger.valueOf(decimalType.getLong(block, position));
        }
        else {
            unscaledValue = ((Int128) decimalType.getObject(block, position)).toBigInteger();
        }
        return HiveDecimal.create(unscaledValue, decimalType.getScale());
    }

    private static Timestamp getHiveTimestamp(DateTimeZone localZone, TimestampType type, Block block, int position)
    {
        verify(type.getPrecision() <= HiveTimestampPrecision.MAX.getPrecision(), "Timestamp precision too high for Hive");

        long epochMicros;
        int nanosOfMicro;
        if (type.isShort()) {
            epochMicros = type.getLong(block, position);
            nanosOfMicro = 0;
        }
        else {
            LongTimestamp timestamp = (LongTimestamp) type.getObject(block, position);
            epochMicros = timestamp.getEpochMicros();
            nanosOfMicro = timestamp.getPicosOfMicro() / PICOSECONDS_PER_NANOSECOND;
        }

        long epochSeconds;
        if (DateTimeZone.UTC.equals(localZone)) {
            // the value is already in UTC
            epochSeconds = floorDiv(epochMicros, MICROSECONDS_PER_SECOND);
        }
        else {
            // interpret the value as local time in the given zone and convert it to UTC
            long localEpochMillis = floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND);
            long utcEpochMillis = localZone.convertLocalToUTC(localEpochMillis, false);
            epochSeconds = floorDiv(utcEpochMillis, MILLISECONDS_PER_SECOND);
        }

        int microsOfSecond = floorMod(epochMicros, MICROSECONDS_PER_SECOND);
        int nanosOfSecond = microsOfSecond * NANOSECONDS_PER_MICROSECOND + nanosOfMicro;
        return Timestamp.ofEpochSecond(epochSeconds, nanosOfSecond);
    }
}