
org.apache.hadoop.hive.ql.io.orc.RecordReaderFactory

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;

import com.google.common.collect.Lists;

/**
 * Factory to create ORC tree readers. It also compares the file schema with the schema specified
 * on read to see if type promotions are possible.
 */
public class RecordReaderFactory {
  static final Log LOG = LogFactory.getLog(RecordReaderFactory.class);
  private static final boolean isLogInfoEnabled = LOG.isInfoEnabled();

  public static TreeReaderFactory.TreeReader createTreeReader(int colId,
      Configuration conf,
      List<OrcProto.Type> fileSchema,
      boolean[] included,
      boolean skipCorrupt) throws IOException {
    final boolean isAcid = checkAcidSchema(fileSchema);
    final List<OrcProto.Type> originalFileSchema;
    if (isAcid) {
      originalFileSchema = fileSchema.subList(fileSchema.get(0).getSubtypesCount(),
          fileSchema.size());
    } else {
      originalFileSchema = fileSchema;
    }
    final int numCols = originalFileSchema.get(0).getSubtypesCount();
    List<OrcProto.Type> schemaOnRead = getSchemaOnRead(numCols, conf);
    List<OrcProto.Type> schemaUsed = getMatchingSchema(fileSchema, schemaOnRead);
    if (schemaUsed == null) {
      return TreeReaderFactory.createTreeReader(colId, fileSchema, included, skipCorrupt);
    } else {
      return ConversionTreeReaderFactory.createTreeReader(colId, schemaUsed, included, skipCorrupt);
    }
  }

  private static boolean checkAcidSchema(List<OrcProto.Type> fileSchema) {
    if (fileSchema.get(0).getKind().equals(OrcProto.Type.Kind.STRUCT)) {
      List<String> acidFields = OrcRecordUpdater.getAcidEventFields();
      List<String> rootFields = fileSchema.get(0).getFieldNamesList();
      if (acidFields.equals(rootFields)) {
        return true;
      }
    }
    return false;
  }

  private static List<OrcProto.Type> getMatchingSchema(List<OrcProto.Type> fileSchema,
      List<OrcProto.Type> schemaOnRead) {
    if (schemaOnRead == null) {
      if (isLogInfoEnabled) {
        LOG.info("Schema is not specified on read. Using file schema.");
      }
      return null;
    }

    if (fileSchema.size() != schemaOnRead.size()) {
      if (isLogInfoEnabled) {
        LOG.info("Schema on read column count does not match file schema's column count." +
            " Falling back to using file schema.");
      }
      return null;
    } else {
      List<OrcProto.Type> result = Lists.newArrayList(fileSchema);
      // Check type promotion. ORC can only support type promotions for integer types
      // (short -> int -> bigint), as the same integer readers are used for the above types.
      boolean canPromoteType = false;
      for (int i = 0; i < fileSchema.size(); i++) {
        OrcProto.Type fColType = fileSchema.get(i);
        OrcProto.Type rColType = schemaOnRead.get(i);
        if (!fColType.getKind().equals(rColType.getKind())) {
          if (fColType.getKind().equals(OrcProto.Type.Kind.SHORT)) {
            if (rColType.getKind().equals(OrcProto.Type.Kind.INT) ||
                rColType.getKind().equals(OrcProto.Type.Kind.LONG)) {
              // type promotion possible, converting SHORT to INT/LONG requested type
              result.set(i, result.get(i).toBuilder().setKind(rColType.getKind()).build());
              canPromoteType = true;
            } else {
              canPromoteType = false;
            }
          } else if (fColType.getKind().equals(OrcProto.Type.Kind.INT)) {
            if (rColType.getKind().equals(OrcProto.Type.Kind.LONG)) {
              // type promotion possible, converting INT to LONG requested type
              result.set(i, result.get(i).toBuilder().setKind(rColType.getKind()).build());
              canPromoteType = true;
            } else {
              canPromoteType = false;
            }
          } else {
            canPromoteType = false;
          }
        }
      }

      if (canPromoteType) {
        if (isLogInfoEnabled) {
          LOG.info("Integer type promotion happened in ORC record reader. Using promoted schema.");
        }
        return result;
      }
    }

    return null;
  }

  private static List<OrcProto.Type> getSchemaOnRead(int numCols, Configuration conf) {
    String columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
    final String columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
    if (columnTypeProperty == null || columnNameProperty == null) {
      return null;
    }

    ArrayList<String> columnNames = Lists.newArrayList(columnNameProperty.split(","));
    ArrayList<TypeInfo> fieldTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    StructTypeInfo structTypeInfo = new StructTypeInfo();
    // Column types from conf include virtual and partition columns at the end. We consider only
    // the actual columns in the file.
    structTypeInfo.setAllStructFieldNames(Lists.newArrayList(columnNames.subList(0, numCols)));
    structTypeInfo.setAllStructFieldTypeInfos(Lists.newArrayList(fieldTypes.subList(0, numCols)));
    ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(structTypeInfo);
    return getOrcTypes(oi);
  }

  private static List<OrcProto.Type> getOrcTypes(ObjectInspector inspector) {
    List<OrcProto.Type> result = Lists.newArrayList();
    getOrcTypesImpl(result, inspector);
    return result;
  }

  private static void getOrcTypesImpl(List<OrcProto.Type> result, ObjectInspector inspector) {
    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
    switch (inspector.getCategory()) {
      case PRIMITIVE:
        switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) {
          case BOOLEAN:
            type.setKind(OrcProto.Type.Kind.BOOLEAN);
            break;
          case BYTE:
            type.setKind(OrcProto.Type.Kind.BYTE);
            break;
          case SHORT:
            type.setKind(OrcProto.Type.Kind.SHORT);
            break;
          case INT:
            type.setKind(OrcProto.Type.Kind.INT);
            break;
          case LONG:
            type.setKind(OrcProto.Type.Kind.LONG);
            break;
          case FLOAT:
            type.setKind(OrcProto.Type.Kind.FLOAT);
            break;
          case DOUBLE:
            type.setKind(OrcProto.Type.Kind.DOUBLE);
            break;
          case STRING:
            type.setKind(OrcProto.Type.Kind.STRING);
            break;
          case CHAR:
            // The char length needs to be written to file and should be available
            // from the object inspector
            CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) inspector)
                .getTypeInfo();
            type.setKind(OrcProto.Type.Kind.CHAR);
            type.setMaximumLength(charTypeInfo.getLength());
            break;
          case VARCHAR:
            // The varchar length needs to be written to file and should be available
            // from the object inspector
            VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) inspector)
                .getTypeInfo();
            type.setKind(OrcProto.Type.Kind.VARCHAR);
            type.setMaximumLength(typeInfo.getLength());
            break;
          case BINARY:
            type.setKind(OrcProto.Type.Kind.BINARY);
            break;
          case TIMESTAMP:
            type.setKind(OrcProto.Type.Kind.TIMESTAMP);
            break;
          case DATE:
            type.setKind(OrcProto.Type.Kind.DATE);
            break;
          case DECIMAL:
            DecimalTypeInfo decTypeInfo = (DecimalTypeInfo) ((PrimitiveObjectInspector) inspector)
                .getTypeInfo();
            type.setKind(OrcProto.Type.Kind.DECIMAL);
            type.setPrecision(decTypeInfo.precision());
            type.setScale(decTypeInfo.scale());
            break;
          default:
            throw new IllegalArgumentException("Unknown primitive category: " +
                ((PrimitiveObjectInspector) inspector).getPrimitiveCategory());
        }
        result.add(type.build());
        break;
      case LIST:
        type.setKind(OrcProto.Type.Kind.LIST);
        result.add(type.build());
        getOrcTypesImpl(result, ((ListObjectInspector) inspector).getListElementObjectInspector());
        break;
      case MAP:
        type.setKind(OrcProto.Type.Kind.MAP);
        result.add(type.build());
        getOrcTypesImpl(result, ((MapObjectInspector) inspector).getMapKeyObjectInspector());
        getOrcTypesImpl(result, ((MapObjectInspector) inspector).getMapValueObjectInspector());
        break;
      case STRUCT:
        type.setKind(OrcProto.Type.Kind.STRUCT);
        result.add(type.build());
        for (StructField field : ((StructObjectInspector) inspector).getAllStructFieldRefs()) {
          getOrcTypesImpl(result, field.getFieldObjectInspector());
        }
        break;
      case UNION:
        type.setKind(OrcProto.Type.Kind.UNION);
        result.add(type.build());
        for (ObjectInspector oi : ((UnionObjectInspector) inspector).getObjectInspectors()) {
          getOrcTypesImpl(result, oi);
        }
        break;
      default:
        throw new IllegalArgumentException("Unknown category: " + inspector.getCategory());
    }
  }
}
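For context, here is a minimal, hypothetical sketch of how the promotion path above can be exercised. It hand-builds a two-entry file schema (a root STRUCT with one SHORT column) using the OrcProto builders, then requests bigint through the same serde properties that getSchemaOnRead() consults, so getMatchingSchema() promotes SHORT to LONG and createTreeReader() takes the conversion-reader branch. The class name PromotionSketch and the column name "id" are made up for illustration; in real use the file schema comes from the ORC file footer, not hand-built protobufs.

// PromotionSketch.java -- illustrative only, not part of Hive.
// Placed in the same package so OrcProto, TreeReaderFactory, and
// RecordReaderFactory resolve without extra imports.
package org.apache.hadoop.hive.ql.io.orc;

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;

import com.google.common.collect.Lists;

public class PromotionSketch {
  public static void main(String[] args) throws Exception {
    // File schema for struct<id:smallint>: entry 0 is the root STRUCT,
    // entry 1 is its single SHORT child (normally read from the file footer).
    OrcProto.Type root = OrcProto.Type.newBuilder()
        .setKind(OrcProto.Type.Kind.STRUCT)
        .addSubtypes(1)
        .addFieldNames("id")
        .build();
    OrcProto.Type id = OrcProto.Type.newBuilder()
        .setKind(OrcProto.Type.Kind.SHORT)
        .build();
    List<OrcProto.Type> fileSchema = Lists.newArrayList(root, id);

    // Schema on read: request bigint for the smallint column via the serde
    // properties that getSchemaOnRead() reads from the Configuration.
    Configuration conf = new Configuration();
    conf.set(serdeConstants.LIST_COLUMNS, "id");
    conf.set(serdeConstants.LIST_COLUMN_TYPES, "bigint");

    // SHORT -> LONG is a legal integer promotion, so this should resolve to a
    // ConversionTreeReaderFactory reader. Column 0 is the root struct; a null
    // 'included' array means all columns are read.
    TreeReaderFactory.TreeReader reader =
        RecordReaderFactory.createTreeReader(0, conf, fileSchema, null, false);
    System.out.println(reader.getClass().getName());
  }
}

Note the asymmetry the sketch relies on: promotion only widens (short to int or bigint, int to bigint) because the same integer tree readers serve all three widths; any other kind mismatch makes getMatchingSchema() fall back to the file schema.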




