// org.apache.hadoop.hive.hbase.HBaseSerDeHelper Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.hbase;
import static org.apache.hadoop.hive.hbase.HBaseSerDeParameters.AVRO_SERIALIZATION_TYPE;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.reflect.ReflectData;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.avro.AvroObjectInspectorGenerator;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyObjectBase;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.util.StringUtils;
/**
* Helper class for {@link HBaseSerDe}
* */
public class HBaseSerDeHelper {
/**
* Logger
* */
public static final Log LOG = LogFactory.getLog(HBaseSerDeHelper.class);
/**
* Autogenerates the columns from the given serialization class
*
* @param tbl the hive table properties
* @param columnsMapping the hbase columns mapping determining hbase column families and
* qualifiers
* @param sb StringBuilder to form the list of columns
* @throws IllegalArgumentException if any of the given arguments was null
* */
public static void generateColumns(Properties tbl, List columnsMapping,
StringBuilder sb) {
// Generate the columns according to the column mapping provided
// Note: The generated column names are same as the
// family_name.qualifier_name. If the qualifier
// name is null, each column is familyname_col[i] where i is the index of
// the column ranging
// from 0 to n-1 where n is the size of the column mapping. The filter
// function removes any
// special characters other than alphabets and numbers from the column
// family and qualifier name
// as the only special character allowed in a column name is "_" which is
// used as a separator
// between the column family and qualifier name.
if (columnsMapping == null) {
throw new IllegalArgumentException("columnsMapping cannot be null");
}
if (sb == null) {
throw new IllegalArgumentException("StringBuilder cannot be null");
}
for (int i = 0; i < columnsMapping.size(); i++) {
ColumnMapping colMap = columnsMapping.get(i);
if (colMap.hbaseRowKey) {
sb.append("key").append(StringUtils.COMMA_STR);
} else if (colMap.qualifierName == null) {
// this corresponds to a map
if (colMap.qualifierPrefix != null) {
sb.append(filter(colMap.familyName)).append("_")
.append(filter(colMap.qualifierPrefix) + i).append(StringUtils.COMMA_STR);
} else {
sb.append(filter(colMap.familyName)).append("_").append("col" + i)
.append(StringUtils.COMMA_STR);
}
} else {
// just an individual column
sb.append(filter(colMap.familyName)).append("_").append(filter(colMap.qualifierName))
.append(StringUtils.COMMA_STR);
}
}
// trim off the ending ",", if any
trim(sb);
if (LOG.isDebugEnabled()) {
LOG.debug("Generated columns: [" + sb.toString() + "]");
}
}
/**
* Autogenerates the column types from the given serialization class
*
* @param tbl the hive table properties
* @param columnsMapping the hbase columns mapping determining hbase column families and
* qualifiers
* @param sb StringBuilder to form the list of columns
* @param conf configuration
* @throws IllegalArgumentException if any of the given arguments was null
* @throws SerDeException if there was an error generating the column types
* */
public static void generateColumnTypes(Properties tbl, List columnsMapping,
StringBuilder sb, Configuration conf) throws SerDeException {
if (tbl == null) {
throw new IllegalArgumentException("tbl cannot be null");
}
if (columnsMapping == null) {
throw new IllegalArgumentException("columnsMapping cannot be null");
}
if (sb == null) {
throw new IllegalArgumentException("StringBuilder cannot be null");
}
// Generate the columns according to the column mapping provided
for (int i = 0; i < columnsMapping.size(); i++) {
if (sb.length() > 0) {
sb.append(":");
}
ColumnMapping colMap = columnsMapping.get(i);
if (colMap.hbaseRowKey) {
Map compositeKeyParts = getCompositeKeyParts(tbl);
StringBuilder keyStruct = new StringBuilder();
if (compositeKeyParts == null || compositeKeyParts.isEmpty()) {
String compKeyClass = tbl.getProperty(HBaseSerDe.HBASE_COMPOSITE_KEY_CLASS);
String compKeyTypes = tbl.getProperty(HBaseSerDe.HBASE_COMPOSITE_KEY_TYPES);
if (compKeyTypes == null) {
if (compKeyClass != null) {
// a composite key class was provided. But neither the types
// property was set and
// neither the getParts() method of HBaseCompositeKey was
// overidden in the
// implementation. Flag exception.
throw new SerDeException(
"Either the hbase.composite.key.types property should be set or the getParts method must be overridden in "
+ compKeyClass);
}
// the row key column becomes a STRING
sb.append(serdeConstants.STRING_TYPE_NAME);
} else {
generateKeyStruct(compKeyTypes, keyStruct);
}
} else {
generateKeyStruct(compositeKeyParts, keyStruct);
}
sb.append(keyStruct);
} else if (colMap.qualifierName == null) {
String serClassName = null;
String serType = null;
String schemaLiteral = null;
String schemaUrl = null;
if (colMap.qualifierPrefix != null) {
serType =
tbl.getProperty(colMap.familyName + "." + colMap.qualifierPrefix + "."
+ HBaseSerDe.SERIALIZATION_TYPE);
if (serType == null) {
throw new SerDeException(HBaseSerDe.SERIALIZATION_TYPE
+ " property not provided for column family [" + colMap.familyName
+ "] and prefix [" + colMap.qualifierPrefix + "]");
}
// we are provided with a prefix
serClassName =
tbl.getProperty(colMap.familyName + "." + colMap.qualifierPrefix + "."
+ serdeConstants.SERIALIZATION_CLASS);
if (serClassName == null) {
if (serType.equalsIgnoreCase(HBaseSerDeParameters.AVRO_SERIALIZATION_TYPE)) {
// for avro type, the serialization class parameter is optional
schemaLiteral =
tbl.getProperty(colMap.familyName + "." + colMap.qualifierPrefix + "."
+ AvroSerdeUtils.SCHEMA_LITERAL);
schemaUrl =
tbl.getProperty(colMap.familyName + "." + colMap.qualifierPrefix + "."
+ AvroSerdeUtils.SCHEMA_URL);
if (schemaLiteral == null && schemaUrl == null) {
// either schema literal, schema url or serialization class must
// be provided
throw new SerDeException("For an avro schema, either "
+ AvroSerdeUtils.SCHEMA_LITERAL + ", " + AvroSerdeUtils.SCHEMA_URL + " or "
+ serdeConstants.SERIALIZATION_CLASS + " property must be set.");
}
if (schemaUrl != null) {
schemaLiteral = getSchemaFromFS(schemaUrl, conf).toString();
}
} else {
throw new SerDeException(serdeConstants.SERIALIZATION_CLASS
+ " property not provided for column family [" + colMap.familyName
+ "] and prefix [" + colMap.qualifierPrefix + "]");
}
}
} else {
serType = tbl.getProperty(colMap.familyName + "." + HBaseSerDe.SERIALIZATION_TYPE);
if (serType == null) {
throw new SerDeException(HBaseSerDe.SERIALIZATION_TYPE
+ " property not provided for column family [" + colMap.familyName + "]");
}
serClassName =
tbl.getProperty(colMap.familyName + "." + serdeConstants.SERIALIZATION_CLASS);
if (serClassName == null) {
if (serType.equalsIgnoreCase(AVRO_SERIALIZATION_TYPE)) {
// for avro type, the serialization class parameter is optional
schemaLiteral =
tbl.getProperty(colMap.familyName + "." + AvroSerdeUtils.SCHEMA_LITERAL);
schemaUrl = tbl.getProperty(colMap.familyName + "." + AvroSerdeUtils.SCHEMA_URL);
if (schemaLiteral == null && schemaUrl == null) {
// either schema literal or serialization class must be provided
throw new SerDeException("For an avro schema, either "
+ AvroSerdeUtils.SCHEMA_LITERAL + " property or "
+ serdeConstants.SERIALIZATION_CLASS + " property must be set.");
}
if (schemaUrl != null) {
schemaLiteral = getSchemaFromFS(schemaUrl, conf).toString();
}
} else {
throw new SerDeException(serdeConstants.SERIALIZATION_CLASS
+ " property not provided for column family [" + colMap.familyName + "]");
}
}
}
StringBuilder generatedStruct = new StringBuilder();
// generate struct for each of the given prefixes
generateColumnStruct(serType, serClassName, schemaLiteral, colMap, generatedStruct);
// a column family becomes a MAP
sb.append(serdeConstants.MAP_TYPE_NAME + "<" + serdeConstants.STRING_TYPE_NAME + ","
+ generatedStruct + ">");
} else {
String qualifierName = colMap.qualifierName;
if (colMap.qualifierName.endsWith("*")) {
// we are provided with a prefix
qualifierName = colMap.qualifierName.substring(0, colMap.qualifierName.length() - 1);
}
String serType =
tbl.getProperty(colMap.familyName + "." + qualifierName + "."
+ HBaseSerDe.SERIALIZATION_TYPE);
if (serType == null) {
throw new SerDeException(HBaseSerDe.SERIALIZATION_TYPE
+ " property not provided for column family [" + colMap.familyName
+ "] and qualifier [" + qualifierName + "]");
}
String serClassName =
tbl.getProperty(colMap.familyName + "." + qualifierName + "."
+ serdeConstants.SERIALIZATION_CLASS);
String schemaLiteral = null;
String schemaUrl = null;
if (serClassName == null) {
if (serType.equalsIgnoreCase(AVRO_SERIALIZATION_TYPE)) {
// for avro type, the serialization class parameter is optional
schemaLiteral =
tbl.getProperty(colMap.familyName + "." + qualifierName + "."
+ AvroSerdeUtils.SCHEMA_LITERAL);
schemaUrl =
tbl.getProperty(colMap.familyName + "." + qualifierName + "."
+ AvroSerdeUtils.SCHEMA_URL);
if (schemaLiteral == null && schemaUrl == null) {
// either schema literal, schema url or serialization class must
// be provided
throw new SerDeException("For an avro schema, either "
+ AvroSerdeUtils.SCHEMA_LITERAL + ", " + AvroSerdeUtils.SCHEMA_URL + " or "
+ serdeConstants.SERIALIZATION_CLASS + " property must be set.");
}
if (schemaUrl != null) {
schemaLiteral = getSchemaFromFS(schemaUrl, conf).toString();
}
} else {
throw new SerDeException(serdeConstants.SERIALIZATION_CLASS
+ " property not provided for column family [" + colMap.familyName
+ "] and qualifier [" + qualifierName + "]");
}
}
StringBuilder generatedStruct = new StringBuilder();
generateColumnStruct(serType, serClassName, schemaLiteral, colMap, generatedStruct);
sb.append(generatedStruct);
}
}
// trim off ending ",", if any
trim(sb);
if (LOG.isDebugEnabled()) {
LOG.debug("Generated column types: [" + sb.toString() + "]");
}
}
/**
* Read the schema from the given hdfs url for the schema
* */
public static Schema getSchemaFromFS(String schemaFSUrl, Configuration conf)
throws SerDeException {
FSDataInputStream in = null;
FileSystem fs = null;
try {
fs = FileSystem.get(new URI(schemaFSUrl), conf);
in = fs.open(new Path(schemaFSUrl));
Schema s = Schema.parse(in);
return s;
} catch (URISyntaxException e) {
throw new SerDeException("Failure reading schema from filesystem", e);
} catch (IOException e) {
throw new SerDeException("Failure reading schema from filesystem", e);
} finally {
IOUtils.closeQuietly(in);
}
}
/**
* Create the {@link LazyObjectBase lazy field}
* */
public static LazyObjectBase createLazyField(ColumnMapping[] columnMappings, int fieldID,
ObjectInspector inspector) {
ColumnMapping colMap = columnMappings[fieldID];
if (colMap.getQualifierName() == null && !colMap.isHbaseRowKey()) {
// a column family
return new LazyHBaseCellMap((LazyMapObjectInspector) inspector);
}
return LazyFactory.createLazyObject(inspector, colMap.getBinaryStorage().get(0));
}
/**
* Auto-generates the key struct for composite keys
*
* @param compositeKeyParts map of composite key part name to its type. Usually this would be
* provided by the custom implementation of {@link HBaseCompositeKey composite key}
* @param sb StringBuilder object to construct the struct
* */
private static void generateKeyStruct(Map compositeKeyParts, StringBuilder sb) {
sb.append("struct<");
for (Entry entry : compositeKeyParts.entrySet()) {
sb.append(entry.getKey()).append(":").append(entry.getValue()).append(",");
}
// trim the trailing ","
trim(sb);
sb.append(">");
}
/**
* Auto-generates the key struct for composite keys
*
* @param compositeKeyTypes comma separated list of composite key types in order
* @param sb StringBuilder object to construct the struct
* */
private static void generateKeyStruct(String compositeKeyTypes, StringBuilder sb) {
sb.append("struct<");
// composite key types is a comma separated list of different parts of the
// composite keys in
// order in which they appear in the key
String[] keyTypes = compositeKeyTypes.split(",");
for (int i = 0; i < keyTypes.length; i++) {
sb.append("col" + i).append(":").append(keyTypes[i]).append(StringUtils.COMMA_STR);
}
// trim the trailing ","
trim(sb);
sb.append(">");
}
/**
* Auto-generates the column struct
*
* @param serType serialization type
* @param serClassName serialization class name
* @param schemaLiteral schema string
* @param colMap hbase column mapping
* @param sb StringBuilder to hold the generated struct
* @throws SerDeException if something goes wrong while generating the struct
* */
private static void generateColumnStruct(String serType, String serClassName,
String schemaLiteral, ColumnMapping colMap, StringBuilder sb) throws SerDeException {
if (serType.equalsIgnoreCase(AVRO_SERIALIZATION_TYPE)) {
if (serClassName != null) {
generateAvroStructFromClass(serClassName, sb);
} else {
generateAvroStructFromSchema(schemaLiteral, sb);
}
} else {
throw new SerDeException("Unknown " + HBaseSerDe.SERIALIZATION_TYPE
+ " found for column family [" + colMap.familyName + "]");
}
}
/**
* Auto-generate the avro struct from class
*
* @param serClassName serialization class for avro struct
* @param sb StringBuilder to hold the generated struct
* @throws SerDeException if something goes wrong while generating the struct
* */
private static void generateAvroStructFromClass(String serClassName, StringBuilder sb)
throws SerDeException {
Class> serClass;
try {
serClass = JavaUtils.loadClass(serClassName);
} catch (ClassNotFoundException e) {
throw new SerDeException("Error obtaining descriptor for " + serClassName, e);
}
Schema schema = ReflectData.get().getSchema(serClass);
generateAvroStructFromSchema(schema, sb);
}
/**
* Auto-generate the avro struct from schema
*
* @param schemaLiteral schema for the avro struct as string
* @param sb StringBuilder to hold the generated struct
* @throws SerDeException if something goes wrong while generating the struct
* */
private static void generateAvroStructFromSchema(String schemaLiteral, StringBuilder sb)
throws SerDeException {
Schema schema = Schema.parse(schemaLiteral);
generateAvroStructFromSchema(schema, sb);
}
/**
* Auto-generate the avro struct from schema
*
* @param schema schema for the avro struct
* @param sb StringBuilder to hold the generated struct
* @throws SerDeException if something goes wrong while generating the struct
* */
private static void generateAvroStructFromSchema(Schema schema, StringBuilder sb)
throws SerDeException {
AvroObjectInspectorGenerator avig = new AvroObjectInspectorGenerator(schema);
sb.append("struct<");
// Get the column names and their corresponding types
List columnNames = avig.getColumnNames();
List columnTypes = avig.getColumnTypes();
if (columnNames.size() != columnTypes.size()) {
throw new AssertionError("The number of column names should be the same as column types");
}
for (int i = 0; i < columnNames.size(); i++) {
sb.append(columnNames.get(i));
sb.append(":");
sb.append(columnTypes.get(i).getTypeName());
sb.append(",");
}
trim(sb).append(">");
}
/**
* Trims by removing the trailing "," if any
*
* @param sb StringBuilder to trim
* @return StringBuilder trimmed StringBuilder
* */
private static StringBuilder trim(StringBuilder sb) {
if (sb.charAt(sb.length() - 1) == StringUtils.COMMA) {
return sb.deleteCharAt(sb.length() - 1);
}
return sb;
}
/**
* Filters the given name by removing any special character and convert to lowercase
* */
private static String filter(String name) {
return name.replaceAll("[^a-zA-Z0-9]+", "").toLowerCase();
}
/**
* Return the types for the composite key.
*
* @param tbl Properties for the table
* @return a comma-separated list of composite key types
* @throws SerDeException if something goes wrong while getting the composite key parts
* */
@SuppressWarnings("unchecked")
private static Map getCompositeKeyParts(Properties tbl) throws SerDeException {
String compKeyClassName = tbl.getProperty(HBaseSerDe.HBASE_COMPOSITE_KEY_CLASS);
if (compKeyClassName == null) {
// no custom composite key class provided. return null
return null;
}
CompositeHBaseKeyFactory keyFactory = null;
Class> keyClass;
try {
keyClass = JavaUtils.loadClass(compKeyClassName);
keyFactory = new CompositeHBaseKeyFactory(keyClass);
} catch (Exception e) {
throw new SerDeException(e);
}
HBaseCompositeKey compKey = keyFactory.createKey(null);
return compKey.getParts();
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy