
// hivemall.utils.hadoop.JsonSerdeUtils (source listing)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
// This file codes borrowed from
// - org.apache.hive.hcatalog.data.JsonSerDe
package hivemall.utils.hadoop;
import hivemall.utils.io.FastByteArrayInputStream;
import hivemall.utils.lang.Preconditions;
import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.Text;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
public final class JsonSerdeUtils {
/**
 * Serializes a Hive object as a JSON {@link Text}, emitting every struct field
 * (equivalent to calling {@link #serialize(Object, ObjectInspector, List)} with
 * a null column-name list).
 *
 * @param obj the Hive object to serialize; null serializes to the JSON literal {@code null}
 * @param oi the ObjectInspector describing {@code obj}
 * @return the JSON text representation
 * @throws SerDeException if the object graph contains an unsupported type
 */
@Nonnull
public static Text serialize(@Nullable final Object obj, @Nonnull final ObjectInspector oi)
        throws SerDeException {
    return serialize(obj, oi, null);
}
/**
 * Serializes a Hive object as a JSON {@link Text}.
 *
 * @param obj the Hive object to serialize; null serializes to the JSON literal {@code null}
 * @param oi the ObjectInspector describing {@code obj}
 * @param columnNames optional column names used when {@code obj} is a struct; see
 *        {@link #serializeStruct(StringBuilder, Object, StructObjectInspector, List)}
 * @return the JSON text representation
 * @throws SerDeException if {@code oi} has an unsupported category
 */
@Nonnull
public static Text serialize(@Nullable final Object obj, @Nonnull final ObjectInspector oi,
        @Nullable final List<String> columnNames) throws SerDeException {
    final StringBuilder sb = new StringBuilder();
    switch (oi.getCategory()) {
        case STRUCT:
            serializeStruct(sb, obj, (StructObjectInspector) oi, columnNames);
            break;
        case LIST:
            serializeList(sb, obj, (ListObjectInspector) oi);
            break;
        case MAP:
            serializeMap(sb, obj, (MapObjectInspector) oi);
            break;
        case PRIMITIVE:
            serializePrimitive(sb, obj, (PrimitiveObjectInspector) oi);
            break;
        default:
            throw new SerDeException("Unknown type in ObjectInspector: " + oi.getCategory());
    }
    return new Text(sb.toString());
}
/**
* Serialize Hive objects as Text.
*/
private static void serializeStruct(@Nonnull final StringBuilder sb, @Nullable final Object obj,
@Nonnull final StructObjectInspector soi, @Nullable final List columnNames)
throws SerDeException {
if (obj == null) {
sb.append("null");
} else {
final List extends StructField> structFields = soi.getAllStructFieldRefs();
sb.append(SerDeUtils.LBRACE);
if (columnNames == null) {
for (int i = 0, len = structFields.size(); i < len; i++) {
String colName = structFields.get(i).getFieldName();
if (i > 0) {
sb.append(SerDeUtils.COMMA);
}
appendWithQuotes(sb, colName);
sb.append(SerDeUtils.COLON);
buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)),
structFields.get(i).getFieldObjectInspector());
}
} else if (columnNames.size() == structFields.size()) {
for (int i = 0, len = structFields.size(); i < len; i++) {
if (i > 0) {
sb.append(SerDeUtils.COMMA);
}
String colName = columnNames.get(i);
appendWithQuotes(sb, colName);
sb.append(SerDeUtils.COLON);
buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)),
structFields.get(i).getFieldObjectInspector());
}
} else {
Collections.sort(columnNames);
final List found = new ArrayList<>(columnNames.size());
for (int i = 0, len = structFields.size(); i < len; i++) {
String colName = structFields.get(i).getFieldName();
if (Collections.binarySearch(columnNames, colName) < 0) {
continue;
}
if (!found.isEmpty()) {
sb.append(SerDeUtils.COMMA);
}
appendWithQuotes(sb, colName);
sb.append(SerDeUtils.COLON);
buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)),
structFields.get(i).getFieldObjectInspector());
found.add(colName);
}
if (found.size() != columnNames.size()) {
ArrayList expected = new ArrayList<>(columnNames);
expected.removeAll(found);
throw new SerDeException("Could not find some fields: " + expected);
}
}
sb.append(SerDeUtils.RBRACE);
}
}
/**
 * Appends the JSON array representation of a Hive list to {@code sb}.
 *
 * @param sb destination buffer
 * @param obj the list object; a null underlying list emits the JSON literal {@code null}
 * @param loi inspector for the list
 * @throws SerDeException if an element type is unsupported
 */
private static void serializeList(@Nonnull final StringBuilder sb, @Nullable final Object obj,
        @Nonnull final ListObjectInspector loi) throws SerDeException {
    final List<?> olist = loi.getList(obj);
    if (olist == null) {
        sb.append("null");
        return;
    }
    final ObjectInspector elemOI = loi.getListElementObjectInspector();
    sb.append(SerDeUtils.LBRACKET);
    for (int i = 0, len = olist.size(); i < len; i++) {
        if (i > 0) {
            sb.append(SerDeUtils.COMMA);
        }
        buildJSONString(sb, olist.get(i), elemOI);
    }
    sb.append(SerDeUtils.RBRACKET);
}
private static void serializeMap(@Nonnull final StringBuilder sb, @Nullable final Object obj,
@Nonnull final MapObjectInspector moi) throws SerDeException {
ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector();
ObjectInspector mapValueObjectInspector = moi.getMapValueObjectInspector();
Map, ?> omap = moi.getMap(obj);
if (omap == null) {
sb.append("null");
} else {
sb.append(SerDeUtils.LBRACE);
boolean first = true;
for (Object entry : omap.entrySet()) {
if (first) {
first = false;
} else {
sb.append(SerDeUtils.COMMA);
}
Map.Entry, ?> e = (Map.Entry, ?>) entry;
StringBuilder keyBuilder = new StringBuilder();
buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector);
String keyString = keyBuilder.toString().trim();
if ((!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE)) {
appendWithQuotes(sb, keyString);
} else {
sb.append(keyString);
}
sb.append(SerDeUtils.COLON);
buildJSONString(sb, e.getValue(), mapValueObjectInspector);
}
sb.append(SerDeUtils.RBRACE);
}
}
/**
 * Appends the JSON representation of a Hive primitive value to {@code sb}.
 *
 * <p>Booleans and numeric types (including decimal) are emitted bare; string, binary,
 * char/varchar, date and timestamp values are emitted as escaped, quoted JSON strings.</p>
 *
 * @param sb destination buffer
 * @param obj the primitive value, or null to emit the JSON literal {@code null}
 * @param poi inspector for the value (dereferenced whenever {@code obj} is non-null)
 * @throws SerDeException if the primitive category is not supported
 */
private static void serializePrimitive(@Nonnull final StringBuilder sb,
        @Nullable final Object obj, @Nonnull final PrimitiveObjectInspector poi)
        throws SerDeException {
    if (obj == null) {
        sb.append("null");
        return;
    }
    switch (poi.getPrimitiveCategory()) {
        case BOOLEAN: {
            boolean b = ((BooleanObjectInspector) poi).get(obj);
            sb.append(b ? "true" : "false");
            break;
        }
        case BYTE: {
            sb.append(((ByteObjectInspector) poi).get(obj));
            break;
        }
        case SHORT: {
            sb.append(((ShortObjectInspector) poi).get(obj));
            break;
        }
        case INT: {
            sb.append(((IntObjectInspector) poi).get(obj));
            break;
        }
        case LONG: {
            sb.append(((LongObjectInspector) poi).get(obj));
            break;
        }
        case FLOAT: {
            sb.append(((FloatObjectInspector) poi).get(obj));
            break;
        }
        case DOUBLE: {
            sb.append(((DoubleObjectInspector) poi).get(obj));
            break;
        }
        case STRING: {
            String s = SerDeUtils.escapeString(
                ((StringObjectInspector) poi).getPrimitiveJavaObject(obj));
            appendWithQuotes(sb, s);
            break;
        }
        case BINARY: {
            // Round-trip the raw bytes through Text to obtain a string form
            // (Text decodes bytes as UTF-8), then escape and quote it.
            byte[] b = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(obj);
            Text txt = new Text();
            txt.set(b, 0, b.length);
            appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString()));
            break;
        }
        case DATE: {
            Date d = ((DateObjectInspector) poi).getPrimitiveJavaObject(obj);
            appendWithQuotes(sb, d.toString());
            break;
        }
        case TIMESTAMP: {
            Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(obj);
            appendWithQuotes(sb, t.toString());
            break;
        }
        case DECIMAL: {
            // Decimals are emitted without quotes, like other numerics.
            sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(obj));
            break;
        }
        case VARCHAR: {
            String s = SerDeUtils.escapeString(
                ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(obj).toString());
            appendWithQuotes(sb, s);
            break;
        }
        case CHAR: {
            // This should use HiveChar.getPaddedValue() but it's protected; currently (v0.13)
            // HiveChar.toString() returns getPaddedValue().
            String s = SerDeUtils.escapeString(
                ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(obj).toString());
            appendWithQuotes(sb, s);
            break;
        }
        default:
            throw new SerDeException(
                "Unknown primitive type: " + poi.getPrimitiveCategory());
    }
}
/**
 * Dispatches JSON rendering of {@code obj} to the handler matching the inspector's
 * category; unions are rendered inline as {@code {tag:value}}.
 *
 * @param sb destination buffer
 * @param obj the Hive object; null emits the JSON literal {@code null}
 * @param oi inspector describing {@code obj}
 * @throws SerDeException if the category is unknown
 */
private static void buildJSONString(@Nonnull final StringBuilder sb, @Nullable final Object obj,
        @Nonnull final ObjectInspector oi) throws SerDeException {
    switch (oi.getCategory()) {
        case PRIMITIVE:
            serializePrimitive(sb, obj, (PrimitiveObjectInspector) oi);
            break;
        case LIST:
            serializeList(sb, obj, (ListObjectInspector) oi);
            break;
        case MAP:
            serializeMap(sb, obj, (MapObjectInspector) oi);
            break;
        case STRUCT:
            // No column projection/renaming at nested levels.
            serializeStruct(sb, obj, (StructObjectInspector) oi, null);
            break;
        case UNION:
            if (obj == null) {
                sb.append("null");
            } else {
                final UnionObjectInspector uoi = (UnionObjectInspector) oi;
                final byte tag = uoi.getTag(obj);
                sb.append(SerDeUtils.LBRACE);
                sb.append(tag);
                sb.append(SerDeUtils.COLON);
                buildJSONString(sb, uoi.getField(obj), uoi.getObjectInspectors().get(tag));
                sb.append(SerDeUtils.RBRACE);
            }
            break;
        default:
            throw new SerDeException("Unknown type in ObjectInspector: " + oi.getCategory());
    }
}
/**
 * Deserializes JSON text without an explicit schema; delegates with null column
 * names/types, so only inputs that do not require a schema (JSON primitives) parse.
 *
 * @param t the JSON text
 * @return the deserialized value
 * @throws SerDeException on malformed JSON or a schema-requiring input
 */
@Nonnull
public static <T> T deserialize(@Nonnull final Text t) throws SerDeException {
    return deserialize(t, null, null);
}
/**
 * Deserializes a JSON array or JSON primitive as the single given type.
 *
 * @param t the JSON text
 * @param columnTypes the type of the (single) value to deserialize
 * @return the deserialized value
 * @throws SerDeException on malformed JSON or a type mismatch
 */
@Nonnull
public static <T> T deserialize(@Nonnull final Text t, @Nonnull TypeInfo columnTypes)
        throws SerDeException {
    return deserialize(t, null, Arrays.asList(columnTypes));
}
/**
 * Deserializes JSON text: a JSON object is parsed as a struct using the given column
 * names/types, a JSON array as a list, and anything else as a primitive value.
 *
 * @param t the JSON text
 * @param columnNames struct field names (required only for JSON objects)
 * @param columnTypes field/element types (required for JSON objects and arrays)
 * @return the deserialized value, cast to the caller's expected type
 * @throws SerDeException on malformed JSON or schema mismatch
 */
@SuppressWarnings("unchecked")
@Nonnull
public static <T> T deserialize(@Nonnull final Text t, @Nullable final List<String> columnNames,
        @Nullable final List<TypeInfo> columnTypes) throws SerDeException {
    final Object result;
    try {
        final JsonParser p = new JsonFactory().createJsonParser(
            new FastByteArrayInputStream(t.getBytes(), t.getLength()));
        final JsonToken token = p.nextToken();
        if (token == JsonToken.START_OBJECT) {
            result = parseObject(p, columnNames, columnTypes);
        } else if (token == JsonToken.START_ARRAY) {
            result = parseArray(p, columnTypes);
        } else {
            result = parseValue(p);
        }
    } catch (IOException e) { // also covers JsonParseException (an IOException subclass)
        throw new SerDeException(e);
    }
    return (T) result;
}
@Nonnull
private static Object parseObject(@Nonnull final JsonParser p,
@CheckForNull final List columnNames,
@CheckForNull final List columnTypes)
throws JsonParseException, IOException, SerDeException {
Preconditions.checkNotNull(columnNames, "columnNames MUST NOT be null in parseObject",
SerDeException.class);
Preconditions.checkNotNull(columnTypes, "columnTypes MUST NOT be null in parseObject",
SerDeException.class);
if (columnNames.size() != columnTypes.size()) {
throw new SerDeException(
"Size of columnNames and columnTypes does not match. #columnNames="
+ columnNames.size() + ", #columnTypes=" + columnTypes.size());
}
TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
final HCatSchema schema;
try {
schema = HCatSchemaUtils.getHCatSchema(rowTypeInfo).get(0).getStructSubSchema();
} catch (HCatException e) {
throw new SerDeException(e);
}
final List
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy