All downloads are free. The search and download features use the official Maven repository.

com.proofpoint.hive.serde.JsonSerde Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2011 Proofpoint, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.proofpoint.hive.serde;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Writable;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.map.ObjectMapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;

import static java.lang.String.format;
import static java.util.Arrays.asList;
import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import static org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getStructTypeInfo;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfosFromTypeString;

/**
 * Deserialization-only Hive SerDe that converts a JSON document into a Hive row.
 *
 * <p>The row layout is taken from the table's column-name and column-type
 * properties. Each record is parsed into a JSON tree and mapped onto an
 * {@code Object[]} with one slot per column; JSON fields that do not correspond
 * to a column are skipped and columns without a matching JSON field stay
 * {@code null}. Field names are matched after lower-casing.
 *
 * <p>When the table property {@code errors.ignore} is {@code true}, any
 * {@link SerDeException} raised while deserializing a record is suppressed and
 * the record deserializes to {@code null}.
 *
 * <p>Serialization is not supported.
 */
public class JsonSerde
        implements SerDe
{
    private final JsonFactory jsonFactory = new ObjectMapper().getJsonFactory();
    private ObjectInspector rowObjectInspector;
    protected StructTypeInfo rootTypeInfo;
    protected ColumnNameMap columnNameMap;
    private boolean ignoreErrors;

    /**
     * Reads the column names and types from the table properties and prepares
     * the row type information, object inspector and column-name lookup.
     *
     * @param configuration Hadoop configuration (not used by this method)
     * @param table table properties; must define the column names and column types
     * @throws SerDeException if either property is missing or empty, or if the
     *         number of column names differs from the number of column types
     */
    @Override
    public void initialize(Configuration configuration, Properties table)
            throws SerDeException
    {
        String columnNamesProperty = table.getProperty(Constants.LIST_COLUMNS);
        if ((columnNamesProperty == null) || columnNamesProperty.isEmpty()) {
            throw new SerDeException("table has no columns");
        }
        String columnTypesProperty = table.getProperty(Constants.LIST_COLUMN_TYPES);
        if ((columnTypesProperty == null) || columnTypesProperty.isEmpty()) {
            throw new SerDeException("table has no column types");
        }

        List<String> columnNames = asList(columnNamesProperty.split(","));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(columnTypesProperty);
        if (columnNames.size() != columnTypes.size()) {
            throw new SerDeException(format("columns size (%s) does not match column types size (%s)", columnNames.size(), columnTypes.size()));
        }

        rootTypeInfo = (StructTypeInfo) getStructTypeInfo(columnNames, columnTypes);
        rowObjectInspector = getStandardJavaObjectInspectorFromTypeInfo(rootTypeInfo);

        columnNameMap = new ColumnNameMap(rootTypeInfo);

        // Boolean.parseBoolean(null) is false, so errors are reported unless the property is "true"
        ignoreErrors = Boolean.parseBoolean(table.getProperty("errors.ignore"));
    }

    /**
     * Always fails: this SerDe cannot serialize.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Class<? extends Writable> getSerializedClass()
    {
        throw new UnsupportedOperationException("serialization not supported");
    }

    /**
     * Always fails: this SerDe cannot serialize.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Writable serialize(Object o, ObjectInspector objectInspector)
            throws SerDeException
    {
        throw new UnsupportedOperationException("serialization not supported");
    }

    /**
     * Deserializes one record into a row.
     *
     * @param writable the record; must be a {@link BinaryComparable}
     * @return the row as produced by {@link #buildStruct(JsonNode)}, or {@code null}
     *         if the record could not be deserialized and errors are ignored
     * @throws SerDeException if {@code writable} is {@code null} or not a
     *         {@link BinaryComparable} (regardless of the ignore-errors setting),
     *         or if deserialization fails and errors are not ignored
     */
    @Override
    public Object deserialize(Writable writable)
            throws SerDeException
    {
        if (!(writable instanceof BinaryComparable)) {
            // instanceof is false for null, so guard the getClass() call used in the message
            String actual = (writable == null) ? "null" : writable.getClass().getName();
            throw new SerDeException("expected BinaryComparable: " + actual);
        }

        try {
            return doDeserialize((BinaryComparable) writable);
        }
        catch (SerDeException e) {
            if (ignoreErrors) {
                return null;
            }
            throw e;
        }
    }

    /**
     * Parses the valid bytes of {@code binary} as JSON and converts the tree to a row.
     *
     * @throws SerDeException if the bytes are not parseable JSON, contain no JSON
     *         content at all, or cannot be mapped onto the row type
     */
    private Object doDeserialize(BinaryComparable binary)
            throws SerDeException
    {
        JsonNode tree;
        try {
            // only the first getLength() bytes of the backing array belong to this record
            JsonParser jsonParser = jsonFactory.createJsonParser(binary.getBytes(), 0, binary.getLength());
            try {
                tree = jsonParser.readValueAsTree();
            }
            finally {
                jsonParser.close();
            }
        }
        catch (IOException e) {
            throw new SerDeException("error parsing JSON", e);
        }

        // the parser may hand back no tree at all for an empty record; report that as a
        // SerDeException so it is subject to the ignore-errors setting like any other bad record
        if (tree == null) {
            throw new SerDeException("error parsing JSON: record contains no JSON content");
        }
        return buildStruct(tree);
    }

    /**
     * Returns the object inspector for rows produced by this SerDe, as created
     * by {@link #initialize(Configuration, Properties)}.
     */
    @Override
    public ObjectInspector getObjectInspector()
            throws SerDeException
    {
        return rowObjectInspector;
    }

    /**
     * Builds the row object for a parsed JSON tree. This implementation returns
     * the array produced by {@link #processFields(JsonNode)}.
     *
     * @param tree the parsed JSON document
     * @throws SerDeException if the tree cannot be mapped onto the row type
     */
    protected Object buildStruct(JsonNode tree)
            throws SerDeException
    {
        return processFields(tree);
    }

    /**
     * Maps the top-level fields of {@code tree} onto the table columns.
     *
     * @param tree the parsed JSON document
     * @return an array with one element per column, in column order
     * @throws SerDeException if a field value does not fit its column type
     */
    protected Object[] processFields(JsonNode tree)
            throws SerDeException
    {
        return getStructNodeValue(null, tree, rootTypeInfo);
    }

    /**
     * Converts {@code node} to the Java value for {@code typeInfo}, dispatching
     * on the type category. A JSON null always converts to {@code null}.
     *
     * @param columnName column path used in error messages
     */
    private Object getNodeValue(String columnName, JsonNode node, TypeInfo typeInfo)
            throws SerDeException
    {
        if (node.isNull()) {
            return null;
        }
        switch (typeInfo.getCategory()) {
            case LIST:
                return getListNodeValue(columnName, node, (ListTypeInfo) typeInfo);
            case MAP:
                return getMapNodeValue(columnName, node, (MapTypeInfo) typeInfo);
            case PRIMITIVE:
                return getPrimitiveNodeValue(columnName, node, (PrimitiveTypeInfo) typeInfo);
            case STRUCT:
                return getStructNodeValue(columnName, node, (StructTypeInfo) typeInfo);
            default:
                throw new SerDeException(format("unexpected type category (%s) for column: %s", typeInfo.getCategory(), columnName));
        }
    }

    /**
     * Converts a JSON array to a list whose elements are converted with the
     * list's element type.
     *
     * @throws SerDeException if {@code node} is not an array or an element
     *         cannot be converted
     */
    private Object getListNodeValue(String columnName, JsonNode node, ListTypeInfo typeInfo)
            throws SerDeException
    {
        if (!node.isArray()) {
            throw new SerDeException(format("expected list, found %s for column %s", node.getClass().getSimpleName(), columnName));
        }
        TypeInfo elementType = typeInfo.getListElementTypeInfo();
        List<Object> list = new ArrayList<Object>(node.size());
        for (JsonNode item : node) {
            list.add(getNodeValue(columnName, item, elementType));
        }
        return list;
    }

    /**
     * Converts a JSON object to a map with string keys whose values are
     * converted with the map's value type.
     *
     * @throws SerDeException if {@code node} is not an object, the declared key
     *         type is not STRING, a value cannot be converted, or inserting a
     *         key replaces a previous non-null value
     */
    private Object getMapNodeValue(String columnName, JsonNode node, MapTypeInfo typeInfo)
            throws SerDeException
    {
        if (!node.isObject()) {
            throw new SerDeException(format("expected map, found %s for column %s", node.getClass().getSimpleName(), columnName));
        }
        if (typeInfo.getMapKeyTypeInfo().getCategory() != Category.PRIMITIVE) {
            throw new SerDeException("map key is not a primitive: " + typeInfo.getMapKeyTypeInfo());
        }
        PrimitiveTypeInfo keyType = (PrimitiveTypeInfo) typeInfo.getMapKeyTypeInfo();
        if (keyType.getPrimitiveCategory() != PrimitiveCategory.STRING) {
            throw new SerDeException(format("expected STRING map key, found %s for column %s", keyType.getPrimitiveCategory(), columnName));
        }

        // CaseInsensitiveMap is declared elsewhere in the project and its type parameters
        // (if any) are not visible here, so it is instantiated exactly as before.
        @SuppressWarnings({"unchecked", "rawtypes"})
        Map<String, Object> map = new CaseInsensitiveMap();
        TypeInfo valueType = typeInfo.getMapValueTypeInfo();
        Iterator<Map.Entry<String, JsonNode>> fields = node.getFields();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> entry = fields.next();
            String key = entry.getKey();
            JsonNode value = entry.getValue();

            Object object = getNodeValue(columnName, value, valueType);
            // NOTE(review): a previous entry whose value is null is not detected by this check
            if (map.put(key, object) != null) {
                throw new SerDeException(format("column %s case-insensitive map already contains key: %s", columnName, key));
            }
        }
        return map;
    }

    /**
     * Converts a JSON value node to the boxed Java value for a primitive type.
     *
     * @throws SerDeException if {@code node} is not a value node, the target
     *         type is VOID, UNKNOWN or otherwise unhandled, or the number does
     *         not fit a BYTE, SHORT or INT column
     */
    private Object getPrimitiveNodeValue(String columnName, JsonNode node, PrimitiveTypeInfo typeInfo)
            throws SerDeException
    {
        if (!node.isValueNode()) {
            throw new SerDeException(format("expected primitive, found %s for column %s", node.getClass().getSimpleName(), columnName));
        }
        PrimitiveCategory category = typeInfo.getPrimitiveCategory();
        switch (category) {
            case VOID:
                throw new SerDeException("cannot deserialize to VOID type for column " + columnName);
            case UNKNOWN:
                throw new SerDeException("cannot deserialize to UNKNOWN type for column " + columnName);
            case BOOLEAN:
                return node.getBooleanValue();
            case BYTE:
                return (byte) getIntegralValue(columnName, node, category, Byte.MIN_VALUE, Byte.MAX_VALUE);
            case SHORT:
                return (short) getIntegralValue(columnName, node, category, Short.MIN_VALUE, Short.MAX_VALUE);
            case INT:
                return (int) getIntegralValue(columnName, node, category, Integer.MIN_VALUE, Integer.MAX_VALUE);
            case LONG:
                return node.getLongValue();
            case FLOAT:
                return (float) node.getDoubleValue();
            case DOUBLE:
                return node.getDoubleValue();
            case STRING:
                return node.getTextValue();
            default:
                throw new SerDeException("unhandled primitive type: " + category);
        }
    }

    /**
     * Reads {@code node} as a long and verifies it lies in {@code [min, max]},
     * so that the caller's narrowing cast cannot silently change the value.
     *
     * @throws SerDeException if the value is outside the range
     */
    private static long getIntegralValue(String columnName, JsonNode node, PrimitiveCategory category, long min, long max)
            throws SerDeException
    {
        long value = node.getLongValue();
        if ((value < min) || (value > max)) {
            throw new SerDeException(format("value %s is out of range for %s column %s", value, category, columnName));
        }
        return value;
    }

    /**
     * Maps the fields of {@code node} onto the fields of a struct type.
     *
     * <p>NOTE(review): unlike lists and maps, the node is not required to be a
     * JSON object; a node that exposes no fields yields an all-null struct.
     *
     * @param columnName path of the enclosing column, or {@code null} for the row itself
     * @return an array with one element per struct field; fields absent from
     *         {@code node} are {@code null}
     * @throws SerDeException if a field value does not fit its declared type
     */
    private Object[] getStructNodeValue(String columnName, JsonNode node, StructTypeInfo typeInfo)
            throws SerDeException
    {
        // presumably maps lower-case field name to field position (ColumnNameMap is declared elsewhere)
        Map<String, Integer> columnNames = columnNameMap.getColumnNames(typeInfo);
        List<TypeInfo> fieldTypes = typeInfo.getAllStructFieldTypeInfos();

        Object[] struct = new Object[fieldTypes.size()];
        Iterator<Map.Entry<String, JsonNode>> fields = node.getFields();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> entry = fields.next();
            // Locale.ROOT keeps the lower-casing independent of the JVM default locale
            // (e.g. "ID" must become "id" even under a Turkish locale)
            String key = entry.getKey().toLowerCase(Locale.ROOT);
            JsonNode value = entry.getValue();

            Integer columnIndex = columnNames.get(key);
            if (columnIndex != null) {
                String structColumn = (columnName == null) ? key : (columnName + "." + key);
                struct[columnIndex] = getNodeValue(structColumn, value, fieldTypes.get(columnIndex));
            }
        }
        return struct;
    }
}