All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hcatalog.data.JsonSerDe Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hcatalog.data;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hcatalog.common.HCatException;
import org.apache.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.hcatalog.data.schema.HCatSchema;
import org.apache.hcatalog.data.schema.HCatSchemaUtils;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @deprecated Use/modify {@link org.apache.hive.hcatalog.data.JsonSerDe} instead
 */
public class JsonSerDe implements SerDe {

  private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class);
  private List columnNames;
  private List columnTypes;

  private StructTypeInfo rowTypeInfo;
  private HCatSchema schema;

  private JsonFactory jsonFactory = null;

  private HCatRecordObjectInspector cachedObjectInspector;

  @Override
  public void initialize(Configuration conf, Properties tbl)
    throws SerDeException {


    LOG.debug("Initializing JsonSerDe");
    LOG.debug("props to serde: {}", tbl.entrySet());


    // Get column names and types
    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);

    // all table column names
    if (columnNameProperty.length() == 0) {
      columnNames = new ArrayList();
    } else {
      columnNames = Arrays.asList(columnNameProperty.split(","));
    }

    // all column types
    if (columnTypeProperty.length() == 0) {
      columnTypes = new ArrayList();
    } else {
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }

    LOG.debug("columns: {}, {}", columnNameProperty, columnNames);
    LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes);

    assert (columnNames.size() == columnTypes.size());

    rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);

    cachedObjectInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo);
    try {
      schema = HCatSchemaUtils.getHCatSchema(rowTypeInfo).get(0).getStructSubSchema();
      LOG.debug("schema : {}", schema);
      LOG.debug("fields : {}", schema.getFieldNames());
    } catch (HCatException e) {
      throw new SerDeException(e);
    }

    jsonFactory = new JsonFactory();
  }

  /**
   * Takes JSON string in Text form, and has to return an object representation above
   * it that's readable by the corresponding object inspector.
   *
   * For this implementation, since we're using the jackson parser, we can construct
   * our own object implementation, and we use HCatRecord for it
   */
  @Override
  public Object deserialize(Writable blob) throws SerDeException {

    Text t = (Text) blob;
    JsonParser p;
    List r = new ArrayList(Collections.nCopies(columnNames.size(), null));
    try {
      p = jsonFactory.createJsonParser(new ByteArrayInputStream((t.getBytes())));
      if (p.nextToken() != JsonToken.START_OBJECT) {
        throw new IOException("Start token not found where expected");
      }
      JsonToken token;
      while (((token = p.nextToken()) != JsonToken.END_OBJECT) && (token != null)) {
        // iterate through each token, and create appropriate object here.
        populateRecord(r, token, p, schema);
      }
    } catch (JsonParseException e) {
      LOG.warn("Error [{}] parsing json text [{}].", e, t);
      LOG.debug(null, e);
      throw new SerDeException(e);
    } catch (IOException e) {
      LOG.warn("Error [{}] parsing json text [{}].", e, t);
      LOG.debug(null, e);
      throw new SerDeException(e);
    }

    return new DefaultHCatRecord(r);
  }

  private void populateRecord(List r, JsonToken token, JsonParser p, HCatSchema s) throws IOException {
    if (token != JsonToken.FIELD_NAME) {
      throw new IOException("Field name expected");
    }
    String fieldName = p.getText();
    int fpos;
    try {
      fpos = s.getPosition(fieldName);
    } catch (NullPointerException npe) {
      fpos = getPositionFromHiveInternalColumnName(fieldName);
      LOG.debug("NPE finding position for field [{}] in schema [{}]", fieldName, s);
      if (!fieldName.equalsIgnoreCase(getHiveInternalColumnName(fpos))) {
        LOG.error("Hive internal column name {} and position "
          + "encoding {} for the column name are at odds", fieldName, fpos);
        throw npe;
      }
      if (fpos == -1) {
        return; // unknown field, we return.
      }
    }
    HCatFieldSchema hcatFieldSchema = s.getFields().get(fpos);
    Object currField = extractCurrentField(p, null, hcatFieldSchema, false);
    r.set(fpos, currField);
  }

  public String getHiveInternalColumnName(int fpos) {
    return HiveConf.getColumnInternalName(fpos);
  }

  public int getPositionFromHiveInternalColumnName(String internalName) {
//    return HiveConf.getPositionFromInternalName(fieldName);
    // The above line should have been all the implementation that
    // we need, but due to a bug in that impl which recognizes
    // only single-digit columns, we need another impl here.
    Pattern internalPattern = Pattern.compile("_col([0-9]+)");
    Matcher m = internalPattern.matcher(internalName);
    if (!m.matches()) {
      return -1;
    } else {
      return Integer.parseInt(m.group(1));
    }
  }

  /**
   * Utility method to extract current expected field from given JsonParser
   *
   * To get the field, we need either a type or a hcatFieldSchema(necessary for complex types)
   * It is possible that one of them can be null, and so, if so, the other is instantiated
   * from the other
   *
   * isTokenCurrent is a boolean variable also passed in, which determines
   * if the JsonParser is already at the token we expect to read next, or
   * needs advancing to the next before we read.
   */
  private Object extractCurrentField(JsonParser p, Type t,
                     HCatFieldSchema hcatFieldSchema, boolean isTokenCurrent) throws IOException, JsonParseException,
    HCatException {
    Object val = null;
    JsonToken valueToken;
    if (isTokenCurrent) {
      valueToken = p.getCurrentToken();
    } else {
      valueToken = p.nextToken();
    }

    if (hcatFieldSchema != null) {
      t = hcatFieldSchema.getType();
    }
    switch (t) {
    case INT:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
      break;
    case TINYINT:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
      break;
    case SMALLINT:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
      break;
    case BIGINT:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
      break;
    case BOOLEAN:
      String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
      if (bval != null) {
        val = Boolean.valueOf(bval);
      } else {
        val = null;
      }
      break;
    case FLOAT:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
      break;
    case DOUBLE:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
      break;
    case STRING:
      val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
      break;
    case BINARY:
      throw new IOException("JsonSerDe does not support BINARY type");
    case ARRAY:
      if (valueToken == JsonToken.VALUE_NULL) {
        val = null;
        break;
      }
      if (valueToken != JsonToken.START_ARRAY) {
        throw new IOException("Start of Array expected");
      }
      List arr = new ArrayList();
      while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
        arr.add(extractCurrentField(p, null, hcatFieldSchema.getArrayElementSchema().get(0), true));
      }
      val = arr;
      break;
    case MAP:
      if (valueToken == JsonToken.VALUE_NULL) {
        val = null;
        break;
      }
      if (valueToken != JsonToken.START_OBJECT) {
        throw new IOException("Start of Object expected");
      }
      Map map = new LinkedHashMap();
      Type keyType = hcatFieldSchema.getMapKeyType();
      HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
      while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
        Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), keyType);
        Object v;
        if (valueSchema.getType() == HCatFieldSchema.Type.STRUCT) {
          v = extractCurrentField(p, null, valueSchema, false);
        } else {
          v = extractCurrentField(p, null, valueSchema, true);
        }

        map.put(k, v);
      }
      val = map;
      break;
    case STRUCT:
      if (valueToken == JsonToken.VALUE_NULL) {
        val = null;
        break;
      }
      if (valueToken != JsonToken.START_OBJECT) {
        throw new IOException("Start of Object expected");
      }
      HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
      int sz = subSchema.getFieldNames().size();

      List struct = new ArrayList(Collections.nCopies(sz, null));
      while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
        populateRecord(struct, valueToken, p, subSchema);
      }
      val = struct;
      break;
    }
    return val;
  }

  private Object getObjectOfCorrespondingPrimitiveType(String s, Type t) throws IOException {
    switch (t) {
    case INT:
      return Integer.valueOf(s);
    case TINYINT:
      return Byte.valueOf(s);
    case SMALLINT:
      return Short.valueOf(s);
    case BIGINT:
      return Long.valueOf(s);
    case BOOLEAN:
      return (s.equalsIgnoreCase("true"));
    case FLOAT:
      return Float.valueOf(s);
    case DOUBLE:
      return Double.valueOf(s);
    case STRING:
      return s;
    case BINARY:
      throw new IOException("JsonSerDe does not support BINARY type");
    }
    throw new IOException("Could not convert from string to map type " + t);
  }

  /**
   * Given an object and object inspector pair, traverse the object
   * and generate a Text representation of the object.
   */
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
    throws SerDeException {
    StringBuilder sb = new StringBuilder();
    try {

      StructObjectInspector soi = (StructObjectInspector) objInspector;
      List structFields = soi.getAllStructFieldRefs();
      assert (columnNames.size() == structFields.size());
      if (obj == null) {
        sb.append("null");
      } else {
        sb.append(SerDeUtils.LBRACE);
        for (int i = 0; i < structFields.size(); i++) {
          if (i > 0) {
            sb.append(SerDeUtils.COMMA);
          }
          sb.append(SerDeUtils.QUOTE);
          sb.append(columnNames.get(i));
          sb.append(SerDeUtils.QUOTE);
          sb.append(SerDeUtils.COLON);
          buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)),
            structFields.get(i).getFieldObjectInspector());
        }
        sb.append(SerDeUtils.RBRACE);
      }

    } catch (IOException e) {
      LOG.warn("Error generating json text from object.", e);
      throw new SerDeException(e);
    }
    return new Text(sb.toString());
  }

  // TODO : code section copied over from SerDeUtils because of non-standard json production there
  // should use quotes for all field names. We should fix this there, and then remove this copy.
  // See http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES
  // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure
  // when attempting to use that feature, so having to change the production itself.
  // Also, throws IOException when Binary is detected.
  private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException {

    switch (oi.getCategory()) {
    case PRIMITIVE: {
      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
      if (o == null) {
        sb.append("null");
      } else {
        switch (poi.getPrimitiveCategory()) {
        case BOOLEAN: {
          boolean b = ((BooleanObjectInspector) poi).get(o);
          sb.append(b ? "true" : "false");
          break;
        }
        case BYTE: {
          sb.append(((ByteObjectInspector) poi).get(o));
          break;
        }
        case SHORT: {
          sb.append(((ShortObjectInspector) poi).get(o));
          break;
        }
        case INT: {
          sb.append(((IntObjectInspector) poi).get(o));
          break;
        }
        case LONG: {
          sb.append(((LongObjectInspector) poi).get(o));
          break;
        }
        case FLOAT: {
          sb.append(((FloatObjectInspector) poi).get(o));
          break;
        }
        case DOUBLE: {
          sb.append(((DoubleObjectInspector) poi).get(o));
          break;
        }
        case STRING: {
          sb.append('"');
          sb.append(SerDeUtils.escapeString(((StringObjectInspector) poi)
            .getPrimitiveJavaObject(o)));
          sb.append('"');
          break;
        }
        case TIMESTAMP: {
          sb.append('"');
          sb.append(((TimestampObjectInspector) poi)
            .getPrimitiveWritableObject(o));
          sb.append('"');
          break;
        }
        case BINARY: {
          throw new IOException("JsonSerDe does not support BINARY type");
        }
        default:
          throw new RuntimeException("Unknown primitive type: "
            + poi.getPrimitiveCategory());
        }
      }
      break;
    }
    case LIST: {
      ListObjectInspector loi = (ListObjectInspector) oi;
      ObjectInspector listElementObjectInspector = loi
        .getListElementObjectInspector();
      List olist = loi.getList(o);
      if (olist == null) {
        sb.append("null");
      } else {
        sb.append(SerDeUtils.LBRACKET);
        for (int i = 0; i < olist.size(); i++) {
          if (i > 0) {
            sb.append(SerDeUtils.COMMA);
          }
          buildJSONString(sb, olist.get(i), listElementObjectInspector);
        }
        sb.append(SerDeUtils.RBRACKET);
      }
      break;
    }
    case MAP: {
      MapObjectInspector moi = (MapObjectInspector) oi;
      ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector();
      ObjectInspector mapValueObjectInspector = moi
        .getMapValueObjectInspector();
      Map omap = moi.getMap(o);
      if (omap == null) {
        sb.append("null");
      } else {
        sb.append(SerDeUtils.LBRACE);
        boolean first = true;
        for (Object entry : omap.entrySet()) {
          if (first) {
            first = false;
          } else {
            sb.append(SerDeUtils.COMMA);
          }
          Map.Entry e = (Map.Entry) entry;
          StringBuilder keyBuilder = new StringBuilder();
          buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector);
          String keyString = keyBuilder.toString().trim();
          boolean doQuoting = (!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE);
          if (doQuoting) {
            sb.append(SerDeUtils.QUOTE);
          }
          sb.append(keyString);
          if (doQuoting) {
            sb.append(SerDeUtils.QUOTE);
          }
          sb.append(SerDeUtils.COLON);
          buildJSONString(sb, e.getValue(), mapValueObjectInspector);
        }
        sb.append(SerDeUtils.RBRACE);
      }
      break;
    }
    case STRUCT: {
      StructObjectInspector soi = (StructObjectInspector) oi;
      List structFields = soi.getAllStructFieldRefs();
      if (o == null) {
        sb.append("null");
      } else {
        sb.append(SerDeUtils.LBRACE);
        for (int i = 0; i < structFields.size(); i++) {
          if (i > 0) {
            sb.append(SerDeUtils.COMMA);
          }
          sb.append(SerDeUtils.QUOTE);
          sb.append(structFields.get(i).getFieldName());
          sb.append(SerDeUtils.QUOTE);
          sb.append(SerDeUtils.COLON);
          buildJSONString(sb, soi.getStructFieldData(o, structFields.get(i)),
            structFields.get(i).getFieldObjectInspector());
        }
        sb.append(SerDeUtils.RBRACE);
      }
      break;
    }
    case UNION: {
      UnionObjectInspector uoi = (UnionObjectInspector) oi;
      if (o == null) {
        sb.append("null");
      } else {
        sb.append(SerDeUtils.LBRACE);
        sb.append(uoi.getTag(o));
        sb.append(SerDeUtils.COLON);
        buildJSONString(sb, uoi.getField(o),
          uoi.getObjectInspectors().get(uoi.getTag(o)));
        sb.append(SerDeUtils.RBRACE);
      }
      break;
    }
    default:
      throw new RuntimeException("Unknown type in ObjectInspector!");
    }
  }


  /**
   *  Returns an object inspector for the specified schema that
   *  is capable of reading in the object representation of the JSON string
   */
  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return cachedObjectInspector;
  }

  @Override
  public Class getSerializedClass() {
    return Text.class;
  }

  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics yet
    return null;
  }

}