All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.udf.generic.GenericUDTFJSONTuple Maven / Gradle / Ivy

Go to download

Hive is a data warehouse infrastructure built on top of Hadoop see http://wiki.apache.org/hadoop/Hive

There is a newer version: 0.11.0-shark-0.9.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.udf.generic;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser.Feature;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.map.type.TypeFactory;
import org.codehaus.jackson.type.JavaType;

/**
 * GenericUDTFJSONTuple: this
 *
 */
@Description(name = "json_tuple",
    value = "_FUNC_(jsonStr, p1, p2, ..., pn) - like get_json_object, but it takes multiple names and return a tuple. " +
    		"All the input parameters and output column types are string.")

public class GenericUDTFJSONTuple extends GenericUDTF {

  private static Log LOG = LogFactory.getLog(GenericUDTFJSONTuple.class.getName());

  private static final JsonFactory JSON_FACTORY = new JsonFactory();
  static {
    // Allows for unescaped ASCII control characters in JSON values
    JSON_FACTORY.enable(Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
  }
  private static final ObjectMapper MAPPER = new ObjectMapper(JSON_FACTORY);
  private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class);

  int numCols;    // number of output columns
  String[] paths; // array of path expressions, each of which corresponds to a column
  Text[] retCols; // array of returned column values
  Text[] cols;    // object pool of non-null Text, avoid creating objects all the time
  Object[] nullCols; // array of null column values
  ObjectInspector[] inputOIs; // input ObjectInspectors
  boolean pathParsed = false;
  boolean seenErrors = false;

  //An LRU cache using a linked hash map
  static class HashCache extends LinkedHashMap {

    private static final int CACHE_SIZE = 16;
    private static final int INIT_SIZE = 32;
    private static final float LOAD_FACTOR = 0.6f;

    HashCache() {
      super(INIT_SIZE, LOAD_FACTOR);
    }

    private static final long serialVersionUID = 1;

    @Override
    protected boolean removeEldestEntry(Map.Entry eldest) {
      return size() > CACHE_SIZE;
    }

  }

  static Map jsonObjectCache = new HashCache();

  @Override
  public void close() throws HiveException {
  }

  @Override
  public StructObjectInspector initialize(ObjectInspector[] args)
      throws UDFArgumentException {

    inputOIs = args;
    numCols = args.length - 1;

    if (numCols < 1) {
      throw new UDFArgumentException("json_tuple() takes at least two arguments: " +
      		"the json string and a path expression");
    }

    for (int i = 0; i < args.length; ++i) {
      if (args[i].getCategory() != ObjectInspector.Category.PRIMITIVE ||
          !args[i].getTypeName().equals(serdeConstants.STRING_TYPE_NAME)) {
        throw new UDFArgumentException("json_tuple()'s arguments have to be string type");
      }
    }

    seenErrors = false;
    pathParsed = false;
    paths = new String[numCols];
    cols = new Text[numCols];
    retCols = new Text[numCols];
    nullCols = new Object[numCols];

    for (int i = 0; i < numCols; ++i) {
      cols[i] = new Text();
      retCols[i] = cols[i];
      nullCols[i] = null;
    }

    // construct output object inspector
    ArrayList fieldNames = new ArrayList(numCols);
    ArrayList fieldOIs = new ArrayList(numCols);
    for (int i = 0; i < numCols; ++i) {
      // column name can be anything since it will be named by UDTF as clause
      fieldNames.add("c" + i);
      // all returned type will be Text
      fieldOIs.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }
    return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
  }

  @SuppressWarnings("unchecked")
  @Override
  public void process(Object[] o) throws HiveException {

    if (o[0] == null) {
      forward(nullCols);
      return;
    }
    // get the path expression for the 1st row only
    if (!pathParsed) {
      for (int i = 0;i < numCols; ++i) {
        paths[i] = ((StringObjectInspector) inputOIs[i+1]).getPrimitiveJavaObject(o[i+1]);
      }
      pathParsed = true;
    }

    String jsonStr = ((StringObjectInspector) inputOIs[0]).getPrimitiveJavaObject(o[0]);
    if (jsonStr == null) {
      forward(nullCols);
      return;
    }
    try {
      Object jsonObj = jsonObjectCache.get(jsonStr);
      if (jsonObj == null) {
        try {
          jsonObj = MAPPER.readValue(jsonStr, MAP_TYPE);
        } catch (Exception e) {
          reportInvalidJson(jsonStr);
          forward(nullCols);
          return;
        }
        jsonObjectCache.put(jsonStr, jsonObj);
      }

      if (!(jsonObj instanceof Map)) {
        reportInvalidJson(jsonStr);
        forward(nullCols);
        return;
      }

      for (int i = 0; i < numCols; ++i) {
        if (retCols[i] == null) {
          retCols[i] = cols[i]; // use the object pool rather than creating a new object
        }
        Object extractObject = ((Map)jsonObj).get(paths[i]);
        if (extractObject instanceof Map || extractObject instanceof List) {
          retCols[i].set(MAPPER.writeValueAsString(extractObject));
        } else if (extractObject != null) {
          retCols[i].set(extractObject.toString());
        } else {
          retCols[i] = null;
        }
      }
      forward(retCols);
      return;
    } catch (Throwable e) {
      LOG.error("JSON parsing/evaluation exception" + e);
      forward(nullCols);
    }
  }

  @Override
  public String toString() {
    return "json_tuple";
  }

  private void reportInvalidJson(String jsonStr) {
    if (!seenErrors) {
      LOG.error("The input is not a valid JSON string: " + jsonStr +
          ". Skipping such error messages in the future.");
      seenErrors = true;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy