// org.apache.hadoop.hive.serde2.json.HiveJsonReader Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2.json;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.common.type.TimestampTZ;
import org.apache.hadoop.hive.serde2.JsonSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TimestampLocalTZTypeInfo;
import org.apache.hive.common.util.TimestampParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.JsonNodeType;
import com.fasterxml.jackson.databind.node.TextNode;
import com.google.common.base.Preconditions;
/**
* This class converts JSON strings into Java or Hive Primitive objects.
*
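* Supported types are:
*
* <table border="1">
* <tr>
* <th>JSON Type</th>
* <th>Java Type</th>
* <th>Notes</th>
* </tr>
* <tr>
* <td>Object</td>
* <td>java.util.List</td>
* <td>Each element may be different type</td>
* </tr>
* <tr>
* <td>Array</td>
* <td>java.util.List</td>
* <td>Each element is same type</td>
* </tr>
* <tr>
* <td>Map</td>
* <td>java.util.Map</td>
* <td>Keys must be same primitive type; every value is the same type</td>
* </tr>
* </table>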
*/
public class HiveJsonReader {
private static final Logger LOG =
LoggerFactory.getLogger(HiveJsonReader.class);
private final Map, StructField> discoveredFields =
new HashMap<>();
private final Set> discoveredUnknownFields =
new HashSet<>();
private final EnumSet features = EnumSet.noneOf(Feature.class);
private final ObjectMapper objectMapper;
private final TimestampParser tsParser;
private BinaryEncoding binaryEncoding;
private final ObjectInspector oi;
/**
* Enumeration that defines all on/off features for this reader.
* <ul>
* <li>{@link #COL_INDEX_PARSING}</li>
* <li>{@link #PRIMITIVE_TO_WRITABLE}</li>
* <li>{@link #IGNORE_UNKNOWN_FIELDS}</li>
* <li>{@link #STRINGIFY_COMPLEX_FIELDS}</li>
* </ul>
*/
public enum Feature {
/**
* Enables an optimization that looks up each JSON field based on its index
* in the Hive schema.
*/
COL_INDEX_PARSING,
/**
* If this feature is enabled, the value of each parsed JSON node is
* returned as a Hadoop Writable object. Otherwise, the Java native value is
* returned.
*/
PRIMITIVE_TO_WRITABLE,
/**
* If the JSON object being parsed includes a field that is not included in
* the Hive schema, enabling this feature will cause the JSON reader to
* log a warning. If this feature is disabled, an Exception will be
* thrown and parsing will stop.
*/
IGNORE_UNKNOWN_FIELDS,
/**
* If the JSON object being parsed includes a complex field with no defined
* Hive schema, enabling this feature will cause the JSON reader to treat the
* field as a String. If this feature is disabled, an Exception will be
* thrown and parsing will stop.
*/
STRINGIFY_COMPLEX_FIELDS
}
/**
* Constructor that uses the default Hive timestamp parser.
*
* @param oi ObjectInspector for all the fields in the JSON object
*/
public HiveJsonReader(ObjectInspector oi) {
// Delegate to the two-argument constructor, passing a TimestampParser
// created with its no-argument constructor.
this(oi, new TimestampParser());
}
/**
* Constructor that uses a caller-supplied timestamp parser.
*
* @param oi ObjectInspector info for all the fields in the JSON object
* @param tsParser Custom timestamp parser
*/
public HiveJsonReader(ObjectInspector oi, TimestampParser tsParser) {
  // Collaborators handed in by the caller.
  this.oi = oi;
  this.tsParser = tsParser;

  // Internally created state: a fresh Jackson mapper, and the binary
  // encoding initialised to BASE64.
  this.objectMapper = new ObjectMapper();
  this.binaryEncoding = BinaryEncoding.BASE64;
}
/**
* Parse text containing a complete JSON object.
*
* @param text The text to parse
* @return A List of Objects, one for each field in the JSON object
* @throws IOException Unable to parse the JSON text
* @throws SerDeException The SerDe is not configured correctly
*/
public Object parseStruct(final String text)
    throws IOException, SerDeException {
  // Reject a null argument first, then a reader that has no inspector.
  Preconditions.checkNotNull(text);
  Preconditions.checkState(oi != null);

  // Materialise the whole JSON tree, then convert it using the configured
  // inspector.
  return visitNode(objectMapper.reader().readTree(text), oi);
}
/**
* Parse a complete JSON object read from an input stream.
*
* @param in The InputStream to read the text from
* @return A List of Objects, one for each field in the JSON object
* @throws IOException Unable to parse the JSON text
* @throws SerDeException The SerDe is not configured correctly
*/
public Object parseStruct(final InputStream in)
    throws IOException, SerDeException {
  // Reject a null stream first, then a reader that has no inspector.
  Preconditions.checkNotNull(in);
  Preconditions.checkState(oi != null);

  // Read the stream into a JSON tree, then convert it using the configured
  // inspector.
  return visitNode(objectMapper.reader().readTree(in), oi);
}
/**
* Visit a node and parse it based on the provided ObjectInspector.
*
* @param rootNode The root node to process
* @param oi The ObjectInspector to use
* @return The value in this node. Return value may be null, primitive, and
* may be a complex type if nested.
* @throws SerDeException The SerDe is not configured correctly
*/
private Object visitNode(final JsonNode rootNode, final ObjectInspector oi)
    throws SerDeException {
  // A JSON null becomes a Java null regardless of the expected category.
  if (rootNode.isNull()) {
    return null;
  }

  // Dispatch on the category the inspector expects for this node.
  switch (oi.getCategory()) {
    case STRUCT:
      return visitStructNode(rootNode, oi);
    case MAP:
      return visitMapNode(rootNode, oi);
    case LIST:
      return visitArrayNode(rootNode, oi);
    case PRIMITIVE:
      // Leaf values are parsed first, then optionally wrapped as Writable.
      return optionallyWrapWritable(visitLeafNode(rootNode, oi), oi);
    default:
      throw new SerDeException(
          "Parsing of: " + oi.getCategory() + " is not supported");
  }
}
/**
* The typical usage of this SerDe requires that it return Hadoop Writable
* objects. However, some uses of this SerDe want the return values to be Java
* primitive objects. This SerDe works explicitly in Java primitive objects
* and will wrap the objects in Writable containers if required.
*
* @param value The Java primitive object to wrap
* @param oi The ObjectInspector provides the type to wrap into
* @return A Hadoop Writable if required; otherwise the object itself
*/
private Object optionallyWrapWritable(final Object value,
    final ObjectInspector oi) {
  if (isEnabled(Feature.PRIMITIVE_TO_WRITABLE)) {
    // Look up the Java object inspector for this primitive type and let it
    // produce the Writable form of the value.
    final PrimitiveTypeInfo primitiveType =
        ((PrimitiveObjectInspector) oi).getTypeInfo();
    return PrimitiveObjectInspectorFactory
        .getPrimitiveJavaObjectInspector(primitiveType)
        .getPrimitiveWritableObject(value);
  }
  // Feature disabled: hand back the value untouched.
  return value;
}
/**
* Visit a node if it is expected to be a Map (a.k.a. JSON Object)
*
* @param rootNode The node pointing at the JSON object
* @param oi The ObjectInspector to parse the Map (must be a
* MapObjectInspector)
* @return A Java Map containing the contents of the JSON map
* @throws SerDeException The SerDe is not configured correctly
*/
private Map
© 2015 - 2024 Weber Informatics LLC | Privacy Policy