All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.pig.load.JsonLoader Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.load;

import com.google.common.collect.Maps;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Map;

/**
 * Decodes each line as JSON passes the resulting map of values
 * to Pig as a single-element tuple.
 * 
 * 

This Loader supports loading of nested JSON structures, but this feature * is disabled by default. There are two ways to enable it:

    *
  • setting the -nestedLoad option in the * {@link JsonLoader#JsonLoader(String)} constructor *
  • setting the elephantbird.jsonloader.nestedLoad * property to true. This can be done with the following pig command: *
    grunt> set elephantbird.jsonloader.nestedLoad 'true'
    *
*/ public class JsonLoader extends LzoBaseLoadFunc { private static final Logger LOG = LoggerFactory.getLogger(JsonLoader.class); private static final TupleFactory tupleFactory = TupleFactory.getInstance(); public static final String NESTED_LOAD_KEY = "elephantbird.jsonloader.nestedLoad"; private final static Options validOptions_ = new Options(); private final static CommandLineParser parser_ = new GnuParser(); private final CommandLine configuredOptions_; private final JSONParser jsonParser = new JSONParser(); private enum JsonLoaderCounters { LinesRead, LinesJsonDecoded, LinesParseError, LinesParseErrorBadNumber } private String inputFormatClassName; private boolean isNestedLoadEnabled = false; private static void populateValidOptions() { validOptions_.addOption("nestedLoad", false, "Enables loading of " + "nested JSON structures"); validOptions_.addOption("inputFormat", true, "The input format class name" + " used by this loader instance"); } public JsonLoader() { // defaults to no options this(""); } /** * Constructor. Construct a JsonLoader LoadFunc to load. * @param inputFormatClassName the inputFormat class name * * @param optString Loader options. Available options:
    *
  • -nestedLoad==(true|false) Enables loading of nested JSON * structures. When enabled, JSON objects are loaded as nested Maps * and JSON arrays are loaded as Bags. *
  • -inputFormat=className The input format class name used * by this loader instance. *
*/ public JsonLoader(String optString) { populateValidOptions(); String[] optsArr = optString.split(" "); try { configuredOptions_ = parser_.parse(validOptions_, optsArr); } catch (org.apache.commons.cli.ParseException e) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( "[-nestedLoad] [-inputFormat]", validOptions_ ); throw new RuntimeException(e); } isNestedLoadEnabled = configuredOptions_.hasOption("nestedLoad"); if (configuredOptions_.getOptionValue("inputFormat") != null) { this.inputFormatClassName = configuredOptions_.getOptionValue("inputFormat"); } else { this.inputFormatClassName = TextInputFormat.class.getName(); } } public void setInputFormatClassName(String inputFormatClassName) { this.inputFormatClassName = inputFormatClassName; } /** * Return every non-null line as a single-element tuple to Pig. */ @Override public Tuple getNext() throws IOException { if (reader == null) { return null; } // nested load can be enabled through a pig property if ("true".equals(jobConf.get(NESTED_LOAD_KEY))) isNestedLoadEnabled = true; try { while (reader.nextKeyValue()) { Text value = (Text) reader.getCurrentValue(); incrCounter(JsonLoaderCounters.LinesRead, 1L); Tuple t = parseStringToTuple(value.toString()); if (t != null) { incrCounter(JsonLoaderCounters.LinesJsonDecoded, 1L); return t; } } return null; } catch (InterruptedException e) { int errCode = 6018; String errMsg = "Error while reading input"; throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e); } } @Override public InputFormat getInputFormat() throws IOException { try { return (FileInputFormat) PigContext.resolveClassName(inputFormatClassName).newInstance(); } catch (InstantiationException e) { throw new IOException("Failed creating input format " + inputFormatClassName, e); } catch (IllegalAccessException e) { throw new IOException("Failed creating input format " + inputFormatClassName, e); } } protected Tuple parseStringToTuple(String line) { try { JSONObject jsonObj = (JSONObject) jsonParser.parse(line); if (jsonObj != null) { return tupleFactory.newTuple(walkJson(jsonObj)); } else { // JSONParser#parse(String) may return a null reference, e.g. when // the input parameter is the string "null". A single line with // "null" is not valid JSON though. LOG.warn("Could not json-decode string: " + line); incrCounter(JsonLoaderCounters.LinesParseError, 1L); return null; } } catch (ParseException e) { LOG.warn("Could not json-decode string: " + line, e); incrCounter(JsonLoaderCounters.LinesParseError, 1L); return null; } catch (NumberFormatException e) { LOG.warn("Very big number exceeds the scale of long: " + line, e); incrCounter(JsonLoaderCounters.LinesParseErrorBadNumber, 1L); return null; } catch (ClassCastException e) { LOG.warn("Could not convert to Json Object: " + line, e); incrCounter(JsonLoaderCounters.LinesParseError, 1L); return null; } } private Object wrap(Object value) { if (isNestedLoadEnabled && value instanceof JSONObject) { return walkJson((JSONObject) value); } else if (isNestedLoadEnabled && value instanceof JSONArray) { JSONArray a = (JSONArray) value; DataBag mapValue = new NonSpillableDataBag(a.size()); for (int i=0; i walkJson(JSONObject jsonObj) { Map v = Maps.newHashMap(); for (Object key: jsonObj.keySet()) { v.put(key.toString(), wrap(jsonObj.get(key))); } return v; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy