/*
 * org.apache.orc.tools.json.JsonShredder Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of orc-tools Show documentation
 * The newest version!
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc.tools.json;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonStreamParser;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;

/**
 * Takes a set of JSON documents and shreds them into one text file per
 * primitive column. This is useful when trying to understand a set of
 * documents, because it provides sample values for each of the columns.
 *
 * <p>For example, a document that looks like:
 * <pre>
 * {'a': 'aaaa', 'b': { 'c': 12, 'd': true}, e: 'eeee'}
 * </pre>
 *
 * will produce 4 files with the given contents:
 * <pre>
 * root.a.txt: aaaa
 * root.b.c.txt: 12
 * root.b.d.txt: true
 * root.e.txt: eeee
 * </pre>
 *
 * <p>Elements of a JSON array are written to the column named after the
 * array with {@code .list} appended; JSON nulls are skipped.
 */
public class JsonShredder implements AutoCloseable {

  /** Open output streams, keyed by column path (for example "root.b.c"). */
  private final Map<String, PrintStream> files = new HashMap<>();

  /**
   * Returns the output stream for a column, creating the backing file
   * {@code name + ".txt"} in the current working directory on first use.
   *
   * <p>NOTE(review): the file name is derived directly from JSON field names,
   * so a field name containing a path separator ends up in the path that is
   * opened. Confirm that inputs are trusted before running this tool on them.
   *
   * @param name the column path
   * @return the cached or newly created stream for that column
   * @throws IOException if the file cannot be created
   */
  private PrintStream getFile(String name) throws IOException {
    PrintStream result = files.get(name);
    if (result == null) {
      result = new PrintStream(new FileOutputStream(name + ".txt"), false,
          StandardCharsets.UTF_8.name());
      files.put(name, result);
    }
    return result;
  }

  /**
   * Recursively writes every primitive value found under {@code json} to the
   * file for its column.
   *
   * @param name the column path of {@code json}
   * @param json the element to shred; may be a primitive, null, array or object
   * @throws IOException if an output file cannot be created
   */
  private void shredObject(String name, JsonElement json) throws IOException {
    if (json.isJsonPrimitive()) {
      JsonPrimitive primitive = (JsonPrimitive) json;
      getFile(name).println(primitive.getAsString());
    } else if (json.isJsonNull()) {
      // nulls carry no sample value, so there is nothing to write
    } else if (json.isJsonArray()) {
      // all elements of an array share a single ".list" column
      for (JsonElement child : (JsonArray) json) {
        shredObject(name + ".list", child);
      }
    } else {
      JsonObject obj = (JsonObject) json;
      for (Map.Entry<String, JsonElement> field : obj.entrySet()) {
        shredObject(name + "." + field.getKey(), field.getValue());
      }
    }
  }

  /**
   * Closes every output file that was opened.
   *
   * <p>{@link PrintStream} never throws on write failures, so each stream's
   * error flag is checked here and reported once all streams are closed.
   *
   * @throws IOException if writing or closing any of the output files failed
   */
  @Override
  public void close() throws IOException {
    String failed = null;
    for (Map.Entry<String, PrintStream> file : files.entrySet()) {
      PrintStream stream = file.getValue();
      stream.close();
      if (stream.checkError() && failed == null) {
        failed = file.getKey();
      }
    }
    files.clear();
    if (failed != null) {
      throw new IOException("Error writing " + failed + ".txt");
    }
  }

  /**
   * Shreds each file named on the command line. Files whose name ends in
   * {@code .gz} are gunzipped first; all input is decoded as UTF-8. Each file
   * may contain any number of concatenated JSON documents.
   *
   * @param args the paths of the JSON files to read
   * @throws Exception if a file cannot be read, parsed or written
   */
  public static void main(String[] args) throws Exception {
    int count = 0;
    // try-with-resources guarantees the output files are closed even when
    // reading or parsing one of the inputs fails part way through
    try (JsonShredder shredder = new JsonShredder()) {
      for (String filename : args) {
        System.out.println("Reading " + filename);
        System.out.flush();
        try (FileInputStream inStream = new FileInputStream(filename);
             java.io.Reader reader = new InputStreamReader(
                 filename.endsWith(".gz")
                     ? new GZIPInputStream(inStream) : inStream,
                 StandardCharsets.UTF_8)) {
          JsonStreamParser parser = new JsonStreamParser(reader);
          while (parser.hasNext()) {
            count += 1;
            JsonElement item = parser.next();
            shredder.shredObject("root", item);
          }
        }
      }
    }
    System.out.println(count + " records read");
    System.out.println();
  }
}