
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.druid.data.input.parquet;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SequenceWriter;
import com.github.rvesse.airline.Cli;
import com.github.rvesse.airline.annotations.Arguments;
import com.github.rvesse.airline.annotations.Command;
import com.github.rvesse.airline.annotations.Option;
import com.github.rvesse.airline.builder.CliBuilder;
import org.apache.druid.data.input.parquet.simple.ParquetGroupConverter;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.IAE;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

import java.io.File;
import java.util.List;
import java.util.concurrent.Callable;

/**
 * Converts Parquet files into newline-delimited JSON files. Takes a single
 * argument (an input directory) and processes every file that ends with a
 * ".parquet" extension. For each input, writes a new file in the same
 * directory, named by appending ".json" to the old file name, overwriting
 * any output file that already exists.
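 *
 * Example invocation (a sketch; the exact classpath depends on how the Druid
 * and Parquet jars are laid out in your environment):
 *
 * <pre>
 * java -cp &lt;druid-and-parquet-jars&gt; org.apache.druid.data.input.parquet.ParquetToJson \
 *     --convert-corrupt-dates /path/to/parquet/dir
 * </pre>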
 */
@Command(name = "ParquetToJson")
public class ParquetToJson implements Callable<Void>
{
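  // Corrects date values that some older Parquet writers stored incorrectly
  // (the "corrupt dates" issue); the actual adjustment happens inside
  // ParquetGroupConverter.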
  @Option(name = "--convert-corrupt-dates")
  public boolean convertCorruptDates = false;

  @Arguments(description = "directory")
  public List<String> directories;
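
  // Airline CLI bootstrap: parses the command line into a ParquetToJson
  // instance and invokes call().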
  public static void main(String[] args) throws Exception
  {
    CliBuilder<Callable<Void>> builder = Cli.builder("ParquetToJson");
    builder.withDefaultCommand(ParquetToJson.class);
    builder.build().parse(args).call();
  }
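
  /**
   * Resolves the single directory argument to the ".parquet" files it
   * contains, failing fast if the argument is missing, is not a directory,
   * or contains no matching files.
   */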
  private File[] getInputFiles()
  {
    if (directories == null || directories.size() != 1) {
      throw new IAE("Only one directory argument is supported!");
    }
    File dir = new File(directories.get(0));
    if (!dir.isDirectory()) {
      throw new IAE("Not a directory [%s]", dir);
    }

    File[] inputFiles = dir.listFiles(
        pathname -> pathname.getName().endsWith(".parquet"));
    if (inputFiles == null || inputFiles.length == 0) {
      throw new IAE("No parquet files in directory [%s]", dir);
    }
    return inputFiles;
  }
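
  // For each input file, streams Parquet Group records through
  // ParquetGroupConverter and writes one JSON object per line (Jackson's
  // SequenceWriter with "\n" as the root value separator).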
  @Override
  public Void call() throws Exception
  {
    ObjectMapper mapper = new DefaultObjectMapper();

    File[] inputFiles = getInputFiles();
    for (File inputFile : inputFiles) {
      File outputFile = new File(inputFile.getAbsolutePath() + ".json");

      try (
          final ParquetReader<Group> reader = ParquetReader
              .builder(new GroupReadSupport(), new Path(inputFile.toURI()))
              .build();
          final SequenceWriter writer = mapper.writer()
              .withRootValueSeparator("\n")
              .writeValues(outputFile)) {
        ParquetGroupConverter converter = new ParquetGroupConverter(true, convertCorruptDates);
        Group group;
        while ((group = reader.read()) != null) {
          writer.write(converter.convertGroup(group));
        }
      }
    }
    return null;
  }
}