/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.tools;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.orc.util.BloomFilter;
import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.impl.OrcAcidUtils;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.OrcProto;
import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.apache.orc.impl.RecordReaderImpl;
/**
* A tool for printing out the file structure of ORC files.
*/
public final class FileDump {
public static final String UNKNOWN = "UNKNOWN";
public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n";
public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
public boolean accept(Path p) {
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX);
}
};
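// Illustrative filter behavior (hypothetical file names): a regular bucket file such as
// "bucket_00000" is accepted, while hidden entries like "_SUCCESS" or ".staging" and ACID
// side files ending in OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX (e.g. "bucket_00000_flush_length")
// are skipped.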
// utility class; not instantiated
private FileDump() {
}
public static void main(Configuration conf, String[] args) throws Exception {
List<Integer> rowIndexCols = new ArrayList<>(0);
Options opts = createOptions();
CommandLine cli = new GnuParser().parse(opts, args);
if (cli.hasOption('h')) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("orcfiledump", opts);
return;
}
boolean dumpData = cli.hasOption('d');
boolean recover = cli.hasOption("recover");
boolean skipDump = cli.hasOption("skip-dump");
String backupPath = DEFAULT_BACKUP_PATH;
if (cli.hasOption("backup-path")) {
backupPath = cli.getOptionValue("backup-path");
}
if (cli.hasOption("r")) {
String val = cli.getOptionValue("r");
if (val != null && val.trim().equals("*")) {
rowIndexCols = null; // All the columns
} else {
String[] colStrs = cli.getOptionValue("r").split(",");
rowIndexCols = new ArrayList<>(colStrs.length);
for (String colStr : colStrs) {
rowIndexCols.add(Integer.parseInt(colStr));
}
}
}
boolean printTimeZone = cli.hasOption('t');
boolean jsonFormat = cli.hasOption('j');
String[] files = cli.getArgs();
if (files.length == 0) {
System.err.println("Error : ORC files are not specified");
return;
}
// if the specified path is directory, iterate through all files and print the file dump
List<String> filesInPath = new ArrayList<>();
for (String filename : files) {
Path path = new Path(filename);
filesInPath.addAll(getAllFilesInPath(path, conf));
}
if (dumpData) {
PrintData.main(conf, filesInPath.toArray(new String[filesInPath.size()]));
} else if (recover && skipDump) {
recoverFiles(filesInPath, conf, backupPath);
} else {
if (jsonFormat) {
boolean prettyPrint = cli.hasOption('p');
JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
} else {
printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
main(conf, args);
}
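// Example invocations (hypothetical paths; assumes the ORC tools jar is on the classpath):
//
// java org.apache.orc.tools.FileDump /data/warehouse/part-00000.orc --rowindex=1,2
// java org.apache.orc.tools.FileDump /data/warehouse/part-00000.orc -j -p
// java org.apache.orc.tools.FileDump /data/warehouse/deltas --recover --skip-dump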
/**
* This method returns an ORC reader object if the specified file is readable. If the specified
* file has a side file (*_flush_length), the max footer offset is read from the side file and
* the ORC reader is created from that offset. Since both the data file and the side file use
* hflush() to flush data, the two files can be inconsistent and out of sync. Null is returned
* in the following cases:
*
* 1) The file specified by path, or its side file, is still open for writes.
* 2) The *_flush_length file does not contain any footer offset.
* 3) The *_flush_length file contains a valid footer offset, but the data file is not readable
* at that position (incomplete data file).
* 4) The *_flush_length file length is not a multiple of 8; the reader is then created from the
* previous valid footer. If there is no such footer (file length > 0 and < 8), null is
* returned.
*
* Also, if this method detects any file corruption (a mismatch between the data file and the
* side file), it adds the corresponding file to the specified list of corrupted files.
*
* In all other cases, where the file is readable, this method returns a reader object.
*
* @param path - file to get reader for
* @param conf - configuration object
* @param corruptFiles - fills this list with all possible corrupted files
* @return - reader for the specified file or null
* @throws IOException
*/
static Reader getReader(final Path path, final Configuration conf,
final List<String> corruptFiles) throws IOException {
FileSystem fs = path.getFileSystem(conf);
long dataFileLen = fs.getFileStatus(path).getLen();
System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
Path sideFile = OrcAcidUtils.getSideFile(path);
final boolean sideFileExists = fs.exists(sideFile);
boolean openDataFile = false;
boolean openSideFile = false;
if (fs instanceof DistributedFileSystem) {
DistributedFileSystem dfs = (DistributedFileSystem) fs;
openDataFile = !dfs.isFileClosed(path);
openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
}
if (openDataFile || openSideFile) {
if (openDataFile && openSideFile) {
System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
" are still open for writes.");
} else if (openSideFile) {
System.err.println("Unable to perform file dump as " + sideFile +
" is still open for writes.");
} else {
System.err.println("Unable to perform file dump as " + path +
" is still open for writes.");
}
return null;
}
Reader reader = null;
if (sideFileExists) {
final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path);
final long sideFileLen = fs.getFileStatus(sideFile).getLen();
System.err.println("Found flush length file " + sideFile
+ " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
// no offsets read from side file
if (maxLen == -1) {
// with no recorded footer offset, any data present in the data file could be recovered
if (dataFileLen > maxLen) {
System.err.println("Data file has more data than max footer offset:" + maxLen +
". Adding data file to recovery list.");
if (corruptFiles != null) {
corruptFiles.add(path.toUri().toString());
}
}
return null;
}
try {
reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
// if data file is larger than last flush length, then additional data could be recovered
if (dataFileLen > maxLen) {
System.err.println("Data file has more data than max footer offset:" + maxLen +
". Adding data file to recovery list.");
if (corruptFiles != null) {
corruptFiles.add(path.toUri().toString());
}
}
} catch (Exception e) {
if (corruptFiles != null) {
corruptFiles.add(path.toUri().toString());
}
System.err.println("Unable to read data from max footer offset." +
" Adding data file to recovery list.");
return null;
}
} else {
reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
}
return reader;
}
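// Minimal caller sketch for getReader (hypothetical path): a null return combined with a
// non-empty corruptFiles list indicates a file that needs recovery rather than a hard failure.
//
// List<String> corrupt = new ArrayList<>();
// Reader r = getReader(new Path("/data/deltas/bucket_00000"), conf, corrupt);
// if (r == null && !corrupt.isEmpty()) {
// recoverFiles(corrupt, conf, DEFAULT_BACKUP_PATH);
// }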
public static Collection<String> getAllFilesInPath(final Path path,
final Configuration conf) throws IOException {
List<String> filesInPath = new ArrayList<>();
FileSystem fs = path.getFileSystem(conf);
FileStatus fileStatus = fs.getFileStatus(path);
if (fileStatus.isDir()) {
FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
for (FileStatus fileInPath : fileStatuses) {
if (fileInPath.isDir()) {
filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
} else {
filesInPath.add(fileInPath.getPath().toString());
}
}
} else {
filesInPath.add(path.toString());
}
return filesInPath;
}
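// For example (hypothetical layout): given a directory /warehouse/t containing "bucket_00000"
// and a sub-directory "delta_1_1" with "bucket_00001", this returns both data files, while
// HIDDEN_AND_SIDE_FILE_FILTER drops entries such as "_SUCCESS" or "bucket_00000_flush_length".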
private static void printMetaData(List<String> files, Configuration conf,
List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
final String backupPath)
throws IOException {
List<String> corruptFiles = new ArrayList<>();
for (String filename : files) {
printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
System.out.println(SEPARATOR);
}
if (!corruptFiles.isEmpty()) {
if (recover) {
recoverFiles(corruptFiles, conf, backupPath);
} else {
System.err.println(corruptFiles.size() + " file(s) are corrupted." +
" Run the following command to recover corrupted files.\n");
StringBuilder buffer = new StringBuilder();
buffer.append("hive --orcfiledump --recover --skip-dump");
for(String file: corruptFiles) {
buffer.append(' ');
buffer.append(file);
}
System.err.println(buffer.toString());
System.out.println(SEPARATOR);
}
}
}
private static void printMetaDataImpl(final String filename,
final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
final List<String> corruptFiles) throws IOException {
Path file = new Path(filename);
Reader reader = getReader(file, conf, corruptFiles);
// if we can create a reader then the footer is not corrupt and the file is readable
if (reader == null) {
return;
}
TypeDescription schema = reader.getSchema();
System.out.println("Structure for " + filename);
System.out.println("File Version: " + reader.getFileVersion().getName() +
" with " + reader.getWriterVersion());
RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
System.out.println("Rows: " + reader.getNumberOfRows());
System.out.println("Compression: " + reader.getCompressionKind());
if (reader.getCompressionKind() != CompressionKind.NONE) {
System.out.println("Compression size: " + reader.getCompressionSize());
}
System.out.println("Type: " + reader.getSchema().toString());
System.out.println("\nStripe Statistics:");
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
for (int n = 0; n < stripeStats.size(); n++) {
System.out.println(" Stripe " + (n + 1) + ":");
StripeStatistics ss = stripeStats.get(n);
for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
System.out.println(" Column " + i + ": " +
ss.getColumnStatistics()[i].toString());
}
}
ColumnStatistics[] stats = reader.getStatistics();
int colCount = stats.length;
if (rowIndexCols == null) {
rowIndexCols = new ArrayList<>(colCount);
for (int i = 0; i < colCount; ++i) {
rowIndexCols.add(i);
}
}
System.out.println("\nFile Statistics:");
for (int i = 0; i < stats.length; ++i) {
System.out.println(" Column " + i + ": " + stats[i].toString());
}
System.out.println("\nStripes:");
int stripeIx = -1;
for (StripeInformation stripe : reader.getStripes()) {
++stripeIx;
long stripeStart = stripe.getOffset();
OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
if (printTimeZone) {
String tz = footer.getWriterTimezone();
if (tz == null || tz.isEmpty()) {
tz = UNKNOWN;
}
System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
} else {
System.out.println(" Stripe: " + stripe.toString());
}
long sectionStart = stripeStart;
for (OrcProto.Stream section : footer.getStreamsList()) {
String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
System.out.println(" Stream: column " + section.getColumn() +
" section " + kind + " start: " + sectionStart +
" length " + section.getLength());
sectionStart += section.getLength();
}
for (int i = 0; i < footer.getColumnsCount(); ++i) {
OrcProto.ColumnEncoding encoding = footer.getColumns(i);
StringBuilder buf = new StringBuilder();
buf.append(" Encoding column ");
buf.append(i);
buf.append(": ");
buf.append(encoding.getKind());
if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
buf.append("[");
buf.append(encoding.getDictionarySize());
buf.append("]");
}
System.out.println(buf);
}
if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
// mark the specified columns; bloom filters are read only for the included columns
boolean[] sargColumns = new boolean[colCount];
for (int colIdx : rowIndexCols) {
sargColumns[colIdx] = true;
}
OrcIndex indices = rows
.readRowIndex(stripeIx, null, null, null, sargColumns);
for (int col : rowIndexCols) {
StringBuilder buf = new StringBuilder();
String rowIdxString = getFormattedRowIndices(col,
indices.getRowGroupIndex(), schema);
buf.append(rowIdxString);
String bloomFilString = getFormattedBloomFilters(col, indices,
reader.getWriterVersion(),
reader.getSchema().findSubtype(col).getCategory(),
footer.getColumns(col));
buf.append(bloomFilString);
System.out.println(buf);
}
}
}
FileSystem fs = file.getFileSystem(conf);
long fileLen = fs.getFileStatus(file).getLen();
long paddedBytes = getTotalPaddingSize(reader);
// an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
DecimalFormat format = new DecimalFormat("##.##");
System.out.println("\nFile length: " + fileLen + " bytes");
System.out.println("Padding length: " + paddedBytes + " bytes");
System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
//print out any user metadata properties
List<String> keys = reader.getMetadataKeys();
for(int i = 0; i < keys.size(); i++) {
if(i == 0) {
System.out.println("\nUser Metadata:");
}
ByteBuffer byteBuffer = reader.getMetadataValue(keys.get(i));
System.out.println(" " + keys.get(i) + "="
+ StandardCharsets.UTF_8.decode(byteBuffer));
}
rows.close();
}
private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
final String backup)
throws IOException {
for (String corruptFile : corruptFiles) {
System.err.println("Recovering file " + corruptFile);
Path corruptPath = new Path(corruptFile);
FileSystem fs = corruptPath.getFileSystem(conf);
FSDataInputStream fdis = fs.open(corruptPath);
try {
long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
long remaining = corruptFileLen;
List<Long> footerOffsets = new ArrayList<>();
// start reading the data file from top to bottom and record the valid footers
while (remaining > 0) {
int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
byte[] data = new byte[toRead];
long startPos = corruptFileLen - remaining;
fdis.readFully(startPos, data, 0, toRead);
// find every occurrence of the MAGIC string and check whether the file is readable from there
int index = 0;
long nextFooterOffset;
byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
while (index != -1) {
index = indexOf(data, magicBytes, index + 1);
if (index != -1) {
nextFooterOffset = startPos + index + magicBytes.length + 1;
if (isReadable(corruptPath, conf, nextFooterOffset)) {
footerOffsets.add(nextFooterOffset);
}
}
}
System.err.println("Scanning for valid footers - startPos: " + startPos +
" toRead: " + toRead + " remaining: " + remaining);
remaining = remaining - toRead;
}
System.err.println("Readable footerOffsets: " + footerOffsets);
recoverFile(corruptPath, fs, conf, footerOffsets, backup);
} catch (Exception e) {
Path recoveryFile = getRecoveryFile(corruptPath);
if (fs.exists(recoveryFile)) {
fs.delete(recoveryFile, false);
}
System.err.println("Unable to recover file " + corruptFile);
e.printStackTrace();
System.err.println(SEPARATOR);
continue;
} finally {
fdis.close();
}
System.err.println(corruptFile + " recovered successfully!");
System.err.println(SEPARATOR);
}
}
private static void recoverFile(final Path corruptPath, final FileSystem fs,
final Configuration conf, final List<Long> footerOffsets, final String backup)
throws IOException {
// first recover the file to .recovered file and then once successful rename it to actual file
Path recoveredPath = getRecoveryFile(corruptPath);
// make sure that file does not exist
if (fs.exists(recoveredPath)) {
fs.delete(recoveredPath, false);
}
// if there are no valid footers, create an empty ORC file so that the file is still readable
if (footerOffsets == null || footerOffsets.isEmpty()) {
System.err.println("No readable footers found. Creating empty orc file.");
TypeDescription schema = TypeDescription.createStruct();
Writer writer = OrcFile.createWriter(recoveredPath,
OrcFile.writerOptions(conf).setSchema(schema));
writer.close();
} else {
FSDataInputStream fdis = fs.open(corruptPath);
FileStatus fileStatus = fs.getFileStatus(corruptPath);
// read corrupt file and copy it to recovered file until last valid footer
FSDataOutputStream fdos = fs.create(recoveredPath, true,
conf.getInt("io.file.buffer.size", 4096),
fileStatus.getReplication(),
fileStatus.getBlockSize());
try {
long fileLen = footerOffsets.get(footerOffsets.size() - 1);
long remaining = fileLen;
while (remaining > 0) {
int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
byte[] data = new byte[toRead];
long startPos = fileLen - remaining;
fdis.readFully(startPos, data, 0, toRead);
fdos.write(data);
System.err.println("Copying data to recovery file - startPos: " + startPos +
" toRead: " + toRead + " remaining: " + remaining);
remaining = remaining - toRead;
}
} catch (Exception e) {
fs.delete(recoveredPath, false);
throw new IOException(e);
} finally {
fdis.close();
fdos.close();
}
}
// validate the recovered file once again and start moving corrupt files to backup folder
if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
Path backupDataPath;
String scheme = corruptPath.toUri().getScheme();
String authority = corruptPath.toUri().getAuthority();
String filePath = corruptPath.toUri().getPath();
// use the same filesystem as corrupt file if backup-path is not explicitly specified
if (backup.equals(DEFAULT_BACKUP_PATH)) {
backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
} else {
backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
}
// Move data file to backup path
moveFiles(fs, corruptPath, backupDataPath);
// Move side file to backup path
Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath);
Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
moveFiles(fs, sideFilePath, backupSideFilePath);
// finally move recovered file to actual file
moveFiles(fs, recoveredPath, corruptPath);
// we are done recovering, backing up and validating
System.err.println("Validation of recovered file successful!");
}
}
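// For example (hypothetical URI): with the default backup path (java.io.tmpdir, typically
// /tmp), a corrupt file "hdfs://nn:8020/warehouse/t/bucket_00000" is moved to
// "hdfs://nn:8020/tmp/warehouse/t/bucket_00000" on the same filesystem, and the ".recovered"
// copy is then renamed into the original location.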
private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
throws IOException {
try {
// create the destination directory if it does not exist
if (!fs.exists(dest.getParent())) {
fs.mkdirs(dest.getParent());
}
// if the destination file already exists for some reason, delete it
fs.delete(dest, false);
if (fs.rename(src, dest)) {
System.err.println("Moved " + src + " to " + dest);
} else {
throw new IOException("Unable to move " + src + " to " + dest);
}
} catch (Exception e) {
throw new IOException("Unable to move " + src + " to " + dest, e);
}
}
private static Path getRecoveryFile(final Path corruptPath) {
return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
}
private static boolean isReadable(final Path corruptPath, final Configuration conf,
final long maxLen) {
try {
OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
return true;
} catch (Exception e) {
// ignore this exception as maxLen is unreadable
return false;
}
}
// search for a byte pattern in another byte array, starting at the given index
private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
index > data.length || index < 0) {
return -1;
}
// restart the comparison at every position so that overlapping partial matches
// (e.g. "ORC" inside "ORORC") are found; a single match counter that merely
// resets to zero on mismatch would skip them
for (int i = index; i <= data.length - pattern.length; i++) {
int j = 0;
while (j < pattern.length && data[i + j] == pattern[j]) {
j++;
}
if (j == pattern.length) {
return i;
}
}
return -1;
}
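// Worked example: with data holding the UTF-8 bytes of "xxORCyy" and pattern holding "ORC",
// indexOf(data, pattern, 0) returns 2; in recoverFiles above the candidate footer offset is
// then computed as startPos + index + magicBytes.length + 1.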
private static String getFormattedBloomFilters(int col, OrcIndex index,
OrcFile.WriterVersion version,
TypeDescription.Category type,
OrcProto.ColumnEncoding encoding) {
OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
StringBuilder buf = new StringBuilder();
BloomFilter stripeLevelBF = null;
if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
int idx = 0;
buf.append("\n Bloom filters for column ").append(col).append(":");
for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
BloomFilter toMerge = BloomFilterIO.deserialize(
index.getBloomFilterKinds()[col], encoding, version, type, bf);
buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
if (stripeLevelBF == null) {
stripeLevelBF = toMerge;
} else {
stripeLevelBF.merge(toMerge);
}
}
String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
buf.append("\n Stripe level merge:").append(bloomFilterStats);
}
return buf.toString();
}
private static String getBloomFilterStats(BloomFilter bf) {
StringBuilder sb = new StringBuilder();
int bitCount = bf.getBitSize();
int popCount = 0;
for (long l : bf.getBitSet()) {
popCount += Long.bitCount(l);
}
int k = bf.getNumHashFunctions();
float loadFactor = (float) popCount / (float) bitCount;
float expectedFpp = (float) Math.pow(loadFactor, k);
DecimalFormat df = new DecimalFormat("###.####");
sb.append(" numHashFunctions: ").append(k);
sb.append(" bitCount: ").append(bitCount);
sb.append(" popCount: ").append(popCount);
sb.append(" loadFactor: ").append(df.format(loadFactor));
sb.append(" expectedFpp: ").append(expectedFpp);
return sb.toString();
}
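// Worked example of the arithmetic above (made-up numbers): with bitCount = 64, popCount = 32
// and k = 3 hash functions, loadFactor = 32.0 / 64 = 0.5 and expectedFpp = 0.5^3 = 0.125.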
private static String getFormattedRowIndices(int col,
OrcProto.RowIndex[] rowGroupIndex,
TypeDescription schema) {
StringBuilder buf = new StringBuilder();
OrcProto.RowIndex index;
buf.append(" Row group indices for column ").append(col).append(":");
if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
((index = rowGroupIndex[col]) == null)) {
buf.append(" not found\n");
return buf.toString();
}
TypeDescription colSchema = schema.findSubtype(col);
for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
buf.append("\n Entry ").append(entryIx).append(": ");
OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
if (entry == null) {
buf.append("unknown\n");
continue;
}
OrcProto.ColumnStatistics colStats = entry.getStatistics();
if (colStats == null) {
buf.append("no stats at ");
} else {
ColumnStatistics cs =
ColumnStatisticsImpl.deserialize(colSchema, colStats);
buf.append(cs.toString());
}
buf.append(" positions: ");
for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
if (posIx != 0) {
buf.append(",");
}
buf.append(entry.getPositions(posIx));
}
}
return buf.toString();
}
public static long getTotalPaddingSize(Reader reader) throws IOException {
long paddedBytes = 0;
List<StripeInformation> stripes = reader.getStripes();
for (int i = 1; i < stripes.size(); i++) {
long prevStripeOffset = stripes.get(i - 1).getOffset();
long prevStripeLen = stripes.get(i - 1).getLength();
paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
}
return paddedBytes;
}
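// Worked example (made-up stripe layout): with stripe 0 at offset 3 and length 100 (ending at
// offset 103) and stripe 1 at offset 128, the loop above counts 128 - 103 = 25 padded bytes.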
@SuppressWarnings("static-access")
static Options createOptions() {
Options result = new Options();
// add -d and --data to print the rows
result.addOption(OptionBuilder
.withLongOpt("data")
.withDescription("Should the data be printed")
.create('d'));
// to avoid breaking unit tests (when run in different time zones) for file dump, printing
// of timezone is made optional
result.addOption(OptionBuilder
.withLongOpt("timezone")
.withDescription("Print writer's time zone")
.create('t'));
result.addOption(OptionBuilder
.withLongOpt("help")
.withDescription("print help message")
.create('h'));
result.addOption(OptionBuilder
.withLongOpt("rowindex")
.withArgName("comma separated list of column ids for which row index should be printed")
.withDescription("Dump stats for column number(s)")
.hasArg()
.create('r'));
result.addOption(OptionBuilder
.withLongOpt("json")
.withDescription("Print metadata in JSON format")
.create('j'));
result.addOption(OptionBuilder
.withLongOpt("pretty")
.withDescription("Pretty print json metadata output")
.create('p'));
result.addOption(OptionBuilder
.withLongOpt("recover")
.withDescription("recover corrupted orc files generated by streaming")
.create());
result.addOption(OptionBuilder
.withLongOpt("skip-dump")
.withDescription("used along with --recover to directly recover files without dumping")
.create());
result.addOption(OptionBuilder
.withLongOpt("backup-path")
.withDescription("specify a backup path to store the corrupted files (default: /tmp)")
.hasArg()
.create());
return result;
}
}