com.klarna.hiverunner.data.TsvFileParser Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of hiverunner Show documentation

HiveRunner is a unit test framework based on JUnit (4 or 5) that enables test-driven development (TDD) of Hive SQL without requiring any installed dependencies.

There is a newer version: 6.0.1

Show newest version

package com.klarna.hiverunner.data;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang3.ObjectUtils;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

import com.google.common.base.Splitter;

/**
 * A {@link FileParser} for parsing data out of a TSV file.
 */
public class TsvFileParser implements FileParser {

  /** Delimiter the no-argument constructor configures: a single tab character. */
  private static final String DEFAULT_DELIMITER = "\t";
  /** Null value the no-argument constructor configures: the empty string. */
  private static final String DEFAULT_NULL_VALUE = "";

  /** Splitter built from the configured delimiter; replaced by {@code withDelimiter}. */
  private Splitter splitter;
  /** Value configured through {@code withNullValue}; initialised to {@code DEFAULT_NULL_VALUE}. */
  private Object nullValue;
  /** Charset passed to {@code Files.readAllLines} in {@code parse}; replaced by {@code withCharset}. */
  private Charset charset;

  /**
   * Creates a parser configured with the defaults: a tab delimiter, an empty string as the null value and UTF-8 as the
   * charset.
   */
  public TsvFileParser() {
    // Assign the fields directly instead of going through the public, overridable with*() methods: an override in a
    // subclass would otherwise be invoked before the subclass has finished constructing.
    this.splitter = Splitter.on(DEFAULT_DELIMITER);
    this.nullValue = DEFAULT_NULL_VALUE;
    this.charset = StandardCharsets.UTF_8;
  }

  /**
   * Replaces the splitter with one that splits on the given delimiter. A tab is used unless this is called.
   *
   * @param delimiter the separator handed to {@link Splitter#on(String)}
   * @return this parser, so that configuration calls can be chained
   */
  public TsvFileParser withDelimiter(final String delimiter) {
    final Splitter delimiterSplitter = Splitter.on(delimiter);
    this.splitter = delimiterSplitter;
    return this;
  }

  /**
   * Sets the value that stands for null in the parsed data: a column whose value equals it is replaced with null. An
   * empty string is used unless this is called.
   *
   * @param nullValue the value to treat as null
   * @return this parser, so that configuration calls can be chained
   */
  public TsvFileParser withNullValue(final Object nullValue) {
    final TsvFileParser self = this;
    self.nullValue = nullValue;
    return self;
  }

  /**
   * Use the provided {@link Charset} when reading the file. The default is UTF-8.
   *
   * @param charset the charset to read the file with; must not be null
   * @return this parser, so that configuration calls can be chained
   * @throws NullPointerException if {@code charset} is null
   */
  public TsvFileParser withCharset(Charset charset) {
    // Fail fast: a null charset would otherwise only surface later, when parse() hands it to Files.readAllLines.
    if (charset == null) {
      throw new NullPointerException("charset");
    }
    this.charset = charset;
    return this;
  }

  /**
   * Reads all lines of the given file using the configured charset and converts each line into a row via
   * {@code parseRow}, passing {@code names.size()} as the row size.
   *
   * <p>NOTE(review): the signature appears here with raw {@code List} types; confirm the intended element types
   * against {@link FileParser} before tightening it.
   *
   * @param file the file to read
   * @param schema not used by this method
   * @param names the column names; only their count is used here
   * @return one entry per line of the file, in file order
   * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read
   */
  @Override
  public List parse(File file, HCatSchema schema, List names) {
    try {
      // Typed as List<String> so that the for-each over String below type-checks.
      List<String> lines = Files.readAllLines(file.toPath(), charset);
      List<Object[]> records = new ArrayList<>(lines.size());
      for (String line : lines) {
        records.add(parseRow(line, names.size()));
      }
      return records;
    } catch (IOException e) {
      // Callers are not forced to handle the checked exception; the cause is preserved.
      throw new RuntimeException("Error while reading file", e);
    }
  }

  private Object[] parseRow(String line, int size) {
    List