All downloads are free. Search and download functionality uses the official Maven repository.

com.klarna.hiverunner.data.TsvFileParser Maven / Gradle / Ivy

Go to download

HiveRunner is a unit-test framework based on JUnit (4 or 5) that enables test-driven development of Hive SQL without requiring any installed dependencies.

There is a newer version: 6.0.1
Show newest version
package com.klarna.hiverunner.data;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.lang3.ObjectUtils;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

import com.google.common.base.Splitter;

/**
 * A {@link FileParser} for parsing data out of a TSV file.
 * <p>
 * Every line of the file becomes one row. A line is split on the configured delimiter and the first
 * {@code names.size()} columns are kept; any further columns on the line are ignored. A column that equals the
 * configured null value is replaced with {@code null}.
 * <p>
 * The {@code with...} methods modify this instance and return it, so calls can be chained.
 */
public class TsvFileParser implements FileParser {

  private static final String DEFAULT_DELIMITER = "\t";
  private static final String DEFAULT_NULL_VALUE = "";

  private Splitter splitter;
  private Object nullValue;
  private Charset charset;

  /**
   * Creates a parser that splits on tabs, treats the empty string as the null value and reads files as UTF-8.
   */
  public TsvFileParser() {
    // Assign the fields directly instead of calling the overridable with...() methods, so that a subclass override
    // is never invoked on a partially constructed instance.
    splitter = Splitter.on(DEFAULT_DELIMITER);
    nullValue = DEFAULT_NULL_VALUE;
    charset = StandardCharsets.UTF_8;
  }

  /**
   * Use the provided delimiter. The default is a tab.
   *
   * @param delimiter the separator, handed to {@link Splitter#on(String)}
   * @return this parser
   */
  public TsvFileParser withDelimiter(String delimiter) {
    splitter = Splitter.on(delimiter);
    return this;
  }

  /**
   * Use the provided null value. When a column's value equals the null value it will be replaced with null. The default
   * is an empty string.
   * <p>
   * The check is {@code Objects.equals(nullValue, column)}, where {@code column} is the {@code String} text of the
   * column.
   *
   * @param nullValue the value that marks a column as null
   * @return this parser
   */
  public TsvFileParser withNullValue(Object nullValue) {
    this.nullValue = nullValue;
    return this;
  }

  /**
   * Use the provided {@link Charset}. The default is UTF-8.
   *
   * @param charset the charset used to decode the file
   * @return this parser
   * @throws NullPointerException if {@code charset} is null
   */
  public TsvFileParser withCharset(Charset charset) {
    // Fail here rather than later inside parse(), where a null charset would surface as a less obvious error.
    this.charset = Objects.requireNonNull(charset, "charset");
    return this;
  }

  /**
   * Reads every line of {@code file} with the configured charset and converts each line into a row.
   *
   * @param file the file to read
   * @param schema not used by this parser
   * @param names the column names; only their count is used, to decide how many columns each row must have
   * @return one {@code Object[]} of length {@code names.size()} per line, in file order
   * @throws RuntimeException if the file cannot be read; the {@link IOException} is attached as the cause
   * @throws IllegalStateException if a line has fewer columns than {@code names.size()}
   */
  @Override
  public List<Object[]> parse(File file, HCatSchema schema, List<String> names) {
    try {
      List<String> lines = Files.readAllLines(file.toPath(), charset);
      List<Object[]> records = new ArrayList<>(lines.size());
      for (String line : lines) {
        records.add(parseRow(line, names.size()));
      }
      return records;
    } catch (IOException e) {
      throw new RuntimeException("Error while reading file", e);
    }
  }

  /**
   * Splits one line into exactly {@code size} column values, mapping the null value to {@code null}.
   *
   * @param line the raw line
   * @param size the number of columns required
   * @return an array of length {@code size}
   * @throws IllegalStateException if the line yields fewer than {@code size} columns
   */
  private Object[] parseRow(String line, int size) {
    Object[] row = new Object[size];
    Iterator<String> columns = splitter.split(line).iterator();

    for (int i = 0; i < size; i++) {
      if (!columns.hasNext()) {
        throw new IllegalStateException("Not enough columns. Require " + size + " columns, got " + i);
      }
      String column = columns.next();
      row[i] = Objects.equals(nullValue, column) ? null : column;
    }

    // Columns beyond 'size' are deliberately left unread.
    return row;
  }
}