All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.parquet.cli.csv.AvroCSV Maven / Gradle / Ivy

There is a newer version: 1.15.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli.csv;

import static java.lang.Math.min;

import au.com.bytecode.opencsv.CSVParser;
import au.com.bytecode.opencsv.CSVReader;
import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

/**
 * Static helpers for reading CSV data and inferring an Avro {@link Schema} from a sample of
 * its rows.
 *
 * <p>At most {@code DEFAULT_INFER_LINES} data rows are inspected. For every column, the first
 * non-empty value seen decides the column type: {@code long}, {@code double}, {@code float},
 * or {@code string}. Columns for which no non-empty value was seen become nullable strings.
 */
public class AvroCSV {

  private static final Pattern LONG = Pattern.compile("\\d+");
  // At least one digit is required next to the decimal point, so that a lone "." (or ".d" /
  // ".f") is inferred as a string instead of a number that could never be parsed.
  private static final Pattern DOUBLE = Pattern.compile("(?:\\d+\\.\\d*|\\.\\d+)[dD]?");
  private static final Pattern FLOAT = Pattern.compile("(?:\\d+\\.\\d*|\\.\\d+)[fF]?");
  private static final int DEFAULT_INFER_LINES = 25;
  private static final Set<String> NO_REQUIRED_FIELDS = ImmutableSet.of();
  // As per the Avro spec (http://avro.apache.org/docs/1.7.5/spec.html), a name must start
  // with [A-Za-z_] and subsequently contain only [A-Za-z0-9_].
  private static final Pattern AVRO_COMPATIBLE = Pattern.compile("^[A-Za-z_][A-Za-z\\d_]*$");

  /**
   * Creates a {@link CSVReader} over {@code incoming}, configured from {@code props}.
   *
   * <p>Only the first character of the delimiter, quote and escape settings is used. The
   * stream is decoded with {@code props.charset} and {@code props.linesToSkip} is passed to
   * the reader as the number of leading lines to skip.
   *
   * @param incoming the raw CSV bytes
   * @param props    the CSV settings
   * @return a new reader wrapping {@code incoming}
   */
  static CSVReader newReader(InputStream incoming, CSVProperties props) {
    return new CSVReader(
        new InputStreamReader(incoming, Charset.forName(props.charset)),
        props.delimiter.charAt(0),
        props.quote.charAt(0),
        props.escape.charAt(0),
        props.linesToSkip,
        false /* strict quotes off: don't ignore unquoted strings */,
        true /* ignore leading white-space */);
  }

  /**
   * Creates a {@link CSVParser} using the same delimiter, quote and escape settings as
   * {@link #newReader(InputStream, CSVProperties)}.
   *
   * @param props the CSV settings
   * @return a new single-line parser
   */
  static CSVParser newParser(CSVProperties props) {
    return new CSVParser(
        props.delimiter.charAt(0),
        props.quote.charAt(0),
        props.escape.charAt(0),
        false /* strict quotes off: don't ignore unquoted strings */,
        true /* ignore leading white-space */);
  }

  /**
   * Infers a record schema in which every field is nullable.
   *
   * @param name     the name of the resulting record schema
   * @param incoming the CSV data to sample
   * @param props    the CSV settings
   * @return the inferred record schema
   * @throws IOException if the CSV data cannot be read
   */
  public static Schema inferNullableSchema(String name, InputStream incoming, CSVProperties props)
      throws IOException {
    return inferSchemaInternal(name, incoming, props, NO_REQUIRED_FIELDS, true);
  }

  /**
   * Infers a record schema in which every field is nullable, except those listed in
   * {@code requiredFields}.
   *
   * @param name           the name of the resulting record schema
   * @param incoming       the CSV data to sample
   * @param props          the CSV settings
   * @param requiredFields names of fields that must not be nullable
   * @return the inferred record schema
   * @throws IOException      if the CSV data cannot be read
   * @throws RuntimeException if a header name is invalid or a required field has a null value
   *                          in the sampled rows
   */
  public static Schema inferNullableSchema(
      String name, InputStream incoming, CSVProperties props, Set<String> requiredFields)
      throws IOException {
    return inferSchemaInternal(name, incoming, props, requiredFields, true);
  }

  /**
   * Infers a record schema in which a field is nullable only if a null value was found for it
   * in the sampled rows.
   *
   * @param name     the name of the resulting record schema
   * @param incoming the CSV data to sample
   * @param props    the CSV settings
   * @return the inferred record schema
   * @throws IOException if the CSV data cannot be read
   */
  public static Schema inferSchema(String name, InputStream incoming, CSVProperties props)
      throws IOException {
    return inferSchemaInternal(name, incoming, props, NO_REQUIRED_FIELDS, false);
  }

  /**
   * Infers a record schema in which a field is nullable only if a null value was found for it
   * in the sampled rows; fields listed in {@code requiredFields} are never nullable.
   *
   * @param name           the name of the resulting record schema
   * @param incoming       the CSV data to sample
   * @param props          the CSV settings
   * @param requiredFields names of fields that must not be nullable
   * @return the inferred record schema
   * @throws IOException      if the CSV data cannot be read
   * @throws RuntimeException if a header name is invalid or a required field has a null value
   *                          in the sampled rows
   */
  public static Schema inferSchema(
      String name, InputStream incoming, CSVProperties props, Set<String> requiredFields)
      throws IOException {
    return inferSchemaInternal(name, incoming, props, requiredFields, false);
  }

  /**
   * Shared implementation behind the public {@code infer*Schema} methods.
   *
   * <p>The header comes from the first row when {@code props.useHeader} is set, otherwise
   * from {@code props.header} when that is non-null, otherwise names of the form
   * {@code field_<index>} are generated from the width of the first row.
   *
   * @param name           the name of the resulting record schema
   * @param incoming       the CSV data to sample
   * @param props          the CSV settings
   * @param requiredFields names of fields that must not be nullable
   * @param makeNullable   if {@code true}, every non-required field is made nullable
   * @return the inferred record schema
   * @throws IOException          if the CSV data cannot be read
   * @throws NullPointerException if there is no data row to infer from
   * @throws RuntimeException     if a header name is null, empty or not a valid Avro name, or
   *                              if a required field has a null value in the sampled rows
   */
  private static Schema inferSchemaInternal(
      String name,
      InputStream incoming,
      CSVProperties props,
      Set<String> requiredFields,
      boolean makeNullable)
      throws IOException {
    // The reader is intentionally not closed here: closing it would also close the
    // caller-supplied stream.
    CSVReader reader = newReader(incoming, props);

    String[] header;
    String[] line;
    if (props.useHeader) {
      // read the header and then the first line
      header = reader.readNext();
      line = reader.readNext();
      Objects.requireNonNull(line, "No content to infer schema");
    } else if (props.header != null) {
      header = newParser(props).parseLine(props.header);
      line = reader.readNext();
      Objects.requireNonNull(line, "No content to infer schema");
    } else {
      // use the first line to create a header
      line = reader.readNext();
      Objects.requireNonNull(line, "No content to infer schema");
      header = new String[line.length];
      for (int i = 0; i < line.length; i += 1) {
        header[i] = "field_" + i;
      }
    }

    // Per-column state gathered while sampling rows.
    Schema.Type[] types = new Schema.Type[header.length]; // first type inferred, or null
    String[] values = new String[header.length]; // the value that type was inferred from
    boolean[] nullable = new boolean[header.length]; // a null or missing value was seen
    boolean[] empty = new boolean[header.length]; // an empty string was seen

    for (int processed = 0; processed < DEFAULT_INFER_LINES; processed += 1) {
      if (line == null) {
        break;
      }

      for (int i = 0; i < header.length; i += 1) {
        if (i < line.length) {
          if (types[i] == null) {
            types[i] = inferFieldType(line[i]);
            if (types[i] != null) {
              // keep track of the value used
              values[i] = line[i];
            }
          }

          if (line[i] == null) {
            nullable[i] = true;
          } else if (line[i].isEmpty()) {
            empty[i] = true;
          }
        } else {
          // no value results in null
          nullable[i] = true;
        }
      }

      line = reader.readNext();
    }

    SchemaBuilder.FieldAssembler<Schema> fieldAssembler =
        SchemaBuilder.record(name).fields();

    // types may be missing, but schema(...) will return a nullable string for those
    for (int i = 0; i < header.length; i += 1) {
      if (header[i] == null) {
        throw new RuntimeException("Bad header for field " + i + ": null");
      }

      String fieldName = header[i].trim();

      if (fieldName.isEmpty()) {
        throw new RuntimeException("Bad header for field " + i + ": \"" + fieldName + "\"");
      } else if (!isAvroCompatibleName(fieldName)) {
        throw new RuntimeException("Bad header for field, should start with a character "
            + "or _ and can contain only alphanumerics and _ "
            + i
            + ": \"" + fieldName + "\"");
      }

      // the empty string is not considered null for string fields
      boolean foundNull = (nullable[i] || (empty[i] && types[i] != Schema.Type.STRING));

      if (requiredFields.contains(fieldName)) {
        if (foundNull) {
          throw new RuntimeException(
              "Found null value for required field: " + fieldName + " (" + types[i] + ")");
        }
        fieldAssembler = fieldAssembler
            .name(fieldName)
            .doc("Type inferred from '" + sample(values[i]) + "'")
            .type(schema(types[i], false))
            .noDefault();
      } else {
        SchemaBuilder.GenericDefault<Schema> defaultBuilder = fieldAssembler
            .name(fieldName)
            .doc("Type inferred from '" + sample(values[i]) + "'")
            .type(schema(types[i], makeNullable || foundNull));
        if (makeNullable || foundNull) {
          // nullable fields get an explicit null default
          fieldAssembler = defaultBuilder.withDefault(null);
        } else {
          fieldAssembler = defaultBuilder.noDefault();
        }
      }
    }
    return fieldAssembler.endRecord();
  }

  // Everything outside the printable ASCII range (space through '~').
  private static final CharMatcher NON_PRINTABLE =
      CharMatcher.inRange('\u0020', '\u007e').negate();

  /**
   * Produces a short, printable rendering of a value for use in field documentation.
   *
   * @param value the value a type was inferred from, may be {@code null}
   * @return at most the first 50 characters of {@code value} with every non-printable
   *         character replaced by {@code '.'}, or the string {@code "null"} if {@code value}
   *         is {@code null}
   */
  private static String sample(String value) {
    if (value != null) {
      return NON_PRINTABLE.replaceFrom(value.subSequence(0, min(50, value.length())), '.');
    } else {
      return "null";
    }
  }

  /**
   * Create a {@link Schema} for the given type. If the type is null,
   * the schema will be a nullable String. If makeNullable is true, the returned
   * schema will be nullable.
   *
   * @param type         a {@link Schema.Type} compatible with {@code Schema.create}
   * @param makeNullable If {@code true}, the return type will be nullable
   * @return a {@code Schema} for the given {@code Schema.Type}
   * @see Schema#create(org.apache.avro.Schema.Type)
   */
  private static Schema schema(Schema.Type type, boolean makeNullable) {
    Schema schema = Schema.create(type == null ? Schema.Type.STRING : type);
    if (makeNullable || type == null) {
      // null is listed first so that a null default is valid for the union
      schema = Schema.createUnion(Lists.newArrayList(Schema.create(Schema.Type.NULL), schema));
    }
    return schema;
  }

  /**
   * Infers an Avro type from a single example value.
   *
   * @param example a raw CSV value, may be {@code null}
   * @return {@code null} if the example is null or empty (not enough information);
   *         otherwise {@code LONG} for digits only, {@code DOUBLE} for a decimal number with
   *         an optional {@code d}/{@code D} suffix, {@code FLOAT} for a decimal number with an
   *         {@code f}/{@code F} suffix, and {@code STRING} for anything else
   */
  private static Schema.Type inferFieldType(String example) {
    if (example == null || example.isEmpty()) {
      return null; // not enough information
    } else if (LONG.matcher(example).matches()) {
      return Schema.Type.LONG;
    } else if (DOUBLE.matcher(example).matches()) {
      return Schema.Type.DOUBLE;
    } else if (FLOAT.matcher(example).matches()) {
      return Schema.Type.FLOAT;
    }
    return Schema.Type.STRING;
  }

  /**
   * Returns true if the name does not contain characters that are known to be
   * incompatible with the specs defined in Avro schema.
   *
   * @param name a String field name to check
   * @return true if the name is Avro compatible, false if not
   */
  private static boolean isAvroCompatibleName(String name) {
    return AVRO_COMPATIBLE.matcher(name).matches();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy