/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli.util;

import com.fasterxml.jackson.databind.node.NullNode;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.cli.json.AvroJson;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class Schemas {

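  /**
   * Parses an Avro schema from a JSON ({@code .avsc}) stream.
   * <p>
   * Illustrative usage (the file name is hypothetical):
   * <pre>{@code
   * try (InputStream in = new FileInputStream("user.avsc")) {
   *   Schema schema = Schemas.fromAvsc(in);
   * }
   * }</pre>
   *
   * @param in an InputStream containing an Avro schema as JSON
   * @return the parsed {@link Schema}
   * @throws IOException if the stream cannot be read
   */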
  public static Schema fromAvsc(InputStream in) throws IOException {
    // the parser has state, so use a new one each time
    return new Schema.Parser().parse(in);
  }

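  /**
   * Reads the writer schema from the header of an Avro data file stream.
   * <p>
   * Illustrative usage (the file name is hypothetical):
   * <pre>{@code
   * try (InputStream in = new FileInputStream("records.avro")) {
   *   Schema schema = Schemas.fromAvro(in);
   * }
   * }</pre>
   *
   * @param in an InputStream positioned at the start of an Avro data file
   * @return the {@link Schema} stored in the file header
   * @throws IOException if the stream cannot be read as an Avro data file
   */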
  public static Schema fromAvro(InputStream in) throws IOException {
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    DataFileStream<GenericRecord> stream = null;
    boolean threw = true;

    try {
      stream = new DataFileStream<>(in, datumReader);
      Schema schema = stream.getSchema();
      threw = false;
      return schema;
    } finally {
      Closeables.close(stream, threw);
    }
  }

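  /**
   * Returns an Avro schema for a Parquet file.
   * <p>
   * If the footer metadata contains a schema under the {@code parquet.avro.schema}
   * key (or the older {@code avro.schema} key), that schema is parsed; otherwise
   * the file's Parquet schema is converted with {@link AvroSchemaConverter}.
   * <p>
   * Illustrative usage (the path is hypothetical):
   * <pre>{@code
   * Configuration conf = new Configuration();
   * Schema schema = Schemas.fromParquet(conf, URI.create("hdfs:///data/events.parquet"));
   * }</pre>
   *
   * @param conf a Hadoop Configuration used to resolve the file system
   * @param location the URI of a Parquet file
   * @return an Avro {@link Schema} for the file
   * @throws IOException if the file footer cannot be read
   */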
  public static Schema fromParquet(Configuration conf, URI location) throws IOException {
    Path path = new Path(location);
    FileSystem fs = path.getFileSystem(conf);

    ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

    String schemaString = footer.getFileMetaData().getKeyValueMetaData().get("parquet.avro.schema");
    if (schemaString == null) {
      // try the older property
      schemaString = footer.getFileMetaData().getKeyValueMetaData().get("avro.schema");
    }

    if (schemaString != null) {
      return new Schema.Parser().parse(schemaString);
    } else {
      return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
    }
  }

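  /**
   * Infers an Avro schema from a stream of JSON records by delegating to
   * {@link AvroJson#inferSchema}, sampling a bounded number of leading
   * records (20 in this call).
   * <p>
   * Illustrative usage (the record name and file are hypothetical):
   * <pre>{@code
   * try (InputStream in = new FileInputStream("events.json")) {
   *   Schema schema = Schemas.fromJSON("Event", in);
   * }
   * }</pre>
   *
   * @param name the record name to use for the inferred schema
   * @param in an InputStream of JSON records
   * @return an inferred {@link Schema}
   * @throws IOException if the stream cannot be read
   */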
  public static Schema fromJSON(String name, InputStream in) throws IOException {
    return AvroJson.inferSchema(in, name, 20);
  }

  /**
   * Returns whether null is allowed by the schema.
   *
   * @param schema a Schema
   * @return true if schema allows the value to be null
   */
  public static boolean nullOk(Schema schema) {
    if (Schema.Type.NULL == schema.getType()) {
      return true;
    } else if (Schema.Type.UNION == schema.getType()) {
      for (Schema possible : schema.getTypes()) {
        if (nullOk(possible)) {
          return true;
        }
      }
    }
    return false;
  }
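  // nullOk example: it returns true for a union such as
  // Schema.createUnion(ImmutableList.of(Schema.create(Schema.Type.NULL),
  // Schema.create(Schema.Type.STRING))), and false for a plain STRING schema.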

  /**
   * Merges {@link Schema} instances if they are compatible.
   * <p>
   * Schemas are incompatible if:
   * <ul>
   *   <li>The {@link Schema.Type} does not match.</li>
   *   <li>For record schemas, the record name does not match</li>
   *   <li>For enum schemas, the enum name does not match</li>
   * </ul>
   * <p>
   * Map value, array element, and record field types will use unions if
   * necessary, and union schemas are merged recursively.
   *
   * @param schemas a set of {@code Schema} instances to merge
   * @return a merged {@code Schema}
   * @throws IllegalStateException if the schemas are not compatible
   */
  public static Schema merge(Iterable<Schema> schemas) {
    Iterator<Schema> iter = schemas.iterator();
    if (!iter.hasNext()) {
      return null;
    }
    Schema result = iter.next();
    while (iter.hasNext()) {
      result = merge(result, iter.next());
    }
    return result;
  }

  /**
   * Merges {@link Schema} instances and creates a union of schemas if any are
   * incompatible.
   * <p>
   * Schemas are incompatible if:
   * <ul>
   *   <li>The {@link Schema.Type} does not match.</li>
   *   <li>For record schemas, the record name does not match</li>
   *   <li>For enum schemas, the enum name does not match</li>
   * </ul>
   * <p>
   * Map value, array element, and record field types will use unions if
   * necessary, and union schemas are merged recursively.
   *
   * @param schemas a set of {@code Schema} instances to merge
   * @return a combined {@code Schema}
   */
  public static Schema mergeOrUnion(Iterable<Schema> schemas) {
    Iterator<Schema> iter = schemas.iterator();
    if (!iter.hasNext()) {
      return null;
    }
    Schema result = iter.next();
    while (iter.hasNext()) {
      result = mergeOrUnion(result, iter.next());
    }
    return result;
  }

  /**
   * Merges two {@link Schema} instances if they are compatible.
   * <p>
   * Two schemas are incompatible if:
   * <ul>
   *   <li>The {@link Schema.Type} does not match.</li>
   *   <li>For record schemas, the record name does not match</li>
   *   <li>For enum schemas, the enum name does not match</li>
   * </ul>
   * <p>
   * Map value and array element types will use unions if necessary, and union
   * schemas are merged recursively.
   *
   * @param left a {@code Schema}
   * @param right a {@code Schema}
   * @return a merged {@code Schema}
   * @throws IllegalStateException if the schemas are not compatible
   */
  public static Schema merge(Schema left, Schema right) {
    Schema merged = mergeOnly(left, right);
    Preconditions.checkState(merged != null, "Cannot merge %s and %s", left, right);
    return merged;
  }

  /**
   * Merges two {@link Schema} instances or returns {@code null}.
   * <p>
   * The two schemas are merged if they are the same type. Records are merged
   * if the two records have the same name or have no names but have a
   * significant number of shared fields.
   *
   * @param left a {@code Schema}
   * @param right a {@code Schema}
   * @return a {@code Schema} for both types
   * @see {@link #mergeOrUnion} to return a union when a merge is not possible.
   */
  private static Schema mergeOrUnion(Schema left, Schema right) {
    Schema merged = mergeOnly(left, right);
    if (merged != null) {
      return merged;
    }
    return union(left, right);
  }

  /**
   * Creates a union of two {@link Schema} instances.
   * <p>
   * If either {@code Schema} is a union, this will attempt to merge the other
   * schema with the types contained in that union before adding more types to
   * the union that is produced.
   * <p>
   * If both schemas are not unions, no merge is attempted.
   *
   * @param left a {@code Schema}
   * @param right a {@code Schema}
   * @return a UNION schema of the two {@code Schema} instances
   */
  private static Schema union(Schema left, Schema right) {
    if (left.getType() == Schema.Type.UNION) {
      if (right.getType() == Schema.Type.UNION) {
        // combine the unions by adding each type in right individually
        Schema combined = left;
        for (Schema type : right.getTypes()) {
          combined = union(combined, type);
        }
        return combined;

      } else {
        boolean notMerged = true;
        // combine a union with a non-union by checking if each type will merge
        List<Schema> types = Lists.newArrayList();
        Iterator<Schema> schemas = left.getTypes().iterator();
        // try to merge each type and stop when one succeeds
        while (schemas.hasNext()) {
          Schema next = schemas.next();
          Schema merged = mergeOnly(next, right);
          if (merged != null) {
            types.add(merged);
            notMerged = false;
            break;
          } else {
            // merge didn't work, add the type
            types.add(next);
          }
        }
        // add the remaining types from the left union
        while (schemas.hasNext()) {
          types.add(schemas.next());
        }

        if (notMerged) {
          types.add(right);
        }

        return Schema.createUnion(types);
      }
    } else if (right.getType() == Schema.Type.UNION) {
      return union(right, left);
    }

    return Schema.createUnion(ImmutableList.of(left, right));
  }

  /**
   * Merges two {@link Schema} instances or returns {@code null}.
   * <p>
   * The two schemas are merged if they are the same type. Records are merged
   * if the two records have the same name or have no names but have a
   * significant number of shared fields.
   *
   * @param left a {@code Schema}
   * @param right a {@code Schema}
   * @return a merged {@code Schema} or {@code null} if merging is not possible
   * @see {@link #mergeOrUnion} to return a union when a merge is not possible.
   */
  private static Schema mergeOnly(Schema left, Schema right) {
    if (Objects.equal(left, right)) {
      return left;
    }

    // handle primitive type promotion; doesn't promote integers to floats
    switch (left.getType()) {
      case INT:
        if (right.getType() == Schema.Type.LONG) {
          return right;
        }
        break;
      case LONG:
        if (right.getType() == Schema.Type.INT) {
          return left;
        }
        break;
      case FLOAT:
        if (right.getType() == Schema.Type.DOUBLE) {
          return right;
        }
        break;
      case DOUBLE:
        if (right.getType() == Schema.Type.FLOAT) {
          return left;
        }
    }

    // any other cases where the types don't match must be combined by a union
    if (left.getType() != right.getType()) {
      return null;
    }

    switch (left.getType()) {
      case UNION:
        return union(left, right);
      case RECORD:
        if (left.getName() == null
            && right.getName() == null
            && fieldSimilarity(left, right) < SIMILARITY_THRESH) {
          return null;
        } else if (!Objects.equal(left.getName(), right.getName())) {
          return null;
        }

        Schema combinedRecord = Schema.createRecord(
            coalesce(left.getName(), right.getName()),
            coalesce(left.getDoc(), right.getDoc()),
            coalesce(left.getNamespace(), right.getNamespace()),
            false);
        combinedRecord.setFields(mergeFields(left, right));

        return combinedRecord;

      case MAP:
        return Schema.createMap(mergeOrUnion(left.getValueType(), right.getValueType()));

      case ARRAY:
        return Schema.createArray(mergeOrUnion(left.getElementType(), right.getElementType()));

      case ENUM:
        if (!Objects.equal(left.getName(), right.getName())) {
          return null;
        }
        Set<String> symbols = Sets.newLinkedHashSet();
        symbols.addAll(left.getEnumSymbols());
        symbols.addAll(right.getEnumSymbols());
        return Schema.createEnum(
            left.getName(),
            coalesce(left.getDoc(), right.getDoc()),
            coalesce(left.getNamespace(), right.getNamespace()),
            ImmutableList.copyOf(symbols));

      default:
        // all primitives are handled before the switch by the equality check.
        // schemas that reach this point are not primitives and also not any of
        // the above known types.
        throw new UnsupportedOperationException("Unknown schema type: " + left.getType());
    }
  }

  private static final Schema NULL = Schema.create(Schema.Type.NULL);
  private static final NullNode NULL_DEFAULT = NullNode.getInstance();

  /**
   * Returns a union {@link Schema} of NULL and the given {@code schema}.
   * <p>
   * A NULL schema is always the first type in the union so that a null default
   * value can be set.
   *
   * @param schema a {@code Schema}
   * @return a union of null and the given schema
   */
  private static Schema nullableForDefault(Schema schema) {
    if (schema.getType() == Schema.Type.NULL) {
      return schema;
    }

    if (schema.getType() != Schema.Type.UNION) {
      return Schema.createUnion(ImmutableList.of(NULL, schema));
    }

    if (schema.getTypes().get(0).getType() == Schema.Type.NULL) {
      return schema;
    }

    List<Schema> types = Lists.newArrayList();
    types.add(NULL);
    for (Schema type : schema.getTypes()) {
      if (type.getType() != Schema.Type.NULL) {
        types.add(type);
      }
    }

    return Schema.createUnion(types);
  }

  private static List<Schema.Field> mergeFields(Schema left, Schema right) {
    List<Schema.Field> fields = Lists.newArrayList();
    for (Schema.Field leftField : left.getFields()) {
      Schema.Field rightField = right.getField(leftField.name());
      if (rightField != null) {
        fields.add(new Schema.Field(
            leftField.name(),
            mergeOrUnion(leftField.schema(), rightField.schema()),
            coalesce(leftField.doc(), rightField.doc()),
            coalesce(leftField.defaultVal(), rightField.defaultVal())));
      } else {
        if (leftField.defaultVal() != null) {
          fields.add(copy(leftField));
        } else {
          fields.add(new Schema.Field(
              leftField.name(),
              nullableForDefault(leftField.schema()),
              leftField.doc(),
              NULL_DEFAULT));
        }
      }
    }

    for (Schema.Field rightField : right.getFields()) {
      if (left.getField(rightField.name()) == null) {
        if (rightField.defaultVal() != null) {
          fields.add(copy(rightField));
        } else {
          fields.add(new Schema.Field(
              rightField.name(),
              nullableForDefault(rightField.schema()),
              rightField.doc(),
              NULL_DEFAULT));
        }
      }
    }

    return fields;
  }

  /**
   * Creates a new field with the same name, schema, doc, and default value as
   * the incoming schema.
   * <p>
   * Fields cannot be used in more than one record (not Immutable?).
   *
   * @param field an Avro schema field
   * @return a copy of the field
   */
  public static Schema.Field copy(Schema.Field field) {
    return new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal());
  }

  private static float fieldSimilarity(Schema left, Schema right) {
    // check whether the unnamed records appear to be the same record
    Set<String> leftNames = names(left.getFields());
    Set<String> rightNames = names(right.getFields());
    int common = Sets.intersection(leftNames, rightNames).size();
    float leftRatio = ((float) common) / ((float) leftNames.size());
    float rightRatio = ((float) common) / ((float) rightNames.size());
    return hmean(leftRatio, rightRatio);
  }

  private static Set<String> names(Collection<Schema.Field> fields) {
    Set<String> names = Sets.newHashSet();
    for (Schema.Field field : fields) {
      names.add(field.name());
    }
    return names;
  }

  private static float SIMILARITY_THRESH = 0.3f;

  private static float hmean(float left, float right) {
    return (2.0f * left * right) / (left + right);
  }

  /**
   * Returns the first non-null object that is passed in.
   */
  @SafeVarargs
  private static <E> E coalesce(E... objects) {
    for (E object : objects) {
      if (object != null) {
        return object;
      }
    }
    return null;
  }
}
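
// Illustrative sketch of the merge behavior (the schemas below are hypothetical):
//
//   Schema a = new Schema.Parser().parse(
//       "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
//           + "{\"name\":\"id\",\"type\":\"long\"}]}");
//   Schema b = new Schema.Parser().parse(
//       "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
//           + "{\"name\":\"id\",\"type\":\"long\"},"
//           + "{\"name\":\"email\",\"type\":\"string\"}]}");
//
//   // Both records are named "User", so their fields are combined; "email" is
//   // missing from the first schema, so it becomes a ["null","string"] union
//   // with a null default in the merged record.
//   Schema merged = Schemas.merge(ImmutableList.of(a, b));
//
//   // Incompatible types are not merged: merge(intSchema, stringSchema) throws
//   // IllegalStateException, while mergeOrUnion(ImmutableList.of(intSchema, stringSchema))
//   // falls back to the union ["int","string"].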