
// org.apache.parquet.cli.util.Schemas (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.cli.util;
import com.fasterxml.jackson.databind.node.NullNode;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.cli.json.AvroJson;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
public class Schemas {
/**
 * Parses an Avro schema from the given stream.
 *
 * @param in a stream positioned at the schema definition
 * @return the parsed {@code Schema}
 * @throws IOException if the schema cannot be read from the stream
 */
public static Schema fromAvsc(InputStream in) throws IOException {
  // a parser carries state between calls, so each parse gets a fresh instance
  Schema.Parser freshParser = new Schema.Parser();
  return freshParser.parse(in);
}
/**
 * Reads the schema reported by an Avro data file stream.
 *
 * <p>The {@code DataFileStream} opened over {@code in} is closed before this
 * method returns, whether or not reading the schema succeeded.
 *
 * @param in a stream containing an Avro data file
 * @return the {@code Schema} returned by the data file stream
 * @throws IOException if the stream cannot be opened, read, or closed
 */
public static Schema fromAvro(InputStream in) throws IOException {
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  // try-with-resources replaces the manual "threw" bookkeeping: a failure while
  // closing is attached as a suppressed exception when the body already failed,
  // and is propagated when the body succeeded.
  try (DataFileStream<GenericRecord> stream = new DataFileStream<>(in, datumReader)) {
    return stream.getSchema();
  }
}
/**
 * Resolves the Avro schema for a Parquet file.
 *
 * <p>The footer key/value metadata is checked for {@code parquet.avro.schema}
 * and then for the older {@code avro.schema} key. When neither is present the
 * Parquet file schema is converted with {@link AvroSchemaConverter}.
 *
 * @param conf a Hadoop {@code Configuration} used to resolve the file system
 * @param location the location of the Parquet file
 * @return the Avro {@code Schema} for the file
 * @throws IOException if the file system or footer cannot be read
 */
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path file = new Path(location);
  Configuration fsConf = file.getFileSystem(conf).getConf();
  ParquetMetadata footer = ParquetFileReader.readFooter(fsConf, file);

  String avroSchema = footer.getFileMetaData().getKeyValueMetaData().get("parquet.avro.schema");
  if (avroSchema == null) {
    // fall back to the older property name
    avroSchema = footer.getFileMetaData().getKeyValueMetaData().get("avro.schema");
  }

  if (avroSchema == null) {
    // no embedded Avro schema: derive one from the Parquet schema
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
  return new Schema.Parser().parse(avroSchema);
}
/**
 * Infers an Avro schema from JSON content by delegating to
 * {@link AvroJson#inferSchema}.
 *
 * @param name the name passed to the schema inference
 * @param in a stream of JSON content
 * @return the inferred {@code Schema}
 * @throws IOException if the stream cannot be read
 */
public static Schema fromJSON(String name, InputStream in) throws IOException {
  // NOTE(review): presumably the number of records sampled for inference;
  // confirm against AvroJson.inferSchema
  final int numRecords = 20;
  return AvroJson.inferSchema(in, name, numRecords);
}
/**
 * Tells whether a value described by the schema may be null.
 *
 * <p>A NULL schema accepts null, and a UNION accepts null when any of its
 * member types does. Every other type rejects null.
 *
 * @param schema a Schema
 * @return true if schema allows the value to be null
 */
public static boolean nullOk(Schema schema) {
  Schema.Type type = schema.getType();
  if (type == Schema.Type.NULL) {
    return true;
  }
  if (type != Schema.Type.UNION) {
    return false;
  }
  // a union is nullable as soon as one branch is
  for (Schema branch : schema.getTypes()) {
    if (nullOk(branch)) {
      return true;
    }
  }
  return false;
}
/**
 * Merges {@link Schema} instances if they are compatible.
 *
 * <p>Schemas are incompatible if:
 * <ul>
 *   <li>The {@link Schema.Type} does not match.</li>
 *   <li>For record schemas, the record name does not match</li>
 *   <li>For enum schemas, the enum name does not match</li>
 * </ul>
 *
 * <p>Map value, array element, and record field types will use unions if
 * necessary, and union schemas are merged recursively.
 *
 * @param schemas a set of {@code Schema} instances to merge
 * @return a merged {@code Schema}, or {@code null} if {@code schemas} is empty
 * @throws IllegalStateException if the schemas are not compatible
 */
public static Schema merge(Iterable<Schema> schemas) {
  Iterator<Schema> iter = schemas.iterator();
  if (!iter.hasNext()) {
    return null;
  }
  // fold the remaining schemas into the first one, left to right
  Schema result = iter.next();
  while (iter.hasNext()) {
    result = merge(result, iter.next());
  }
  return result;
}
/**
 * Merges {@link Schema} instances and creates a union of schemas if any are
 * incompatible.
 *
 * <p>Schemas are incompatible if:
 * <ul>
 *   <li>The {@link Schema.Type} does not match.</li>
 *   <li>For record schemas, the record name does not match</li>
 *   <li>For enum schemas, the enum name does not match</li>
 * </ul>
 *
 * <p>Map value, array element, and record field types will use unions if
 * necessary, and union schemas are merged recursively.
 *
 * @param schemas a set of {@code Schema} instances to merge
 * @return a combined {@code Schema}, or {@code null} if {@code schemas} is empty
 */
public static Schema mergeOrUnion(Iterable<Schema> schemas) {
  Iterator<Schema> iter = schemas.iterator();
  if (!iter.hasNext()) {
    return null;
  }
  // fold the remaining schemas into the first one, left to right
  Schema result = iter.next();
  while (iter.hasNext()) {
    result = mergeOrUnion(result, iter.next());
  }
  return result;
}
/**
 * Merges two {@link Schema} instances, failing when they cannot be merged.
 *
 * <p>Two schemas are incompatible if:
 * <ul>
 *   <li>The {@link Schema.Type} does not match.</li>
 *   <li>For record schemas, the record name does not match</li>
 *   <li>For enum schemas, the enum name does not match</li>
 * </ul>
 *
 * <p>Map value and array element types will use unions if necessary, and
 * union schemas are merged recursively.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a merged {@code Schema}
 * @throws IllegalStateException if the schemas are not compatible
 */
public static Schema merge(Schema left, Schema right) {
  Schema result = mergeOnly(left, right);
  // mergeOnly signals "not mergeable" with null
  boolean compatible = result != null;
  Preconditions.checkState(compatible, "Cannot merge %s and %s", left, right);
  return result;
}
/**
 * Merges two {@link Schema} instances, or combines them into a union when a
 * merge is not possible.
 *
 * <p>The merge is attempted with {@link #mergeOnly(Schema, Schema)}; when that
 * returns {@code null} the result comes from {@link #union(Schema, Schema)}.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a {@code Schema} for both types
 */
private static Schema mergeOrUnion(Schema left, Schema right) {
  Schema merged = mergeOnly(left, right);
  return merged != null ? merged : union(left, right);
}
/**
 * Creates a union of two {@link Schema} instances.
 *
 * <p>If either {@code Schema} is a union, this will attempt to merge the other
 * schema with the types contained in that union before adding more types to
 * the union that is produced.
 *
 * <p>If both schemas are not unions, no merge is attempted.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a UNION schema of the two {@code Schema} instances
 */
private static Schema union(Schema left, Schema right) {
  if (left.getType() == Schema.Type.UNION) {
    if (right.getType() == Schema.Type.UNION) {
      // combine the unions by adding each type in right individually
      Schema combined = left;
      for (Schema type : right.getTypes()) {
        combined = union(combined, type);
      }
      return combined;
    }

    List<Schema> leftTypes = left.getTypes();
    // If the union already contains right there is nothing to add. Checking
    // this first also stops an earlier, narrower member from being promoted
    // into a copy of a later one: for [int, long] + long, int would merge to
    // long and the existing long would then be appended as a duplicate member.
    if (leftTypes.contains(right)) {
      return left;
    }

    boolean notMerged = true;
    // combine a union with a non-union by checking if each type will merge
    List<Schema> types = Lists.newArrayList();
    Iterator<Schema> schemas = leftTypes.iterator();
    // try to merge each type and stop when one succeeds
    while (schemas.hasNext()) {
      Schema next = schemas.next();
      Schema merged = mergeOnly(next, right);
      if (merged != null) {
        types.add(merged);
        notMerged = false;
        break;
      } else {
        // merge didn't work, add the type
        types.add(next);
      }
    }
    // add the remaining types from the left union
    while (schemas.hasNext()) {
      types.add(schemas.next());
    }
    if (notMerged) {
      // nothing in the union absorbed right, so it becomes a new member
      types.add(right);
    }
    return Schema.createUnion(types);
  } else if (right.getType() == Schema.Type.UNION) {
    // only right is a union: reuse the union/non-union branch above
    return union(right, left);
  }
  return Schema.createUnion(ImmutableList.of(left, right));
}
/**
 * Merges two {@link Schema} instances or returns {@code null}.
 *
 * <p>Equal schemas are returned as-is. INT/LONG and FLOAT/DOUBLE pairs are
 * promoted to the wider type. Otherwise the two schemas are merged only if
 * they are the same type. Records are merged if the two records have the same
 * name, or have no names but reach the field similarity threshold.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a merged {@code Schema} or {@code null} if merging is not possible
 * @throws UnsupportedOperationException if both schemas share a type that is
 *     not handled here
 * @see #mergeOrUnion(Schema, Schema)
 */
private static Schema mergeOnly(Schema left, Schema right) {
  if (Objects.equal(left, right)) {
    return left;
  }

  // handle primitive type promotion; doesn't promote integers to floats
  switch (left.getType()) {
    case INT:
      if (right.getType() == Schema.Type.LONG) {
        return right;
      }
      break;
    case LONG:
      if (right.getType() == Schema.Type.INT) {
        return left;
      }
      break;
    case FLOAT:
      if (right.getType() == Schema.Type.DOUBLE) {
        return right;
      }
      break;
    case DOUBLE:
      if (right.getType() == Schema.Type.FLOAT) {
        return left;
      }
      break;
    default:
      // no promotion applies to the other types
      break;
  }

  // any other cases where the types don't match must be combined by a union
  if (left.getType() != right.getType()) {
    return null;
  }

  switch (left.getType()) {
    case UNION:
      return union(left, right);
    case RECORD:
      if (left.getName() == null
          && right.getName() == null
          && fieldSimilarity(left, right) < SIMILARITY_THRESH) {
        // unnamed records that share too few fields are treated as different
        return null;
      } else if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }
      Schema combinedRecord = Schema.createRecord(
          coalesce(left.getName(), right.getName()),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          false);
      combinedRecord.setFields(mergeFields(left, right));
      return combinedRecord;
    case MAP:
      return Schema.createMap(mergeOrUnion(left.getValueType(), right.getValueType()));
    case ARRAY:
      return Schema.createArray(mergeOrUnion(left.getElementType(), right.getElementType()));
    case ENUM:
      if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }
      // linked set keeps left's symbol order and appends new symbols from right
      Set<String> symbols = Sets.newLinkedHashSet();
      symbols.addAll(left.getEnumSymbols());
      symbols.addAll(right.getEnumSymbols());
      return Schema.createEnum(
          left.getName(),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          ImmutableList.copyOf(symbols));
    default:
      // Reached only when both schemas share a type that none of the cases
      // above handles.
      // NOTE(review): besides unknown types, this presumably also covers
      // same-typed primitives or FIXED schemas that are not equal (e.g.
      // differing properties) - confirm whether throwing is intended there.
      throw new UnsupportedOperationException("Unknown schema type: " + left.getType());
  }
}
// Shared NULL schema; placed first in the nullable unions built by nullableForDefault.
private static final Schema NULL = Schema.create(Schema.Type.NULL);
// Null default value given to fields that mergeFields makes optional.
private static final NullNode NULL_DEFAULT = NullNode.getInstance();
/**
 * Returns a union {@link Schema} of NULL and the given {@code schema}.
 *
 * <p>A NULL schema is always the first type in the union so that a null default
 * value can be set. A NULL schema, or a union that already starts with NULL,
 * is returned unchanged.
 *
 * @param schema a {@code Schema}
 * @return a union of null and the given schema
 */
private static Schema nullableForDefault(Schema schema) {
  if (schema.getType() == Schema.Type.NULL) {
    return schema;
  }

  if (schema.getType() != Schema.Type.UNION) {
    return Schema.createUnion(ImmutableList.of(NULL, schema));
  }

  List<Schema> members = schema.getTypes();
  // guard against a union with no members before looking at the first one
  if (!members.isEmpty() && members.get(0).getType() == Schema.Type.NULL) {
    return schema;
  }

  // rebuild the union with NULL moved (or added) to the front
  List<Schema> types = Lists.newArrayList();
  types.add(NULL);
  for (Schema type : members) {
    if (type.getType() != Schema.Type.NULL) {
      types.add(type);
    }
  }

  return Schema.createUnion(types);
}
/**
 * Builds the field list for a record merged from {@code left} and
 * {@code right}.
 *
 * <p>Fields present in both records get a merged (or unioned) schema and the
 * first non-null doc and default value. Fields present in only one record are
 * kept, and are made nullable with a null default when they have no default
 * of their own. Fields from {@code left} come first, followed by the fields
 * only found in {@code right}.
 *
 * @param left a record {@code Schema}
 * @param right a record {@code Schema}
 * @return new field instances for the merged record
 */
private static List<Schema.Field> mergeFields(Schema left, Schema right) {
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field leftField : left.getFields()) {
    Schema.Field rightField = right.getField(leftField.name());
    if (rightField != null) {
      fields.add(new Schema.Field(
          leftField.name(),
          mergeOrUnion(leftField.schema(), rightField.schema()),
          coalesce(leftField.doc(), rightField.doc()),
          coalesce(leftField.defaultVal(), rightField.defaultVal())));
    } else {
      fields.add(optionalCopy(leftField));
    }
  }

  for (Schema.Field rightField : right.getFields()) {
    if (left.getField(rightField.name()) == null) {
      fields.add(optionalCopy(rightField));
    }
  }

  return fields;
}

/**
 * Copies a field that exists in only one of the merged records, making it
 * nullable with a null default when it has no default value of its own.
 */
private static Schema.Field optionalCopy(Schema.Field field) {
  if (field.defaultVal() != null) {
    return copy(field);
  }
  return new Schema.Field(
      field.name(), nullableForDefault(field.schema()), field.doc(), NULL_DEFAULT);
}
/**
 * Builds a new field carrying the name, schema, doc, and default value of the
 * given field.
 *
 * <p>Fields cannot be used in more than one record (not Immutable?).
 *
 * @param field an Avro schema field
 * @return a copy of the field
 */
public static Schema.Field copy(Schema.Field field) {
  Schema.Field duplicate =
      new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal());
  return duplicate;
}
/**
 * Scores how alike the field names of two records are.
 *
 * <p>The score is the harmonic mean of the share of {@code left}'s field names
 * found in {@code right} and the share of {@code right}'s field names found in
 * {@code left}. Two records without any fields score 1, and records with no
 * field name in common score 0.
 *
 * @param left a record {@code Schema}
 * @param right a record {@code Schema}
 * @return a similarity score between 0 and 1
 */
private static float fieldSimilarity(Schema left, Schema right) {
  // check whether the unnamed records appear to be the same record
  Set<String> leftNames = names(left.getFields());
  Set<String> rightNames = names(right.getFields());
  if (leftNames.isEmpty() && rightNames.isEmpty()) {
    // neither record has fields, so their field sets are identical
    return 1.0f;
  }
  int common = Sets.intersection(leftNames, rightNames).size();
  if (common == 0) {
    // Without this guard the ratios below evaluate to 0/0 (NaN), and NaN
    // compares as "not below" any threshold, so unrelated records would be
    // reported as similar. This also covers exactly one record being empty.
    return 0.0f;
  }
  float leftRatio = ((float) common) / ((float) leftNames.size());
  float rightRatio = ((float) common) / ((float) rightNames.size());
  return hmean(leftRatio, rightRatio);
}
/**
 * Collects the names of the given fields.
 *
 * @param fields record fields
 * @return a new set holding each field's name
 */
private static Set<String> names(Collection<Schema.Field> fields) {
  Set<String> names = Sets.newHashSet();
  for (Schema.Field field : fields) {
    names.add(field.name());
  }
  return names;
}
// Unnamed records whose field similarity is below this value are not merged.
private static final float SIMILARITY_THRESH = 0.3f;
/**
 * Returns the harmonic mean of two values.
 *
 * <p>When the values sum to zero the result is 0 rather than the NaN that
 * 0/0 would produce, so callers can compare the result against a threshold.
 *
 * @param left the first value
 * @param right the second value
 * @return the harmonic mean, or 0 when {@code left + right} is zero
 */
private static float hmean(float left, float right) {
  float sum = left + right;
  if (sum == 0.0f) {
    return 0.0f;
  }
  return (2.0f * left * right) / sum;
}
/**
 * Returns the first non-null object that is passed in.
 *
 * @param objects candidate values, checked in order
 * @param <E> the type of the candidates
 * @return the first non-null candidate, or {@code null} if all are null
 */
@SafeVarargs
private static <E> E coalesce(E... objects) {
  for (E object : objects) {
    if (object != null) {
      return object;
    }
  }
  return null;
}
}