
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.converter.parquet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.BinaryValue;
import org.apache.parquet.example.data.simple.BooleanValue;
import org.apache.parquet.example.data.simple.DoubleValue;
import org.apache.parquet.example.data.simple.FloatValue;
import org.apache.parquet.example.data.simple.IntegerValue;
import org.apache.parquet.example.data.simple.LongValue;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import org.apache.gobblin.converter.parquet.JsonSchema.*;
import static org.apache.gobblin.converter.parquet.JsonElementConversionFactory.RecordConverter.RecordType.CHILD;
import static org.apache.gobblin.converter.parquet.JsonSchema.*;
import static org.apache.gobblin.converter.parquet.JsonSchema.InputType.STRING;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
/**
* Creates a {@link JsonElement} to Parquet converter for all supported data types.
*
* @author tilakpatidar
*/
public class JsonElementConversionFactory {
/**
* Creates a converter for a single field described by a {@link JsonSchema}.
*
* @param schema the {@link JsonSchema} describing the field
* @param repeated whether the {@link Type} is repeated in the parent {@link Group}
* @return a {@link JsonElementConverter} for the field's {@link InputType}
*/
public static JsonElementConverter getConverter(JsonSchema schema, boolean repeated) {
InputType fieldType = schema.getInputType();
switch (fieldType) {
case INT:
return new IntConverter(schema, repeated);
case LONG:
return new LongConverter(schema, repeated);
case FLOAT:
return new FloatConverter(schema, repeated);
case DOUBLE:
return new DoubleConverter(schema, repeated);
case BOOLEAN:
return new BooleanConverter(schema, repeated);
case STRING:
return new StringConverter(schema, repeated);
case ARRAY:
return new ArrayConverter(schema);
case ENUM:
return new EnumConverter(schema);
case RECORD:
return new RecordConverter(schema);
case MAP:
return new MapConverter(schema);
case DATE:
case TIMESTAMP:
return new StringConverter(schema, repeated);
default:
throw new UnsupportedOperationException(fieldType + " is unsupported");
}
}
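/*
* Illustrative usage (a minimal sketch, not part of the original source): assuming a nullable
* string column named "name" described by a JsonSchema built via buildBaseSchema, a converter
* could be obtained and applied roughly as follows (JsonPrimitive is Gson's primitive wrapper):
*
*   JsonSchema fieldSchema = JsonSchema.buildBaseSchema(InputType.STRING, true);
*   fieldSchema.setColumnName("name");
*   JsonElementConverter converter = JsonElementConversionFactory.getConverter(fieldSchema, false);
*   Type parquetType = converter.schema();                               // e.g. optional binary name (STRING)
*   Object parquetValue = converter.convert(new JsonPrimitive("alice")); // a BinaryValue wrapping "alice"
*/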
/**
* Converts a {@link JsonElement} into a supported Parquet type.
*
* @author tilakpatidar
*/
public static abstract class JsonElementConverter {
protected final JsonSchema jsonSchema;
protected JsonElementConverter(JsonSchema schema) {
this.jsonSchema = schema;
}
/**
* Converts a value to its Parquet representation after performing a null check.
*
* @param value the JSON value to convert
* @return the Parquet-safe value, or {@code null} if the field is nullable and the value is null
*/
public Object convert(JsonElement value) {
if (value.isJsonNull()) {
if (this.jsonSchema.isNullable()) {
return null;
}
throw new RuntimeException(
"Field: " + this.jsonSchema.getColumnName() + " is not nullable and contains a null value");
}
return convertField(value);
}
/**
* Returns the Parquet {@link Type} (schema) of this field.
*
* @return the Parquet schema for this field
*/
abstract public Type schema();
/**
* Converts a {@link JsonElement} to its Parquet representation without performing a null check.
*
* @param value the JSON value to convert
* @return the converted Parquet value
*/
abstract Object convertField(JsonElement value);
}
/**
* Converts a {@link JsonSchema} to a {@link PrimitiveType}
*/
public static abstract class PrimitiveConverter extends JsonElementConverter {
protected final boolean repeated;
private PrimitiveTypeName outputType;
protected Type schema;
/**
* @param jsonSchema the {@link JsonSchema} describing the field
* @param repeated whether the field is repeated in the parent {@link Group}
* @param outputType the Parquet {@link PrimitiveTypeName} to emit
*/
public PrimitiveConverter(JsonSchema jsonSchema, boolean repeated, PrimitiveTypeName outputType) {
super(jsonSchema);
this.repeated = repeated;
this.outputType = outputType;
this.schema = buildSchema();
}
protected Type buildSchema() {
return new PrimitiveType(this.repeated ? REPEATED : optionalOrRequired(this.jsonSchema), this.outputType,
this.jsonSchema.getColumnName());
}
@Override
public Type schema() {
return this.schema;
}
}
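/*
* For example (a sketch, assuming a non-nullable INT field named "age"), buildSchema() above
* produces the equivalent of:
*
*   new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveTypeName.INT32, "age")
*
* which renders in the Parquet schema as: required int32 age;
*/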
/**
* Converts {@link JsonSchema} having collection of elements of {@link InputType} into a {@link GroupType}.
*/
public static abstract class CollectionConverter extends JsonElementConverter {
protected InputType elementType;
protected JsonElementConverter elementConverter;
protected Type schema;
public CollectionConverter(JsonSchema collectionSchema, InputType elementType, boolean repeated) {
super(collectionSchema);
this.elementType = elementType;
this.elementConverter = getConverter(getElementSchema(), repeated);
this.schema = buildSchema();
}
@Override
public Type schema() {
return this.schema;
}
/**
* Prepares a {@link JsonSchema} for the elements of the collection.
*
* @return the element {@link JsonSchema}
*/
abstract JsonSchema getElementSchema();
abstract Type buildSchema();
}
public static class IntConverter extends PrimitiveConverter {
public IntConverter(JsonSchema schema, boolean repeated) {
super(schema, repeated, INT32);
}
@Override
IntegerValue convertField(JsonElement value) {
return new IntegerValue(value.getAsInt());
}
}
public static class LongConverter extends PrimitiveConverter {
public LongConverter(JsonSchema schema, boolean repeated) {
super(schema, repeated, INT64);
}
@Override
LongValue convertField(JsonElement value) {
return new LongValue(value.getAsLong());
}
}
public static class FloatConverter extends PrimitiveConverter {
public FloatConverter(JsonSchema schema, boolean repeated) {
super(schema, repeated, PrimitiveTypeName.FLOAT);
}
@Override
FloatValue convertField(JsonElement value) {
return new FloatValue(value.getAsFloat());
}
}
public static class DoubleConverter extends PrimitiveConverter {
public DoubleConverter(JsonSchema schema, boolean repeated) {
super(schema, repeated, PrimitiveTypeName.DOUBLE);
}
@Override
DoubleValue convertField(JsonElement value) {
return new DoubleValue(value.getAsDouble());
}
}
public static class BooleanConverter extends PrimitiveConverter {
public BooleanConverter(JsonSchema schema, boolean repeated) {
super(schema, repeated, PrimitiveTypeName.BOOLEAN);
}
@Override
BooleanValue convertField(JsonElement value) {
return new BooleanValue(value.getAsBoolean());
}
}
public static class StringConverter extends PrimitiveConverter {
public StringConverter(JsonSchema schema, boolean repeated) {
super(schema, repeated, BINARY);
this.schema = buildSchema();
}
@Override
BinaryValue convertField(JsonElement value) {
return new BinaryValue(Binary.fromString(value.getAsString()));
}
@Override
protected Type buildSchema() {
String columnName = this.jsonSchema.getColumnName();
if (this.repeated) {
return Types.repeated(BINARY).as(LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType()).named(columnName);
}
switch (optionalOrRequired(this.jsonSchema)) {
case OPTIONAL:
return Types.optional(BINARY).as(LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType()).named(columnName);
case REQUIRED:
return Types.required(BINARY).as(LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType()).named(columnName);
default:
throw new RuntimeException("Unsupported Repetition type");
}
}
}
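/*
* A sketch of the schemas produced by StringConverter.buildSchema() for a column named "name":
*
*   Types.optional(BINARY).as(LogicalTypeAnnotation.stringType()).named("name")
*
* renders approximately as "optional binary name (STRING)"; the non-nullable and repeated cases
* render as "required binary name (STRING)" and "repeated binary name (STRING)" respectively.
*/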
public static Type.Repetition optionalOrRequired(JsonSchema jsonBaseSchema) {
return jsonBaseSchema.isNullable() ? OPTIONAL : REQUIRED;
}
public static class ArrayConverter extends CollectionConverter {
public ArrayConverter(JsonSchema arraySchema) {
super(arraySchema, arraySchema.getElementTypeUsingKey(ARRAY_ITEMS_KEY), true);
}
@Override
Object convertField(JsonElement value) {
ParquetGroup array = new ParquetGroup((GroupType) schema());
JsonElementConverter converter = this.elementConverter;
for (JsonElement elem : (JsonArray) value) {
array.add(ARRAY_KEY, converter.convert(elem));
}
return array;
}
@Override
protected Type buildSchema() {
List<Type> fields = new ArrayList<>();
fields.add(0, this.elementConverter.schema());
return new GroupType(optionalOrRequired(jsonSchema), this.jsonSchema.getColumnName(), fields);
}
@Override
JsonSchema getElementSchema() {
JsonSchema jsonSchema = JsonSchema.buildBaseSchema(this.elementType, true);
jsonSchema.setColumnName(ARRAY_KEY);
return jsonSchema;
}
}
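/*
* A sketch of the shape ArrayConverter builds (assuming ARRAY_KEY resolves to "item" and a
* non-nullable double-array column named "prices"):
*
*   required group prices {
*     repeated double item;
*   }
*
* convertField() then adds one "item" value to the group per element of the JSON array.
*/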
public static class EnumConverter extends CollectionConverter {
private final HashSet<String> symbols = new HashSet<>();
public EnumConverter(JsonSchema enumSchema) {
super(enumSchema, STRING, false);
JsonArray symbolsArray = enumSchema.getSymbols();
symbolsArray.forEach(e -> symbols.add(e.getAsString()));
}
@Override
Object convertField(JsonElement value) {
if (symbols.contains(value.getAsString()) || (this.jsonSchema.isNullable() && value.isJsonNull())) {
return this.elementConverter.convert(value);
}
throw new RuntimeException("Symbol " + value.getAsString() + " does not belong to set " + symbols.toString());
}
@Override
protected Type buildSchema() {
return this.elementConverter.schema();
}
@Override
JsonSchema getElementSchema() {
JsonSchema jsonSchema = JsonSchema.buildBaseSchema(STRING, this.jsonSchema.isNullable());
jsonSchema.setColumnName(this.jsonSchema.getColumnName());
return jsonSchema;
}
}
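/*
* Sketch: an enum column is emitted as a plain string field, and convertField() above rejects any
* value outside the declared symbol set. Assuming a non-nullable enum column "status" with symbols
* ["ACTIVE", "INACTIVE"], the schema is simply "required binary status (STRING)", and
* convert(new JsonPrimitive("PAUSED")) would throw a RuntimeException.
*/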
public static class RecordConverter extends JsonElementConverter {
private final HashMap<String, JsonElementConverter> converters;
private final RecordType recordType;
private final Type schema;
public enum RecordType {
ROOT, CHILD
}
public RecordConverter(JsonSchema recordSchema) {
this(recordSchema, CHILD);
}
public RecordConverter(JsonSchema recordSchema, RecordType recordType) {
super(recordSchema);
this.converters = new HashMap<>();
this.recordType = recordType;
this.schema = buildSchema();
}
@Override
Object convertField(JsonElement value) {
ParquetGroup r1 = new ParquetGroup((GroupType) schema());
JsonObject inputRecord = value.getAsJsonObject();
for (Map.Entry<String, JsonElement> entry : inputRecord.entrySet()) {
String key = entry.getKey();
JsonElementConverter converter = this.converters.get(key);
Object convertedValue = converter.convert(entry.getValue());
boolean valueIsNull = convertedValue == null;
Type.Repetition repetition = optionalOrRequired(converter.jsonSchema);
if (valueIsNull && repetition.equals(OPTIONAL)) {
continue;
}
r1.add(key, convertedValue);
}
return r1;
}
private Type buildSchema() {
JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
List<Type> parquetTypes = new ArrayList<>();
for (JsonElement element : inputSchema) {
JsonObject map = (JsonObject) element;
JsonSchema elementSchema = new JsonSchema(map);
String columnName = elementSchema.getColumnName();
JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
Type schemaType = converter.schema();
this.converters.put(columnName, converter);
parquetTypes.add(schemaType);
}
String docName = this.jsonSchema.getColumnName();
switch (recordType) {
case ROOT:
return new MessageType(docName, parquetTypes);
case CHILD:
return new GroupType(optionalOrRequired(this.jsonSchema), docName, parquetTypes);
default:
throw new RuntimeException("Unsupported Record type");
}
}
@Override
public Type schema() {
return this.schema;
}
}
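/*
* A sketch of buildSchema() above: for RecordType.ROOT the child fields' schemas are wrapped in a
* Parquet MessageType, e.g. (assuming a root named "record" with a non-nullable long "id" and a
* nullable string "name"):
*
*   message record {
*     required int64 id;
*     optional binary name (STRING);
*   }
*
* For RecordType.CHILD the same fields are wrapped in a nested GroupType whose repetition is
* derived from the record's own nullability.
*/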
public static class MapConverter extends CollectionConverter {
public MapConverter(JsonSchema mapSchema) {
super(mapSchema, mapSchema.getElementTypeUsingKey(MAP_ITEMS_KEY), false);
}
@Override
Object convertField(JsonElement value) {
ParquetGroup mapGroup = new ParquetGroup((GroupType) schema());
JsonElementConverter converter = this.elementConverter;
JsonObject map = (JsonObject) value;
for (Map.Entry<String, JsonElement> entry : map.entrySet()) {
ParquetGroup entrySet = (ParquetGroup) mapGroup.addGroup(MAP_KEY);
entrySet.add(MAP_KEY_COLUMN_NAME, entry.getKey());
entrySet.add(MAP_VALUE_COLUMN_NAME, converter.convert(entry.getValue()));
}
return mapGroup;
}
@Override
protected Type buildSchema() {
JsonElementConverter elementConverter = this.elementConverter;
JsonElementConverter keyConverter = getKeyConverter();
GroupType mapGroup =
Types.repeatedGroup().addFields(keyConverter.schema(), elementConverter.schema()).named(MAP_KEY)
.asGroupType();
String columnName = this.jsonSchema.getColumnName();
switch (optionalOrRequired(this.jsonSchema)) {
case OPTIONAL:
return Types.optionalGroup().addFields(mapGroup).named(columnName).asGroupType();
case REQUIRED:
return Types.requiredGroup().addFields(mapGroup).named(columnName).asGroupType();
default:
return null;
}
}
@Override
JsonSchema getElementSchema() {
JsonSchema jsonSchema = JsonSchema.buildBaseSchema(this.elementType, false);
jsonSchema.setColumnName(MAP_VALUE_COLUMN_NAME);
return jsonSchema;
}
public JsonElementConverter getKeyConverter() {
JsonSchema jsonSchema = JsonSchema.buildBaseSchema(STRING, false);
jsonSchema.setColumnName(MAP_KEY_COLUMN_NAME);
return getConverter(jsonSchema, false);
}
}
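/*
* A sketch of the map schema built above (assuming MAP_KEY, MAP_KEY_COLUMN_NAME and
* MAP_VALUE_COLUMN_NAME resolve to "map", "key" and "value", for a nullable string-to-int map
* column named "scores"):
*
*   optional group scores {
*     repeated group map {
*       required binary key (STRING);
*       required int32 value;
*     }
*   }
*
* convertField() adds one repeated "map" group per JSON object entry, writing the entry's key and
* converted value into the "key" and "value" fields.
*/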
}