All downloads are free. The search and download features use the official Maven repository.

org.apache.flink.formats.json.JsonRowSchemaConverter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.formats.json;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.types.Row;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.jackson.JacksonMapperFactory;

import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonParser;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.JsonNode;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

/**
 * Converts a JSON schema into Flink's type information. It uses {@link Row} for representing
 * objects and tuple arrays.
 *
 * 

Note: This converter implements just a subset of the JSON schema specification. Union types * (as well as "allOf", "anyOf", "not") are not supported yet. Simple references that link to a * common definition in the document are supported. "oneOf" and arrays of types are only supported * for specifying nullability. * *

This converter has been developed for JSON Schema draft-07 but also includes keywords of older * drafts to be as compatible as possible. */ public final class JsonRowSchemaConverter { private JsonRowSchemaConverter() { // private } // see https://spacetelescope.github.io/understanding-json-schema/UnderstandingJSONSchema.pdf private static final String PROPERTIES = "properties"; private static final String ADDITIONAL_PROPERTIES = "additionalProperties"; private static final String TYPE = "type"; private static final String FORMAT = "format"; private static final String CONTENT_ENCODING = "contentEncoding"; private static final String ITEMS = "items"; private static final String ADDITIONAL_ITEMS = "additionalItems"; private static final String REF = "$ref"; private static final String ALL_OF = "allOf"; private static final String ANY_OF = "anyOf"; private static final String NOT = "not"; private static final String ONE_OF = "oneOf"; // from https://tools.ietf.org/html/draft-zyp-json-schema-03#page-14 private static final String DISALLOW = "disallow"; private static final String EXTENDS = "extends"; private static final String TYPE_NULL = "null"; private static final String TYPE_BOOLEAN = "boolean"; private static final String TYPE_OBJECT = "object"; private static final String TYPE_ARRAY = "array"; private static final String TYPE_NUMBER = "number"; private static final String TYPE_INTEGER = "integer"; private static final String TYPE_STRING = "string"; private static final String FORMAT_DATE = "date"; private static final String FORMAT_TIME = "time"; private static final String FORMAT_DATE_TIME = "date-time"; private static final String CONTENT_ENCODING_BASE64 = "base64"; /** * Converts a JSON schema into Flink's type information. Throws an exception if the schema * cannot converted because of loss of precision or too flexible schema. * *

The converter can resolve simple schema references to solve those cases where entities are * defined at the beginning and then used throughout a document. */ @SuppressWarnings("unchecked") public static TypeInformation convert(String jsonSchema) { Preconditions.checkNotNull(jsonSchema, "JSON schema"); final ObjectMapper mapper = JacksonMapperFactory.createObjectMapper(); mapper.getFactory() .enable(JsonParser.Feature.ALLOW_COMMENTS) .enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES) .enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES); final JsonNode node; try { node = mapper.readTree(jsonSchema); } catch (IOException e) { throw new IllegalArgumentException("Invalid JSON schema.", e); } return (TypeInformation) convertType("", node, node); } private static TypeInformation convertType(String location, JsonNode node, JsonNode root) { // we use a set here to unify types (e.g. types that just add metadata such as 'multipleOf') final Set> typeSet = new HashSet<>(); // search for ref final Optional ref; if (node.has(REF) && node.get(REF).isTextual()) { // try a simple ref resolver to solve those cases where entities are defined at // the beginning and then used throughout a document ref = Optional.of(resolveReference(node.get(REF).asText(), node, root)); } else { ref = Optional.empty(); } // use TYPE of this node if (node.has(TYPE)) { final JsonNode typeNode = node.get(TYPE); List types = new ArrayList<>(); // array of types if (typeNode.isArray()) { final Iterator elements = typeNode.elements(); while (elements.hasNext()) { types.add(elements.next().asText()); } } // single type else if (typeNode.isTextual()) { types.add(typeNode.asText()); } for (String type : types) { // set field type switch (type) { case TYPE_NULL: typeSet.add(Types.VOID); break; case TYPE_BOOLEAN: typeSet.add(Types.BOOLEAN); break; case TYPE_STRING: if (node.has(FORMAT)) { typeSet.add(convertStringFormat(location, node.get(FORMAT))); } else if (node.has(CONTENT_ENCODING)) { typeSet.add( 
convertStringEncoding(location, node.get(CONTENT_ENCODING))); } else { typeSet.add(Types.STRING); } break; case TYPE_NUMBER: typeSet.add(Types.BIG_DEC); break; case TYPE_INTEGER: // use BigDecimal for easier interoperability // without affecting the correctness of the result typeSet.add(Types.BIG_DEC); break; case TYPE_OBJECT: typeSet.add(convertObject(location, node, root)); break; case TYPE_ARRAY: typeSet.add(convertArray(location, node, root)); break; default: throw new IllegalArgumentException( "Unsupported type '" + node.get(TYPE).asText() + "' in node: " + location); } } } // use TYPE of reference as fallback if present else { ref.filter(r -> r.has(TYPE)) .ifPresent(r -> typeSet.add(convertType(node.get(REF).asText(), r, root))); } // simple interpretation of ONE_OF for supporting "object or null" if (node.has(ONE_OF) && node.get(ONE_OF).isArray()) { final TypeInformation[] types = convertTypes(location + '/' + ONE_OF, node.get(ONE_OF), root); typeSet.addAll(Arrays.asList(types)); } // use ONE_OF of reference as fallback else if (ref.isPresent() && ref.get().has(ONE_OF) && ref.get().get(ONE_OF).isArray()) { final TypeInformation[] types = convertTypes( node.get(REF).asText() + '/' + ONE_OF, ref.get().get(ONE_OF), root); typeSet.addAll(Arrays.asList(types)); } // validate no union types or extending if (node.has(ALL_OF) || node.has(ANY_OF) || node.has(NOT) || node.has(EXTENDS) || node.has(DISALLOW)) { throw new IllegalArgumentException( "Union types are such as '" + ALL_OF + "', '" + ANY_OF + "' etc. 
" + "and extending are not supported yet."); } // only a type (with null) is supported yet final List> types = new ArrayList<>(typeSet); if (types.size() == 0) { throw new IllegalArgumentException("No type could be found in node:" + location); } else if (types.size() > 2 || (types.size() == 2 && !types.contains(Types.VOID))) { throw new IllegalArgumentException( "Union types with more than just a null type are not supported yet."); } // return the first non-void type or void if (types.size() == 2 && types.get(0) == Types.VOID) { return types.get(1); } else { return types.get(0); } } private static TypeInformation convertObject( String location, JsonNode node, JsonNode root) { // validate properties if (!node.has(PROPERTIES)) { return Types.ROW(); } if (!node.isObject()) { throw new IllegalArgumentException( "Invalid '" + PROPERTIES + "' property for object type in node: " + location); } final JsonNode props = node.get(PROPERTIES); final String[] names = new String[props.size()]; final TypeInformation[] types = new TypeInformation[props.size()]; final Iterator> fieldIter = props.fields(); int i = 0; while (fieldIter.hasNext()) { final Map.Entry subNode = fieldIter.next(); // set field name names[i] = subNode.getKey(); // set type types[i] = convertType(location + '/' + subNode.getKey(), subNode.getValue(), root); i++; } // validate that object does not contain additional properties if (node.has(ADDITIONAL_PROPERTIES) && node.get(ADDITIONAL_PROPERTIES).isBoolean() && node.get(ADDITIONAL_PROPERTIES).asBoolean()) { throw new IllegalArgumentException( "An object must not allow additional properties in node: " + location); } return Types.ROW_NAMED(names, types); } private static TypeInformation convertArray(String location, JsonNode node, JsonNode root) { // validate items if (!node.has(ITEMS)) { throw new IllegalArgumentException( "Arrays must specify an '" + ITEMS + "' property in node: " + location); } final JsonNode items = node.get(ITEMS); // list (translated to 
object array) if (items.isObject()) { final TypeInformation elementType = convertType(location + '/' + ITEMS, items, root); // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings return Types.OBJECT_ARRAY(elementType); } // tuple (translated to row) else if (items.isArray()) { final TypeInformation[] types = convertTypes(location + '/' + ITEMS, items, root); // validate that array does not contain additional items if (node.has(ADDITIONAL_ITEMS) && node.get(ADDITIONAL_ITEMS).isBoolean() && node.get(ADDITIONAL_ITEMS).asBoolean()) { throw new IllegalArgumentException( "An array tuple must not allow additional items in node: " + location); } return Types.ROW(types); } throw new IllegalArgumentException( "Invalid type for '" + ITEMS + "' property in node: " + location); } private static TypeInformation convertStringFormat(String location, JsonNode node) { if (!node.isTextual()) { throw new IllegalArgumentException( "Invalid '" + FORMAT + "' property in node: " + location); } switch (node.asText()) { case FORMAT_DATE: return Types.SQL_DATE; case FORMAT_TIME: return Types.SQL_TIME; case FORMAT_DATE_TIME: return Types.SQL_TIMESTAMP; default: return Types.STRING; // unlikely that we will support other formats in the future } } private static TypeInformation convertStringEncoding(String location, JsonNode node) { if (!node.isTextual()) { throw new IllegalArgumentException( "Invalid '" + CONTENT_ENCODING + "' property in node: " + location); } // "If the instance value is a string, this property defines that the string SHOULD // be interpreted as binary data and decoded using the encoding named by this property." 
switch (node.asText()) { case CONTENT_ENCODING_BASE64: return Types.PRIMITIVE_ARRAY(Types.BYTE); default: // we fail hard here: // this gives us the chance to support more encodings in the future without problems // of backwards compatibility throw new IllegalArgumentException( "Invalid encoding '" + node.asText() + "' in node: " + location); } } private static JsonNode resolveReference(String ref, JsonNode origin, JsonNode root) { if (!ref.startsWith("#")) { throw new IllegalArgumentException( "Only JSON schemes with simple references " + "(one indirection in the same document) are supported yet. But was: " + ref); } final String path = ref.substring(1); final JsonNode foundNode = root.at(path); if (foundNode.isMissingNode()) { throw new IllegalArgumentException("Could not find reference: " + ref); } // prevent obvious cyclic references if (foundNode == origin) { throw new IllegalArgumentException("Cyclic references are not supported:" + ref); } return foundNode; } private static TypeInformation[] convertTypes( String location, JsonNode arrayNode, JsonNode root) { final TypeInformation[] types = new TypeInformation[arrayNode.size()]; final Iterator elements = arrayNode.elements(); int i = 0; while (elements.hasNext()) { final TypeInformation elementType = convertType(location + '[' + i + ']', elements.next(), root); types[i] = elementType; i += 1; } return types; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy