// org.apache.hudi.utilities.schema.SchemaRegistryProvider (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.schema;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.function.SerializableFunctionUnchecked;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.VisibleForTesting;
import org.apache.hudi.common.util.collection.Triple;
import org.apache.hudi.internal.schema.HoodieSchemaException;
import org.apache.hudi.utilities.config.HoodieSchemaProviderConfig;
import org.apache.hudi.utilities.exception.HoodieSchemaFetchException;
import io.confluent.kafka.schemaregistry.ParsedSchema;
import io.confluent.kafka.schemaregistry.avro.AvroSchemaProvider;
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.SchemaMetadata;
import io.confluent.kafka.schemaregistry.client.SchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.rest.RestService;
import io.confluent.kafka.schemaregistry.json.JsonSchemaProvider;
import io.confluent.kafka.schemaregistry.protobuf.ProtobufSchemaProvider;
import org.apache.avro.Schema;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.SSLContexts;
import org.apache.spark.api.java.JavaSparkContext;
import javax.net.ssl.SSLSocketFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.UnrecoverableKeyException;
import java.security.cert.CertificateException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.apache.hudi.common.util.ConfigUtils.checkRequiredConfigProperties;
import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys;
/**
* Obtains latest schema from the Confluent/Kafka schema-registry.
*
* https://github.com/confluentinc/schema-registry
*/
public class SchemaRegistryProvider extends SchemaProvider {
private static final Pattern URL_PATTERN = Pattern.compile("(.*/)subjects/(.*)/versions/(.*)");
private static final String LATEST = "latest";
/**
* Configs supported.
*/
public static class Config {
@Deprecated
public static final String SRC_SCHEMA_REGISTRY_URL_PROP =
HoodieSchemaProviderConfig.SRC_SCHEMA_REGISTRY_URL.key();
@Deprecated
public static final String TARGET_SCHEMA_REGISTRY_URL_PROP =
HoodieSchemaProviderConfig.TARGET_SCHEMA_REGISTRY_URL.key();
@Deprecated
public static final String SCHEMA_CONVERTER_PROP =
HoodieSchemaProviderConfig.SCHEMA_CONVERTER.key();
public static final String SSL_KEYSTORE_LOCATION_PROP = "schema.registry.ssl.keystore.location";
public static final String SSL_TRUSTSTORE_LOCATION_PROP = "schema.registry.ssl.truststore.location";
public static final String SSL_KEYSTORE_PASSWORD_PROP = "schema.registry.ssl.keystore.password";
public static final String SSL_TRUSTSTORE_PASSWORD_PROP = "schema.registry.ssl.truststore.password";
public static final String SSL_KEY_PASSWORD_PROP = "schema.registry.ssl.key.password";
}
private final Option schemaConverter;
private final SerializableFunctionUnchecked restServiceProvider;
private final SerializableFunctionUnchecked registryClientProvider;
public SchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc) {
super(props, jssc);
checkRequiredConfigProperties(props, Collections.singletonList(HoodieSchemaProviderConfig.SRC_SCHEMA_REGISTRY_URL));
if (config.containsKey(Config.SSL_KEYSTORE_LOCATION_PROP)
|| config.containsKey(Config.SSL_TRUSTSTORE_LOCATION_PROP)) {
setUpSSLStores();
}
String schemaConverter = getStringWithAltKeys(config, HoodieSchemaProviderConfig.SCHEMA_CONVERTER, true);
this.schemaConverter = !StringUtils.isNullOrEmpty(schemaConverter)
? Option.of((SchemaConverter) ReflectionUtils.loadClass(
schemaConverter, new Class>[] {TypedProperties.class}, config))
: Option.empty();
this.restServiceProvider = RestService::new;
this.registryClientProvider = restService -> new CachedSchemaRegistryClient(restService, 100,
Arrays.asList(new ProtobufSchemaProvider(), new JsonSchemaProvider(), new AvroSchemaProvider()), null, null);
}
@VisibleForTesting
SchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc,
Option schemaConverter,
SerializableFunctionUnchecked restServiceProvider,
SerializableFunctionUnchecked registryClientProvider) {
super(props, jssc);
checkRequiredConfigProperties(props, Collections.singletonList(HoodieSchemaProviderConfig.SRC_SCHEMA_REGISTRY_URL));
this.schemaConverter = schemaConverter;
this.restServiceProvider = restServiceProvider;
this.registryClientProvider = registryClientProvider;
}
@FunctionalInterface
public interface SchemaConverter {
/**
* Convert original schema string to avro schema string.
*
* @param schema original schema returned from the registry
* @return avro schema string
*/
String convert(ParsedSchema schema) throws IOException;
}
public Schema parseSchemaFromRegistry(String registryUrl) {
String schema = fetchSchemaFromRegistry(registryUrl);
return new Schema.Parser().parse(schema);
}
/**
* The method takes the provided url {@code registryUrl} and gets the schema from the schema registry using that url.
* If the caller provides userInfo credentials in the url (e.g "https://foo:[email protected]") then the credentials
* are extracted the url using the Matcher and the extracted credentials are set on the request as an Authorization
* header.
*
* @param registryUrl
* @return the Schema in String form.
*/
public String fetchSchemaFromRegistry(String registryUrl) {
try {
Matcher matcher = Pattern.compile("://(.*?)@").matcher(registryUrl);
Triple registryInfo;
String creds = null;
if (matcher.find()) {
creds = matcher.group(1);
String urlWithoutCreds = registryUrl.replace(creds + "@", "");
registryInfo = getUrlSubjectAndVersion(urlWithoutCreds);
} else {
registryInfo = getUrlSubjectAndVersion(registryUrl);
}
String url = registryInfo.getLeft();
RestService restService = getRestService(url);
if (creds != null) {
setAuthorizationHeader(creds, restService);
}
String subject = registryInfo.getMiddle();
String version = registryInfo.getRight();
SchemaRegistryClient registryClient = registryClientProvider.apply(restService);
SchemaMetadata schemaMetadata = version.equals(LATEST) ? registryClient.getLatestSchemaMetadata(subject) : registryClient.getSchemaMetadata(subject, Integer.parseInt(version));
ParsedSchema parsedSchema = registryClient.parseSchema(schemaMetadata.getSchemaType(), schemaMetadata.getSchema(), schemaMetadata.getReferences())
.orElseThrow(() -> new HoodieSchemaException("Failed to parse schema from registry"));
if (schemaConverter.isPresent()) {
return schemaConverter.get().convert(parsedSchema);
} else {
return parsedSchema.canonicalString();
}
} catch (Exception e) {
throw new HoodieSchemaFetchException("Failed to fetch schema from registry", e);
}
}
private Triple getUrlSubjectAndVersion(String registryUrl) {
// url may be list of urls
String[] splitRegistryUrls = registryUrl.split(",");
String subjectName = null;
String version = null;
List urls = new ArrayList<>(splitRegistryUrls.length);
// url will end with /subjects/{subject}/versions/{version}
for (String url : splitRegistryUrls) {
Matcher matcher = URL_PATTERN.matcher(url);
if (!matcher.matches()) {
throw new HoodieSchemaFetchException("Failed to extract subject name and version from registry url");
}
urls.add(matcher.group(1));
subjectName = matcher.group(2);
version = matcher.group(3);
}
if (subjectName == null) {
throw new HoodieSchemaFetchException("Failed to extract subject name from registry url");
}
return Triple.of(String.join(",", urls), subjectName, version);
}
private SSLSocketFactory sslSocketFactory;
protected RestService getRestService(String url) {
RestService restService = restServiceProvider.apply(url);
if (sslSocketFactory != null) {
restService.setSslSocketFactory(sslSocketFactory);
return restService;
}
return restService;
}
protected void setAuthorizationHeader(String creds, RestService restService) {
String encodedAuth = Base64.getEncoder().encodeToString(creds.getBytes(StandardCharsets.UTF_8));
restService.setHttpHeaders(Collections.singletonMap("Authorization", "Basic " + encodedAuth));
}
private void setUpSSLStores() {
SSLContextBuilder sslContextBuilder = SSLContexts.custom();
try {
if (config.containsKey(Config.SSL_TRUSTSTORE_LOCATION_PROP)) {
sslContextBuilder.loadTrustMaterial(
new File(config.getString(Config.SSL_TRUSTSTORE_LOCATION_PROP)),
config.getString(Config.SSL_TRUSTSTORE_PASSWORD_PROP).toCharArray(),
new TrustSelfSignedStrategy());
}
if (config.containsKey(Config.SSL_KEYSTORE_LOCATION_PROP)) {
sslContextBuilder.loadKeyMaterial(
new File(config.getString(Config.SSL_KEYSTORE_LOCATION_PROP)),
config.getString(Config.SSL_KEYSTORE_PASSWORD_PROP).toCharArray(),
config.getString(Config.SSL_KEY_PASSWORD_PROP).toCharArray()
);
}
sslSocketFactory = sslContextBuilder.build().getSocketFactory();
} catch (UnrecoverableKeyException | IOException | KeyStoreException | NoSuchAlgorithmException | CertificateException | KeyManagementException e) {
throw new RuntimeException(e);
}
}
@Override
public Schema getSourceSchema() {
String registryUrl = getStringWithAltKeys(config, HoodieSchemaProviderConfig.SRC_SCHEMA_REGISTRY_URL);
try {
return parseSchemaFromRegistry(registryUrl);
} catch (Exception e) {
throw new HoodieSchemaFetchException(String.format(
"Error reading source schema from registry. Please check %s is configured correctly. Truncated URL: %s",
Config.SRC_SCHEMA_REGISTRY_URL_PROP,
StringUtils.truncate(registryUrl, 10, 10)), e);
}
}
@Override
public Schema getTargetSchema() {
String registryUrl = getStringWithAltKeys(config, HoodieSchemaProviderConfig.SRC_SCHEMA_REGISTRY_URL);
String targetRegistryUrl =
getStringWithAltKeys(config, HoodieSchemaProviderConfig.TARGET_SCHEMA_REGISTRY_URL, registryUrl);
try {
return parseSchemaFromRegistry(targetRegistryUrl);
} catch (Exception e) {
throw new HoodieSchemaFetchException(String.format(
"Error reading target schema from registry. Please check %s is configured correctly. If that is not configured then check %s. Truncated URL: %s",
Config.SRC_SCHEMA_REGISTRY_URL_PROP,
Config.TARGET_SCHEMA_REGISTRY_URL_PROP,
StringUtils.truncate(targetRegistryUrl, 10, 10)), e);
}
}
}