org.apache.avro.SchemaNormalization Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
* Collection of static methods for generating the canonical form of schemas
* (see {@link #toParsingForm}) -- and fingerprints of canonical forms
* ({@link #fingerprint}).
*/
public class SchemaNormalization {
private SchemaNormalization() {
}
/**
* Returns "Parsing Canonical Form" of a schema as defined by Avro spec.
*/
public static String toParsingForm(Schema s) {
try {
Map env = new HashMap<>();
return build(env, s, new StringBuilder()).toString();
} catch (IOException e) {
// Shouldn't happen, b/c StringBuilder can't throw IOException
throw new RuntimeException(e);
}
}
/**
* Returns a fingerprint of a string of bytes. This string is presumed to
* contain a canonical form of a schema. The algorithm used to compute the
* fingerprint is selected by the argument fpName. If fpName
* equals the string "CRC-64-AVRO"
, then the result of
* {@link #fingerprint64} is returned in little-endian format. Otherwise,
* fpName is used as an algorithm name for
* {@link MessageDigest#getInstance(String)}, which will throw
* NoSuchAlgorithmException
if it doesn't recognize the name.
*
* Recommended Avro practice dictates that "CRC-64-AVRO"
is used
* for 64-bit fingerprints, "MD5"
is used for 128-bit fingerprints,
* and "SHA-256"
is used for 256-bit fingerprints.
*/
public static byte[] fingerprint(String fpName, byte[] data) throws NoSuchAlgorithmException {
if (fpName.equals("CRC-64-AVRO")) {
long fp = fingerprint64(data);
byte[] result = new byte[8];
for (int i = 0; i < 8; i++) {
result[i] = (byte) fp;
fp >>= 8;
}
return result;
}
MessageDigest md = MessageDigest.getInstance(fpName);
return md.digest(data);
}
/**
* Returns the 64-bit Rabin Fingerprint (as recommended in the Avro spec) of a
* byte string.
*/
public static long fingerprint64(byte[] data) {
long result = EMPTY64;
for (byte b : data)
result = (result >>> 8) ^ FP64.FP_TABLE[(int) (result ^ b) & 0xff];
return result;
}
/**
* Returns {@link #fingerprint} applied to the parsing canonical form of the
* supplied schema.
*/
public static byte[] parsingFingerprint(String fpName, Schema s) throws NoSuchAlgorithmException {
return fingerprint(fpName, toParsingForm(s).getBytes(StandardCharsets.UTF_8));
}
/**
* Returns {@link #fingerprint64} applied to the parsing canonical form of the
* supplied schema.
*/
public static long parsingFingerprint64(Schema s) {
return fingerprint64(toParsingForm(s).getBytes(StandardCharsets.UTF_8));
}
private static Appendable build(Map env, Schema s, Appendable o) throws IOException {
boolean firstTime = true;
Schema.Type st = s.getType();
switch (st) {
default: // boolean, bytes, double, float, int, long, null, string
return o.append('"').append(st.getName()).append('"');
case UNION:
o.append('[');
for (Schema b : s.getTypes()) {
if (!firstTime)
o.append(',');
else
firstTime = false;
build(env, b, o);
}
return o.append(']');
case ARRAY:
case MAP:
o.append("{\"type\":\"").append(st.getName()).append("\"");
if (st == Schema.Type.ARRAY)
build(env, s.getElementType(), o.append(",\"items\":"));
else
build(env, s.getValueType(), o.append(",\"values\":"));
return o.append("}");
case ENUM:
case FIXED:
case RECORD:
String name = s.getFullName();
if (env.get(name) != null)
return o.append(env.get(name));
String qname = "\"" + name + "\"";
env.put(name, qname);
o.append("{\"name\":").append(qname);
o.append(",\"type\":\"").append(st.getName()).append("\"");
if (st == Schema.Type.ENUM) {
o.append(",\"symbols\":[");
for (String enumSymbol : s.getEnumSymbols()) {
if (!firstTime)
o.append(',');
else
firstTime = false;
o.append('"').append(enumSymbol).append('"');
}
o.append("]");
} else if (st == Schema.Type.FIXED) {
o.append(",\"size\":").append(Integer.toString(s.getFixedSize()));
} else { // st == Schema.Type.RECORD
o.append(",\"fields\":[");
for (Schema.Field f : s.getFields()) {
if (!firstTime)
o.append(',');
else
firstTime = false;
o.append("{\"name\":\"").append(f.name()).append("\"");
build(env, f.schema(), o.append(",\"type\":")).append("}");
}
o.append("]");
}
return o.append("}");
}
}
final static long EMPTY64 = 0xc15d213aa4d7a795L;
/* An inner class ensures that FP_TABLE initialized only when needed. */
private static class FP64 {
private static final long[] FP_TABLE = new long[256];
static {
for (int i = 0; i < 256; i++) {
long fp = i;
for (int j = 0; j < 8; j++) {
long mask = -(fp & 1L);
fp = (fp >>> 1) ^ (EMPTY64 & mask);
}
FP_TABLE[i] = fp;
}
}
}
}