// org.apache.avro.SchemaNormalization Maven / Gradle / Ivy
// Show all versions of spark-core Show documentation
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/** Collection of static methods for generating the canonical form of
* schemas (see {@link #toParsingForm}) -- and fingerprints of canonical
* forms ({@link #fingerprint}).
*/
public class SchemaNormalization {

  /** Algorithm name that selects the 64-bit Rabin fingerprint in {@link #fingerprint}. */
  private static final String CRC_64_AVRO = "CRC-64-AVRO";

  /** Charset name used to encode a canonical form before fingerprinting it. */
  private static final String UTF_8 = "UTF-8";

  /**
   * Starting value of {@link #fingerprint64} (and therefore the fingerprint
   * of an empty byte string). The same constant is XOR-ed in while the
   * lookup table in {@link FP64} is generated.
   */
  static final long EMPTY64 = 0xc15d213aa4d7a795L;

  private SchemaNormalization() {}

  /**
   * Returns "Parsing Canonical Form" of a schema as defined by Avro spec.
   *
   * @param s the schema to normalize
   * @return the canonical JSON text of {@code s}
   */
  public static String toParsingForm(Schema s) {
    try {
      // Maps the full name of each named schema already written to the quoted
      // name that is emitted for every later reference to it.
      Map<String, String> env = new HashMap<String, String>();
      return build(env, s, new StringBuilder()).toString();
    } catch (IOException e) {
      // Shouldn't happen, b/c StringBuilder can't throw IOException
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns a fingerprint of a string of bytes. This string is presumed to
   * contain a canonical form of a schema. The algorithm used to compute the
   * fingerprint is selected by the argument <i>fpName</i>. If <i>fpName</i>
   * equals the string {@code "CRC-64-AVRO"}, then the result of
   * {@link #fingerprint64} is returned in little-endian format. Otherwise,
   * <i>fpName</i> is used as an algorithm name for
   * {@link MessageDigest#getInstance(String)}, which will throw
   * {@code NoSuchAlgorithmException} if it doesn't recognize the name.
   *
   * <p>Recommended Avro practice dictates that {@code "CRC-64-AVRO"} is used
   * for 64-bit fingerprints, {@code "MD5"} is used for 128-bit fingerprints,
   * and {@code "SHA-256"} is used for 256-bit fingerprints.
   *
   * @param fpName name of the fingerprint algorithm
   * @param data bytes to fingerprint
   * @return the fingerprint bytes
   * @throws NoSuchAlgorithmException if {@code fpName} is neither
   *     {@code "CRC-64-AVRO"} nor known to {@link MessageDigest}
   */
  public static byte[] fingerprint(String fpName, byte[] data)
      throws NoSuchAlgorithmException {
    if (fpName.equals(CRC_64_AVRO)) {
      long fp = fingerprint64(data);
      byte[] result = new byte[8];
      // Least-significant byte first, i.e. little-endian.
      for (int i = 0; i < result.length; i++) {
        result[i] = (byte) fp;
        fp >>= 8;
      }
      return result;
    }

    MessageDigest md = MessageDigest.getInstance(fpName);
    return md.digest(data);
  }

  /**
   * Returns the 64-bit Rabin Fingerprint (as recommended in the Avro spec) of
   * a byte string.
   *
   * @param data bytes to fingerprint
   * @return the fingerprint, starting from {@link #EMPTY64}
   */
  public static long fingerprint64(byte[] data) {
    long result = EMPTY64;
    for (byte b : data) {
      // The cast applies before the mask, so the index is always 0..255 even
      // though b is sign-extended.
      result = (result >>> 8) ^ FP64.FP_TABLE[(int) (result ^ b) & 0xff];
    }
    return result;
  }

  /**
   * Returns {@link #fingerprint} applied to the UTF-8 bytes of the parsing
   * canonical form of the supplied schema.
   *
   * @param fpName name of the fingerprint algorithm, as for {@link #fingerprint}
   * @param s the schema to fingerprint
   * @return the fingerprint bytes
   * @throws NoSuchAlgorithmException if {@code fpName} is not recognized
   */
  public static byte[] parsingFingerprint(String fpName, Schema s)
      throws NoSuchAlgorithmException {
    return fingerprint(fpName, utf8Bytes(toParsingForm(s)));
  }

  /**
   * Returns {@link #fingerprint64} applied to the UTF-8 bytes of the parsing
   * canonical form of the supplied schema.
   *
   * @param s the schema to fingerprint
   * @return the 64-bit fingerprint
   */
  public static long parsingFingerprint64(Schema s) {
    return fingerprint64(utf8Bytes(toParsingForm(s)));
  }

  /** Encodes {@code text} as UTF-8, rethrowing an unsupported charset unchecked. */
  private static byte[] utf8Bytes(String text) {
    try {
      return text.getBytes(UTF_8);
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Appends the canonical form of {@code s} to {@code o}, recursing into
   * nested schemas.
   *
   * @param env full names of the named schemas written so far, mapped to the
   *     quoted name emitted when one of them is referenced again
   * @param s the schema to write
   * @param o the sink to append to
   * @return {@code o}, to allow chaining
   * @throws IOException if {@code o} fails to accept output
   */
  private static Appendable build(Map<String, String> env, Schema s,
                                  Appendable o) throws IOException {
    // Separator state for whichever single list this call writes.
    boolean firstTime = true;
    Schema.Type st = s.getType();
    switch (st) {
    default: // boolean, bytes, double, float, int, long, null, string
      return o.append('"').append(st.getName()).append('"');

    case UNION:
      o.append('[');
      for (Schema b : s.getTypes()) {
        if (!firstTime) {
          o.append(',');
        } else {
          firstTime = false;
        }
        build(env, b, o);
      }
      return o.append(']');

    case ARRAY:
    case MAP:
      o.append("{\"type\":\"").append(st.getName()).append("\"");
      if (st == Schema.Type.ARRAY) {
        build(env, s.getElementType(), o.append(",\"items\":"));
      } else {
        build(env, s.getValueType(), o.append(",\"values\":"));
      }
      return o.append("}");

    case ENUM:
    case FIXED:
    case RECORD:
      String name = s.getFullName();
      // A named schema is written in full only once; later occurrences are
      // replaced by its quoted name.
      String seen = env.get(name);
      if (seen != null) {
        return o.append(seen);
      }
      String qname = "\"" + name + "\"";
      // Registered before the body is written so that recursive references
      // inside the body resolve to the name.
      env.put(name, qname);
      o.append("{\"name\":").append(qname);
      o.append(",\"type\":\"").append(st.getName()).append("\"");
      if (st == Schema.Type.ENUM) {
        o.append(",\"symbols\":[");
        for (String enumSymbol : s.getEnumSymbols()) {
          if (!firstTime) {
            o.append(',');
          } else {
            firstTime = false;
          }
          o.append('"').append(enumSymbol).append('"');
        }
        o.append("]");
      } else if (st == Schema.Type.FIXED) {
        o.append(",\"size\":").append(Integer.toString(s.getFixedSize()));
      } else { // st == Schema.Type.RECORD
        o.append(",\"fields\":[");
        for (Schema.Field f : s.getFields()) {
          if (!firstTime) {
            o.append(',');
          } else {
            firstTime = false;
          }
          o.append("{\"name\":\"").append(f.name()).append("\"");
          build(env, f.schema(), o.append(",\"type\":")).append("}");
        }
        o.append("]");
      }
      return o.append("}");
    }
  }

  /* A nested class ensures that FP_TABLE is initialized only when needed. */
  private static class FP64 {
    /** Per-byte lookup table consumed by {@link SchemaNormalization#fingerprint64}. */
    private static final long[] FP_TABLE = new long[256];

    static {
      for (int i = 0; i < 256; i++) {
        long fp = i;
        for (int j = 0; j < 8; j++) {
          // mask is all ones when the low bit of fp is set, else zero, so
          // EMPTY64 is XOR-ed in only for a set bit.
          long mask = -(fp & 1L);
          fp = (fp >>> 1) ^ (EMPTY64 & mask);
        }
        FP_TABLE[i] = fp;
      }
    }
  }
}