All downloads are free. The search and download features use the official Maven repository.

org.apache.avro.SchemaNormalization Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro;

import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/** Collection of static methods for generating the cannonical form of
 * schemas (see {@link #toParsingForm}) -- and fingerprints of cannonical
 * forms ({@link #fingerprint}).
 */
public class SchemaNormalization {

  private SchemaNormalization() {}

  /** Returns "Parsing Canonical Form" of a schema as defined by Avro
    * spec. */
  public static String toParsingForm(Schema s) {
    try {
      Map env = new HashMap();
      return build(env, s, new StringBuilder()).toString();
    } catch (IOException e) {
      // Shouldn't happen, b/c StringBuilder can't throw IOException
      throw new RuntimeException(e);
    }
  }

  /** Returns a fingerprint of a string of bytes.  This string is
    * presumed to contain a canonical form of a schema.  The
    * algorithm used to compute the fingerprint is selected by the
    * argument fpName.  If fpName equals the string
    * "CRC-64-AVRO", then the result of {@link #fingerprint64} is
    * returned in little-endian format.  Otherwise, fpName is
    * used as an algorithm name for {@link
    * MessageDigest#getInstance(String)}, which will throw
    * NoSuchAlgorithmException if it doesn't recognize
    * the name.
    * 

Recommended Avro practice dictiates that * "CRC-64-AVRO" is used for 64-bit fingerprints, * "MD5" is used for 128-bit fingerprints, and * "SHA-256" is used for 256-bit fingerprints. */ public static byte[] fingerprint(String fpName, byte[] data) throws NoSuchAlgorithmException { if (fpName.equals("CRC-64-AVRO")) { long fp = fingerprint64(data); byte[] result = new byte[8]; for (int i = 0; i < 8; i++) { result[i] = (byte)fp; fp >>= 8; } return result; } MessageDigest md = MessageDigest.getInstance(fpName); return md.digest(data); } /** Returns the 64-bit Rabin Fingerprint (as recommended in the Avro * spec) of a byte string. */ public static long fingerprint64(byte[] data) { long result = EMPTY64; for (byte b: data) result = (result >>> 8) ^ FP64.FP_TABLE[(int)(result ^ b) & 0xff]; return result; } /** Returns {@link #fingerprint} applied to the parsing canonical form * of the supplied schema. */ public static byte[] parsingFingerprint(String fpName, Schema s) throws NoSuchAlgorithmException { try { return fingerprint(fpName, toParsingForm(s).getBytes("UTF-8")); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } /** Returns {@link #fingerprint64} applied to the parsing canonical form * of the supplied schema. */ public static long parsingFingerprint64(Schema s) { try { return fingerprint64(toParsingForm(s).getBytes("UTF-8")); } catch (java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); } } private static Appendable build(Map env, Schema s, Appendable o) throws IOException { boolean firstTime = true; Schema.Type st = s.getType(); switch (st) { default: // boolean, bytes, double, float, int, long, null, string return o.append('"').append(st.getName()).append('"'); case UNION: o.append('['); for (Schema b: s.getTypes()) { if (! 
firstTime) o.append(','); else firstTime = false; build(env, b, o); } return o.append(']'); case ARRAY: case MAP: o.append("{\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ARRAY) build(env, s.getElementType(), o.append(",\"items\":")); else build(env, s.getValueType(), o.append(",\"values\":")); return o.append("}"); case ENUM: case FIXED: case RECORD: String name = s.getFullName(); if (env.get(name) != null) return o.append(env.get(name)); String qname = "\""+name+"\""; env.put(name, qname); o.append("{\"name\":").append(qname); o.append(",\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ENUM) { o.append(",\"symbols\":["); for (String enumSymbol: s.getEnumSymbols()) { if (! firstTime) o.append(','); else firstTime = false; o.append('"').append(enumSymbol).append('"'); } o.append("]"); } else if (st == Schema.Type.FIXED) { o.append(",\"size\":").append(Integer.toString(s.getFixedSize())); } else { // st == Schema.Type.RECORD o.append(",\"fields\":["); for (Schema.Field f: s.getFields()) { if (! firstTime) o.append(','); else firstTime = false; o.append("{\"name\":\"").append(f.name()).append("\""); build(env, f.schema(), o.append(",\"type\":")).append("}"); } o.append("]"); } return o.append("}"); } } final static long EMPTY64 = 0xc15d213aa4d7a795L; /* An inner class ensures that FP_TABLE initialized only when needed. */ private static class FP64 { private static final long[] FP_TABLE = new long[256]; static { for (int i = 0; i < 256; i++) { long fp = i; for (int j = 0; j < 8; j++) { long mask = -(fp & 1L); fp = (fp >>> 1) ^ (EMPTY64 & mask); } FP_TABLE[i] = fp; } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy