All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.avro.SchemaNormalization Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro;

import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/**
 * Collection of static methods for generating the canonical form of schemas
 * (see {@link #toParsingForm}) -- and fingerprints of canonical forms
 * ({@link #fingerprint}).
 */
public class SchemaNormalization {

  private SchemaNormalization() {
  }

  /**
   * Returns "Parsing Canonical Form" of a schema as defined by Avro spec.
   */
  public static String toParsingForm(Schema s) {
    try {
      Map env = new HashMap<>();
      return build(env, s, new StringBuilder()).toString();
    } catch (IOException e) {
      // Shouldn't happen, b/c StringBuilder can't throw IOException
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns a fingerprint of a string of bytes. This string is presumed to
   * contain a canonical form of a schema. The algorithm used to compute the
   * fingerprint is selected by the argument fpName. If fpName
   * equals the string "CRC-64-AVRO", then the result of
   * {@link #fingerprint64} is returned in little-endian format. Otherwise,
   * fpName is used as an algorithm name for
   * {@link MessageDigest#getInstance(String)}, which will throw
   * NoSuchAlgorithmException if it doesn't recognize the name.
   * 

* Recommended Avro practice dictates that "CRC-64-AVRO" is used * for 64-bit fingerprints, "MD5" is used for 128-bit fingerprints, * and "SHA-256" is used for 256-bit fingerprints. */ public static byte[] fingerprint(String fpName, byte[] data) throws NoSuchAlgorithmException { if (fpName.equals("CRC-64-AVRO")) { long fp = fingerprint64(data); byte[] result = new byte[8]; for (int i = 0; i < 8; i++) { result[i] = (byte) fp; fp >>= 8; } return result; } MessageDigest md = MessageDigest.getInstance(fpName); return md.digest(data); } /** * Returns the 64-bit Rabin Fingerprint (as recommended in the Avro spec) of a * byte string. */ public static long fingerprint64(byte[] data) { long result = EMPTY64; for (byte b : data) result = (result >>> 8) ^ FP64.FP_TABLE[(int) (result ^ b) & 0xff]; return result; } /** * Returns {@link #fingerprint} applied to the parsing canonical form of the * supplied schema. */ public static byte[] parsingFingerprint(String fpName, Schema s) throws NoSuchAlgorithmException { return fingerprint(fpName, toParsingForm(s).getBytes(StandardCharsets.UTF_8)); } /** * Returns {@link #fingerprint64} applied to the parsing canonical form of the * supplied schema. 
*/ public static long parsingFingerprint64(Schema s) { return fingerprint64(toParsingForm(s).getBytes(StandardCharsets.UTF_8)); } private static Appendable build(Map env, Schema s, Appendable o) throws IOException { boolean firstTime = true; Schema.Type st = s.getType(); switch (st) { default: // boolean, bytes, double, float, int, long, null, string return o.append('"').append(st.getName()).append('"'); case UNION: o.append('['); for (Schema b : s.getTypes()) { if (!firstTime) o.append(','); else firstTime = false; build(env, b, o); } return o.append(']'); case ARRAY: case MAP: o.append("{\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ARRAY) build(env, s.getElementType(), o.append(",\"items\":")); else build(env, s.getValueType(), o.append(",\"values\":")); return o.append("}"); case ENUM: case FIXED: case RECORD: String name = s.getFullName(); if (env.get(name) != null) return o.append(env.get(name)); String qname = "\"" + name + "\""; env.put(name, qname); o.append("{\"name\":").append(qname); o.append(",\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ENUM) { o.append(",\"symbols\":["); for (String enumSymbol : s.getEnumSymbols()) { if (!firstTime) o.append(','); else firstTime = false; o.append('"').append(enumSymbol).append('"'); } o.append("]"); } else if (st == Schema.Type.FIXED) { o.append(",\"size\":").append(Integer.toString(s.getFixedSize())); } else { // st == Schema.Type.RECORD o.append(",\"fields\":["); for (Schema.Field f : s.getFields()) { if (!firstTime) o.append(','); else firstTime = false; o.append("{\"name\":\"").append(f.name()).append("\""); build(env, f.schema(), o.append(",\"type\":")).append("}"); } o.append("]"); } return o.append("}"); } } final static long EMPTY64 = 0xc15d213aa4d7a795L; /* An inner class ensures that FP_TABLE initialized only when needed. 
*/ private static class FP64 { private static final long[] FP_TABLE = new long[256]; static { for (int i = 0; i < 256; i++) { long fp = i; for (int j = 0; j < 8; j++) { long mask = -(fp & 1L); fp = (fp >>> 1) ^ (EMPTY64 & mask); } FP_TABLE[i] = fp; } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy