All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.drill.exec.util.EncodedSchemaPathSet Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.util;


import org.apache.drill.shaded.guava.com.google.common.io.BaseEncoding;
import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.planner.physical.PlannerSettings;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

/**
 * This class provided utility methods to encode and decode a set of user specified
 * SchemaPaths to a set of encoded SchemaPaths with the following properties.
 * 
    *
  1. Valid Drill identifier as per its grammar with only one, root name segment. *
  2. A single identifier can not exceed 1024 characters in length. *
*

* Format of the encoded SchemaPath: *

$$ENC\d\dlt;base32 encoded input paths>
*

* We use Base-32 over Base-64 because the later's charset includes '\' and '+'. */ public class EncodedSchemaPathSet { private static final int ESTIMATED_ENCODED_SIZE = 1024; private static final String ENC_PREFIX = "$$ENC"; private static final String ENC_FORMAT_STRING = ENC_PREFIX + "%02d%s"; private static final int ENC_PREFIX_SIZE = ENC_PREFIX.length() + "00".length(); private static final int MAX_ENC_IDENTIFIER_SIZE = (PlannerSettings.DEFAULT_IDENTIFIER_MAX_LENGTH - ENC_PREFIX_SIZE); private static final int MAX_ENC_IDENTIFIER_COUNT = 100; // "$$ENC00*...$$ENC99*" private static final BaseEncoding CODEC = BaseEncoding.base32().omitPadding(); // no-padding version public static final String ENCODED_STAR_COLUMN = encode("*")[0]; /* * Performance of various methods of encoding a Java String to UTF-8 keeps changing * between releases, hence we'll encapsulate the actual methods within these functions * and use them everywhere in Drill */ private static final String UTF_8 = "utf-8"; private static byte[] encodeUTF(String input) { try { return input.getBytes(UTF_8); } catch (UnsupportedEncodingException e) { throw new DrillRuntimeException(e); // should never come to this } } private static String decodeUTF(byte[] input) { try { return new String(input, UTF_8); } catch (UnsupportedEncodingException e) { throw new DrillRuntimeException(e); // should never come to this } } private static String decodeUTF(byte[] input, int offset, int length) { try { return new String(input, offset, length, UTF_8); } catch (UnsupportedEncodingException e) { throw new DrillRuntimeException(e); // should never come to this } } /** * Returns the encoded array of SchemaPath identifiers from the input array of SchemaPath. *

* The returned identifiers have the following properties: *

    *
  • Each SchemaPath identifier in the array has only one single root NameSegment.
  • *
  • Maximum length of each such identifier is equal to the maximum length of Drill identifier (currently 1024).
  • *
*

* We take advantage of the fact that Java's modified utf-8 encoding can never contain * embedded null byte. * @see http://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8 */ public static String[] encode(final String... schemaPaths) { Preconditions.checkArgument(schemaPaths != null && schemaPaths.length > 0, "At least one schema path should be provided"); NoCopyByteArrayOutputStream out = new NoCopyByteArrayOutputStream(ESTIMATED_ENCODED_SIZE); int bufOffset = 1; // 1st byte is NULL for (String schemaPath : schemaPaths) { out.write(0); out.write(encodeUTF(schemaPath)); } out.close(); final int bufLen = out.size() - 1; // not counting the first NULL byte String encodedStr = CODEC.encode(out.getBuffer(), bufOffset, bufLen); assert !encodedStr.endsWith("=") : String.format("Encoded string '%s' ends with '='", encodedStr); return splitIdentifiers(encodedStr); } public static boolean isEncodedSchemaPath(SchemaPath schemaPath) { return schemaPath != null && isEncodedSchemaPath(schemaPath.getRootSegment().getNameSegment().getPath()); } public static boolean isEncodedSchemaPath(String schemaPath) { return schemaPath != null && schemaPath.startsWith(ENC_PREFIX); } /** * Returns the decoded Collection of SchemaPath from the input which * may contain a mix of encoded and non-encoded SchemaPaths. *

* The size of returned Collection is always equal to or greater than the * input array. *

* The non-encoded SchemaPaths are collated in the beginning to the returned * array, in the same order as that of the input array. */ public static Collection decode(final Collection encodedPaths) { String[] schemaPathStrings = new String[encodedPaths.size()]; Iterator encodedPathsItr = encodedPaths.iterator(); for (int i = 0; i < schemaPathStrings.length; i++) { SchemaPath schemaPath = encodedPathsItr.next(); if (schemaPath.getRootSegmentPath().startsWith(ENC_PREFIX)) { // encoded schema path contains only root segment schemaPathStrings[i] = schemaPath.getRootSegmentPath(); } else { schemaPathStrings[i] = schemaPath.toExpr(); } } String[] decodedStrings = decode(schemaPathStrings); if (decodedStrings == schemaPathStrings) { return encodedPaths; // return the original collection as no encoded SchemaPath was found } else { ImmutableList.Builder builder = new ImmutableList.Builder<>(); for (String decodedString : decodedStrings) { if ("*".equals(decodedString) || "`*`".equals(decodedString)) { builder.add(SchemaPath.STAR_COLUMN); } else { builder.add(SchemaPath.parseFromString(decodedString)); } } return builder.build(); } } /** * Returns the decoded array of SchemaPath strings from the input which * may contain a mix of encoded and non-encoded SchemaPaths. *

* The size of returned array is always equal to or greater than the * input array. *

* The non-encoded SchemaPaths are collated in the beginning to the returned * array, in the same order as that of the input array. */ public static String[] decode(final String... encodedPaths) { Preconditions.checkArgument(encodedPaths != null && encodedPaths.length > 0, "At least one encoded path should be provided"); StringBuilder sb = new StringBuilder(ESTIMATED_ENCODED_SIZE); // As the encoded schema path move across components, they could get reordered. // Sorting ensures that the original order is restored before concatenating the // components back to the full encoded String. Arrays.sort(encodedPaths); List decodedPathList = Lists.newArrayList(); for (String encodedPath : encodedPaths) { if (encodedPath.startsWith(ENC_PREFIX)) { sb.append(encodedPath, ENC_PREFIX_SIZE, encodedPath.length()); } else { decodedPathList.add(encodedPath); } } if (sb.length() > 0) { byte[] decodedBytes; try { decodedBytes = CODEC.decode(sb); } catch (IllegalArgumentException e) { throw new DrillRuntimeException(String.format( "Unable to decode the input strings as encoded schema paths:\n%s", Arrays.asList(encodedPaths)), e); } int start = 0, index = 0; for (; index < decodedBytes.length; index++) { if (decodedBytes[index] == 0 && index - start > 0) { decodedPathList.add(decodeUTF(decodedBytes, start, index-start)); start = index + 1; } } if (index - start > 0) { String lastSchemaPath = decodeUTF(decodedBytes, start, index-start).trim(); if (!lastSchemaPath.isEmpty()) { decodedPathList.add(lastSchemaPath); } } return decodedPathList.toArray(new String[decodedPathList.size()]); } else { // original list did not have any encoded path, return as is return encodedPaths; } } /** * Splits the input string so that the length of each encoded string, * including the signature prefix is less than or equal to MAX_DRILL_IDENTIFIER_SIZE. 
*/ private static String[] splitIdentifiers(String input) { if (input.length() < MAX_ENC_IDENTIFIER_SIZE) { return new String[] { String.format(ENC_FORMAT_STRING, 0, input) }; } int splitsCount = (int) Math.ceil(input.length() / (double)MAX_ENC_IDENTIFIER_SIZE); if (splitsCount > MAX_ENC_IDENTIFIER_COUNT) { throw new DrillRuntimeException(String.format( "Encoded size of the SchemaPath identifier '%s' exceeded maximum value.", input)); } String[] result = new String[splitsCount]; for (int i = 0, startIdx = 0; i < result.length; i++, startIdx += MAX_ENC_IDENTIFIER_SIZE) { // TODO: see if we can avoid memcpy due to input.substring() call result[i] = String.format(ENC_FORMAT_STRING, i, input.substring(startIdx, Math.min(input.length(), startIdx + MAX_ENC_IDENTIFIER_SIZE))); } return result; } /** * Optimized version of Java's ByteArrayOutputStream which returns the underlying * byte array instead of making a copy */ private static class NoCopyByteArrayOutputStream extends ByteArrayOutputStream { public NoCopyByteArrayOutputStream(int size) { super(size); } public byte[] getBuffer() { return buf; } public int size() { return count; } @Override public void write(int b) { super.write(b); } @Override public void write(byte[] b) { super.write(b, 0, b.length); } @Override public void close() { try { super.close(); } catch (IOException e) { throw new DrillRuntimeException(e); // should never come to this } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy