All Downloads are FREE. Search and download functionalities are using the official Maven repository.

software.amazon.s3tables.iceberg.S3TablesLocationProvider Maven / Gradle / Ivy

There is a newer version: 0.1.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package software.amazon.s3tables.iceberg;

import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.io.LocationProvider;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.hash.HashCode;
import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
import org.apache.iceberg.util.LocationUtil;

/**
 * A copy of https://code.amazon.com/packages/Aws157Iceberg/commits/c764ee512b20e298e7a426d12458d658e74d0d43
 * which is in upstream Iceberg, but not yet in all older Iceberg versions.
 * 
 * This location provider provides data locations that are optimized for S3 performance. Both
 * General Purpose buckets and Directory buckets will see better throughput and autoscaling behavior
 * than using the generic ObjectStoreLocationProvider.
 *
 * 

The data location is resolved as follows. Data files are written directly at this path with no * other intermediate directories created. * *

    *
  1. {@link TableProperties#WRITE_DATA_LOCATION} *
  2. tableLocation + "/data" *
* * The data file is placed immediately under the data location. Partition names are not * included. The data filename is prefixed with a 24-character binary hash, which ensures that files * written to S3 are equally distributed across many prefixes in the S3 bucket. * *

For example, with tableLocation s3://my-bucket/my-table, an example data file * could look like * s3://my-bucket/my-table/data/011101101010001111101000-00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet * . */ public class S3TablesLocationProvider implements LocationProvider { private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); // the starting index of the lower 24-bits of a 32-bit binary string private static final int HASH_BINARY_STRING_START_INDEX = 8; private final String storageLocation; public S3TablesLocationProvider(String tableLocation, Map properties) { this.storageLocation = LocationUtil.stripTrailingSlash(dataLocation(properties, tableLocation)); } @Override public String newDataLocation(PartitionSpec spec, StructLike partitionData, String filename) { return newDataLocation(filename); } @Override public String newDataLocation(String filename) { String hash = computeHash(filename); return String.format("%s/%s-%s", storageLocation, hash, filename); } private static String dataLocation(Map properties, String tableLocation) { String dataLocation = properties.get(TableProperties.WRITE_DATA_LOCATION); if (dataLocation == null) { dataLocation = String.format("%s/data", tableLocation); } return dataLocation; } @VisibleForTesting String computeHash(String fileName) { HashCode hashCode = HASH_FUNC.hashString(fileName, StandardCharsets.UTF_8); int hash = hashCode.asInt(); // {@link Integer#toBinaryString} excludes leading zeros, which we want to preserve. // force the first bit to be set to get around that. String hashAsBinaryString = Integer.toBinaryString(hash | Integer.MIN_VALUE); return hashAsBinaryString.substring(HASH_BINARY_STRING_START_INDEX); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy