io.trino.filesystem.azure.AzureFileSystem

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.filesystem.azure;

import com.azure.core.http.HttpClient;
import com.azure.core.http.rest.PagedIterable;
import com.azure.core.util.ClientOptions;
import com.azure.core.util.TracingOptions;
import com.azure.storage.blob.BlobClient;
import com.azure.storage.blob.BlobContainerClient;
import com.azure.storage.blob.BlobContainerClientBuilder;
import com.azure.storage.blob.models.BlobItem;
import com.azure.storage.blob.models.ListBlobsOptions;
import com.azure.storage.common.Utility;
import com.azure.storage.file.datalake.DataLakeDirectoryClient;
import com.azure.storage.file.datalake.DataLakeFileClient;
import com.azure.storage.file.datalake.DataLakeFileSystemClient;
import com.azure.storage.file.datalake.DataLakeServiceClient;
import com.azure.storage.file.datalake.DataLakeServiceClientBuilder;
import com.azure.storage.file.datalake.models.DataLakeRequestConditions;
import com.azure.storage.file.datalake.models.DataLakeStorageException;
import com.azure.storage.file.datalake.models.ListPathsOptions;
import com.azure.storage.file.datalake.models.PathItem;
import com.azure.storage.file.datalake.options.DataLakePathDeleteOptions;
import com.google.common.collect.ImmutableSet;
import io.airlift.units.DataSize;
import io.trino.filesystem.FileIterator;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.filesystem.TrinoInputFile;
import io.trino.filesystem.TrinoOutputFile;

import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.Set;

import static com.azure.storage.common.implementation.Constants.HeaderConstants.ETAG_WILDCARD;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static io.trino.filesystem.azure.AzureUtils.handleAzureException;
import static io.trino.filesystem.azure.AzureUtils.isFileNotFoundException;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;
import static java.util.UUID.randomUUID;
import static java.util.function.Predicate.not;

public class AzureFileSystem
        implements TrinoFileSystem
{
    private final HttpClient httpClient;
    private final TracingOptions tracingOptions;
    private final AzureAuth azureAuth;
    private final int readBlockSizeBytes;
    private final long writeBlockSizeBytes;
    private final int maxWriteConcurrency;
    private final long maxSingleUploadSizeBytes;

    public AzureFileSystem(
            HttpClient httpClient,
            TracingOptions tracingOptions,
            AzureAuth azureAuth,
            DataSize readBlockSize,
            DataSize writeBlockSize,
            int maxWriteConcurrency,
            DataSize maxSingleUploadSize)
    {
        this.httpClient = requireNonNull(httpClient, "httpClient is null");
        this.tracingOptions = requireNonNull(tracingOptions, "tracingOptions is null");
        this.azureAuth = requireNonNull(azureAuth, "azureAuth is null");
        this.readBlockSizeBytes = toIntExact(readBlockSize.toBytes());
        this.writeBlockSizeBytes = writeBlockSize.toBytes();
        checkArgument(maxWriteConcurrency >= 0, "maxWriteConcurrency is negative");
        this.maxWriteConcurrency = maxWriteConcurrency;
        this.maxSingleUploadSizeBytes = maxSingleUploadSize.toBytes();
    }

    @Override
    public TrinoInputFile newInputFile(Location location)
    {
        AzureLocation azureLocation = new AzureLocation(location);
        BlobClient client = createBlobClient(azureLocation);
        return new AzureInputFile(azureLocation, OptionalLong.empty(), client, readBlockSizeBytes);
    }

    @Override
    public TrinoInputFile newInputFile(Location location, long length)
    {
        AzureLocation azureLocation = new AzureLocation(location);
        BlobClient client = createBlobClient(azureLocation);
        return new AzureInputFile(azureLocation, OptionalLong.of(length), client, readBlockSizeBytes);
    }

    @Override
    public TrinoOutputFile newOutputFile(Location location)
    {
        AzureLocation azureLocation = new AzureLocation(location);
        BlobClient client = createBlobClient(azureLocation);
        return new AzureOutputFile(azureLocation, client, writeBlockSizeBytes, maxWriteConcurrency, maxSingleUploadSizeBytes);
    }

    @Override
    public void deleteFile(Location location)
            throws IOException
    {
        location.verifyValidFileLocation();
        AzureLocation azureLocation = new AzureLocation(location);
        BlobClient client = createBlobClient(azureLocation);
        try {
            client.delete();
        }
        catch (RuntimeException e) {
            if (isFileNotFoundException(e)) {
                return;
            }
            throw handleAzureException(e, "deleting file", azureLocation);
        }
    }

    @Override
    public void deleteDirectory(Location location)
            throws IOException
    {
        AzureLocation azureLocation = new AzureLocation(location);
        try {
            if (isHierarchicalNamespaceEnabled(azureLocation)) {
                deleteGen2Directory(azureLocation);
            }
            else {
                deleteBlobDirectory(azureLocation);
            }
        }
        catch (RuntimeException e) {
            throw handleAzureException(e, "deleting directory", azureLocation);
        }
    }

    private void deleteGen2Directory(AzureLocation location)
            throws IOException
    {
        DataLakeFileSystemClient fileSystemClient = createFileSystemClient(location);
        DataLakePathDeleteOptions deleteRecursiveOptions = new DataLakePathDeleteOptions().setIsRecursive(true);
        if (location.path().isEmpty()) {
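            // deleting at the container root: remove each top-level entry individually, leaving the file system itself intact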
            for (PathItem pathItem : fileSystemClient.listPaths()) {
                if (pathItem.isDirectory()) {
                    fileSystemClient.deleteDirectoryIfExistsWithResponse(pathItem.getName(), deleteRecursiveOptions, null, null);
                }
                else {
                    fileSystemClient.deleteFileIfExists(pathItem.getName());
                }
            }
        }
        else {
            DataLakeDirectoryClient directoryClient = fileSystemClient.getDirectoryClient(location.path());
            if (directoryClient.exists()) {
                if (!directoryClient.getProperties().isDirectory()) {
                    throw new IOException("Location is not a directory: " + location);
                }
                directoryClient.deleteIfExistsWithResponse(deleteRecursiveOptions, null, null);
            }
        }
    }

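    // without a hierarchical namespace there are no real directories: a "directory" is just a
    // shared blob-name prefix, so deleting one means deleting every blob under that prefix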
    private void deleteBlobDirectory(AzureLocation location)
    {
        String path = location.path();
        if (!path.isEmpty() && !path.endsWith("/")) {
            path += "/";
        }
        BlobContainerClient blobContainerClient = createBlobContainerClient(location);
        PagedIterable<BlobItem> blobItems = blobContainerClient.listBlobs(new ListBlobsOptions().setPrefix(path), null);
        for (BlobItem item : blobItems) {
            String blobUrl = Utility.urlEncode(item.getName());
            blobContainerClient.getBlobClient(blobUrl).deleteIfExists();
        }
    }

    @Override
    public void renameFile(Location source, Location target)
            throws IOException
    {
        source.verifyValidFileLocation();
        target.verifyValidFileLocation();

        AzureLocation sourceLocation = new AzureLocation(source);
        AzureLocation targetLocation = new AzureLocation(target);
        if (!sourceLocation.account().equals(targetLocation.account())) {
            throw new IOException("Cannot rename across storage accounts");
        }
        if (!Objects.equals(sourceLocation.container(), targetLocation.container())) {
            throw new IOException("Cannot rename across storage account containers");
        }

        // DFS rename file works with all storage types
        renameGen2File(sourceLocation, targetLocation);
    }

    private void renameGen2File(AzureLocation source, AzureLocation target)
            throws IOException
    {
        try {
            DataLakeFileSystemClient fileSystemClient = createFileSystemClient(source);
            DataLakeFileClient dataLakeFileClient = fileSystemClient.getFileClient(source.path());
            if (dataLakeFileClient.getProperties().isDirectory()) {
                throw new IOException("Rename file from %s to %s, source is a directory".formatted(source, target));
            }

            fileSystemClient.createDirectoryIfNotExists(target.location().parentDirectory().path());
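            // the If-None-Match: * precondition makes the rename fail if the target already exists,
            // rather than silently overwriting it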
            dataLakeFileClient.renameWithResponse(
                    null,
                    target.path(),
                    null,
                    new DataLakeRequestConditions().setIfNoneMatch(ETAG_WILDCARD),
                    null,
                    null);
        }
        catch (RuntimeException e) {
            throw new IOException("Rename file from %s to %s failed".formatted(source, target), e);
        }
    }

    @Override
    public FileIterator listFiles(Location location)
            throws IOException
    {
        AzureLocation azureLocation = new AzureLocation(location);
        try {
            // blob API returns directories as blobs, so it cannot be used when Gen2 is enabled
            return isHierarchicalNamespaceEnabled(azureLocation)
                    ? listGen2Files(azureLocation)
                    : listBlobFiles(azureLocation);
        }
        catch (RuntimeException e) {
            throw handleAzureException(e, "listing files", azureLocation);
        }
    }

    private FileIterator listGen2Files(AzureLocation location)
            throws IOException
    {
        DataLakeFileSystemClient fileSystemClient = createFileSystemClient(location);
        PagedIterable<PathItem> pathItems;
        if (location.path().isEmpty()) {
            pathItems = fileSystemClient.listPaths(new ListPathsOptions().setRecursive(true), null);
        }
        else {
            DataLakeDirectoryClient directoryClient = fileSystemClient.getDirectoryClient(location.path());
            if (!directoryClient.exists()) {
                return FileIterator.empty();
            }
            if (!directoryClient.getProperties().isDirectory()) {
                throw new IOException("Location is not a directory: " + location);
            }
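            // recursive listing without user principal names; the arguments are
            // (recursive, userPrincipalNameReturned, maxResults, timeout)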
            pathItems = directoryClient.listPaths(true, false, null, null);
        }
        return new AzureDataLakeFileIterator(
                location,
                pathItems.stream()
                        .filter(not(PathItem::isDirectory))
                        .iterator());
    }

    private FileIterator listBlobFiles(AzureLocation location)
    {
        String path = location.path();
        if (!path.isEmpty() && !path.endsWith("/")) {
            path += "/";
        }
        PagedIterable<BlobItem> blobItems = createBlobContainerClient(location).listBlobs(new ListBlobsOptions().setPrefix(path), null);
        return new AzureBlobFileIterator(location, blobItems.iterator());
    }

    @Override
    public Optional<Boolean> directoryExists(Location location)
            throws IOException
    {
        AzureLocation azureLocation = new AzureLocation(location);
        if (location.path().isEmpty()) {
            return Optional.of(true);
        }
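        // flat namespace: directories exist only implicitly, as prefixes of blob names, so the
        // presence of any file under the prefix means "exists" and absence is indeterminate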
        if (!isHierarchicalNamespaceEnabled(azureLocation)) {
            if (listFiles(location).hasNext()) {
                return Optional.of(true);
            }
            return Optional.empty();
        }

        try {
            DataLakeFileSystemClient fileSystemClient = createFileSystemClient(azureLocation);
            DataLakeFileClient fileClient = fileSystemClient.getFileClient(azureLocation.path());
            return Optional.of(fileClient.getProperties().isDirectory());
        }
        catch (DataLakeStorageException e) {
            if (e.getStatusCode() == 404) {
                return Optional.of(false);
            }
            throw handleAzureException(e, "checking directory existence", azureLocation);
        }
        catch (RuntimeException e) {
            throw handleAzureException(e, "checking directory existence", azureLocation);
        }
    }

    @Override
    public void createDirectory(Location location)
            throws IOException
    {
        AzureLocation azureLocation = new AzureLocation(location);
        if (!isHierarchicalNamespaceEnabled(azureLocation)) {
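            // flat namespace has no real directories to create; they exist implicitly via blob name prefixes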
            return;
        }
        try {
            DataLakeFileSystemClient fileSystemClient = createFileSystemClient(azureLocation);
            DataLakeDirectoryClient directoryClient = fileSystemClient.createDirectoryIfNotExists(azureLocation.path());
            if (!directoryClient.getProperties().isDirectory()) {
                throw new IOException("Location is not a directory: " + azureLocation);
            }
        }
        catch (RuntimeException e) {
            throw handleAzureException(e, "creating directory", azureLocation);
        }
    }

    @Override
    public void renameDirectory(Location source, Location target)
            throws IOException
    {
        AzureLocation sourceLocation = new AzureLocation(source);
        AzureLocation targetLocation = new AzureLocation(target);
        if (!sourceLocation.account().equals(targetLocation.account())) {
            throw new IOException("Cannot rename across storage accounts");
        }
        if (!Objects.equals(sourceLocation.container(), targetLocation.container())) {
            throw new IOException("Cannot rename across storage account containers");
        }
        if (!isHierarchicalNamespaceEnabled(sourceLocation)) {
            throw new IOException("Azure non-hierarchical does not support directory renames");
        }
        if (sourceLocation.path().isEmpty() || targetLocation.path().isEmpty()) {
            throw new IOException("Cannot rename %s to %s".formatted(source, target));
        }

        try {
            DataLakeFileSystemClient fileSystemClient = createFileSystemClient(sourceLocation);
            DataLakeDirectoryClient directoryClient = fileSystemClient.getDirectoryClient(sourceLocation.path());
            if (!directoryClient.exists()) {
                throw new IOException("Source directory does not exist: " + source);
            }
            if (!directoryClient.getProperties().isDirectory()) {
                throw new IOException("Source is not a directory: " + source);
            }
            directoryClient.rename(null, targetLocation.path());
        }
        catch (RuntimeException e) {
            throw new IOException("Rename directory from %s to %s failed".formatted(source, target), e);
        }
    }

    @Override
    public Set<Location> listDirectories(Location location)
            throws IOException
    {
        AzureLocation azureLocation = new AzureLocation(location);
        try {
            // blob API returns directories as blobs, so it cannot be used when Gen2 is enabled
            return isHierarchicalNamespaceEnabled(azureLocation)
                    ? listGen2Directories(azureLocation)
                    : listBlobDirectories(azureLocation);
        }
        catch (RuntimeException e) {
            throw handleAzureException(e, "listing files", azureLocation);
        }
    }

    @Override
    public Optional<Location> createTemporaryDirectory(Location targetPath, String temporaryPrefix, String relativePrefix)
            throws IOException
    {
        AzureLocation azureLocation = new AzureLocation(targetPath);
        if (!isHierarchicalNamespaceEnabled(azureLocation)) {
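            // flat namespace cannot provide a real directory to create now and atomically rename later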
            return Optional.empty();
        }

        // allow for absolute or relative temporary prefix
        Location temporary;
        if (temporaryPrefix.startsWith("/")) {
            String prefix = temporaryPrefix;
            while (prefix.startsWith("/")) {
                prefix = prefix.substring(1);
            }
            temporary = azureLocation.baseLocation().appendPath(prefix);
        }
        else {
            temporary = targetPath.appendPath(temporaryPrefix);
        }

        temporary = temporary.appendPath(randomUUID().toString());

        createDirectory(temporary);
        return Optional.of(temporary);
    }

    private Set<Location> listGen2Directories(AzureLocation location)
            throws IOException
    {
        DataLakeFileSystemClient fileSystemClient = createFileSystemClient(location);
        PagedIterable<PathItem> pathItems;
        if (location.path().isEmpty()) {
            pathItems = fileSystemClient.listPaths();
        }
        else {
            DataLakeDirectoryClient directoryClient = fileSystemClient.getDirectoryClient(location.path());
            if (!directoryClient.exists()) {
                return ImmutableSet.of();
            }
            if (!directoryClient.getProperties().isDirectory()) {
                throw new IOException("Location is not a directory: " + location);
            }
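            // non-recursive listing: only the immediate children of this directory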
            pathItems = directoryClient.listPaths(false, false, null, null);
        }
        Location baseLocation = location.baseLocation();
        return pathItems.stream()
                .filter(PathItem::isDirectory)
                .map(item -> baseLocation.appendPath(item.getName() + "/"))
                .collect(toImmutableSet());
    }

    private Set<Location> listBlobDirectories(AzureLocation location)
    {
        String path = location.path();
        if (!path.isEmpty() && !path.endsWith("/")) {
            path += "/";
        }
        Location baseLocation = location.baseLocation();
        return createBlobContainerClient(location)
                .listBlobsByHierarchy(path).stream()
                .filter(BlobItem::isPrefix)
                .map(item -> baseLocation.appendPath(item.getName()))
                .collect(toImmutableSet());
    }

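    // probes for a hierarchical namespace (ADLS Gen2): the root directory is only visible
    // through the DFS endpoint when hierarchical namespace is enabled on the account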
    private boolean isHierarchicalNamespaceEnabled(AzureLocation location)
            throws IOException
    {
        try {
            DataLakeFileSystemClient fileSystemClient = createFileSystemClient(location);
            return fileSystemClient.getDirectoryClient("/").exists();
        }
        catch (RuntimeException e) {
            throw new IOException("Checking whether hierarchical namespace is enabled for the location %s failed".formatted(location), e);
        }
    }

    private BlobClient createBlobClient(AzureLocation location)
    {
        // encode the path using the Azure url encoder utility
        String path = Utility.urlEncode(location.path());
        return createBlobContainerClient(location).getBlobClient(path);
    }

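    // the blob endpoint (https://<account>.blob.core.windows.net) backs the flat-namespace code
    // paths; createFileSystemClient below targets the DFS endpoint used for Gen2 operations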
    private BlobContainerClient createBlobContainerClient(AzureLocation location)
    {
        requireNonNull(location, "location is null");

        BlobContainerClientBuilder builder = new BlobContainerClientBuilder()
                .httpClient(httpClient)
                .clientOptions(new ClientOptions().setTracingOptions(tracingOptions))
                .endpoint(String.format("https://%s.blob.core.windows.net", location.account()));
        azureAuth.setAuth(location.account(), builder);
        location.container().ifPresent(builder::containerName);
        return builder.buildClient();
    }

    private DataLakeFileSystemClient createFileSystemClient(AzureLocation location)
    {
        requireNonNull(location, "location is null");

        DataLakeServiceClientBuilder builder = new DataLakeServiceClientBuilder()
                .httpClient(httpClient)
                .clientOptions(new ClientOptions().setTracingOptions(tracingOptions))
                .endpoint(String.format("https://%s.dfs.core.windows.net", location.account()));
        azureAuth.setAuth(location.account(), builder);
        DataLakeServiceClient client = builder.buildClient();
        DataLakeFileSystemClient fileSystemClient = client.getFileSystemClient(location.container().orElseThrow());
        if (!fileSystemClient.exists()) {
            throw new IllegalArgumentException("Container does not exist: " + location.location());
        }
        return fileSystemClient;
    }
}
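
For context, a minimal usage sketch (not part of the original file). It assumes the Netty-backed HttpClient from azure-core-http-netty, the no-arg TracingOptions constructor from recent azure-core versions, and AzureAuthAccessKey as the access-key AzureAuth implementation shipped alongside this class; the abfs location format follows Trino's Azure file system conventions. Treat it as a sketch under those assumptions, not a definitive setup.

import com.azure.core.http.netty.NettyAsyncHttpClientBuilder;
import com.azure.core.util.TracingOptions;
import io.airlift.units.DataSize;
import io.trino.filesystem.FileIterator;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;

import static io.airlift.units.DataSize.Unit.MEGABYTE;

public class AzureFileSystemExample
{
    public static void main(String[] args)
            throws Exception
    {
        TrinoFileSystem fileSystem = new AzureFileSystem(
                new NettyAsyncHttpClientBuilder().build(),
                new TracingOptions(),                        // plain no-op tracing; production code would wire in OpenTelemetry
                new AzureAuthAccessKey("<storage-access-key>"), // assumed AzureAuth implementation from this module
                DataSize.of(4, MEGABYTE),    // readBlockSize
                DataSize.of(4, MEGABYTE),    // writeBlockSize
                8,                           // maxWriteConcurrency
                DataSize.of(4, MEGABYTE));   // maxSingleUploadSize

        // list every file under a prefix: abfs://<container>@<account>.dfs.core.windows.net/<path>
        FileIterator files = fileSystem.listFiles(
                Location.of("abfs://container@account.dfs.core.windows.net/warehouse/"));
        while (files.hasNext()) {
            System.out.println(files.next().location());
        }
    }
}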