/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.fs.hadoop;

import org.dinky.shaded.paimon.annotation.VisibleForTesting;
import org.dinky.shaded.paimon.catalog.CatalogContext;
import org.dinky.shaded.paimon.fs.FileIO;
import org.dinky.shaded.paimon.fs.FileStatus;
import org.dinky.shaded.paimon.fs.Path;
import org.dinky.shaded.paimon.fs.PositionOutputStream;
import org.dinky.shaded.paimon.fs.SeekableInputStream;
import org.dinky.shaded.paimon.hadoop.SerializableConfiguration;
import org.dinky.shaded.paimon.utils.ReflectionUtils;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Options;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicReference;

/** Hadoop {@link FileIO}. */
public class HadoopFileIO implements FileIO {

    private static final long serialVersionUID = 1L;

    protected SerializableConfiguration hadoopConf;

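    // Cached FileSystem handle: transient because Hadoop's FileSystem is not
    // serializable, and volatile so the double-checked locking in getFileSystem(...)
    // can publish it safely across threads.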
    protected transient volatile FileSystem fs;

    @VisibleForTesting
    public void setFileSystem(FileSystem fs) {
        this.fs = fs;
    }

    @Override
    public boolean isObjectStore() {
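        // Hadoop-style filesystems such as HDFS provide real directories and atomic
        // rename, unlike object stores (e.g. S3/OSS), so this FileIO does not report
        // itself as an object store.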
        return false;
    }

    @Override
    public void configure(CatalogContext context) {
        this.hadoopConf = new SerializableConfiguration(context.hadoopConf());
    }
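
    // Usage sketch (illustrative; not part of the original class). A HadoopFileIO is
    // typically configured once from a CatalogContext and then reused. The
    // CatalogContext.create(Options) factory shown below is an assumption for the
    // example, not something this file defines.
    //
    //   HadoopFileIO fileIO = new HadoopFileIO();
    //   fileIO.configure(CatalogContext.create(new Options()));
    //   try (SeekableInputStream in = fileIO.newInputStream(new Path("hdfs:///tmp/file"))) {
    //       int firstByte = in.read();
    //   }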

    @Override
    public SeekableInputStream newInputStream(Path path) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        return new HadoopSeekableInputStream(getFileSystem(hadoopPath).open(hadoopPath));
    }

    @Override
    public PositionOutputStream newOutputStream(Path path, boolean overwrite) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        return new HadoopPositionOutputStream(
                getFileSystem(hadoopPath).create(hadoopPath, overwrite));
    }

    @Override
    public FileStatus getFileStatus(Path path) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        return new HadoopFileStatus(getFileSystem(hadoopPath).getFileStatus(hadoopPath));
    }

    @Override
    public FileStatus[] listStatus(Path path) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        FileStatus[] statuses = new FileStatus[0];
        org.apache.hadoop.fs.FileStatus[] hadoopStatuses =
                getFileSystem(hadoopPath).listStatus(hadoopPath);
        if (hadoopStatuses != null) {
            statuses = new FileStatus[hadoopStatuses.length];
            for (int i = 0; i < hadoopStatuses.length; i++) {
                statuses[i] = new HadoopFileStatus(hadoopStatuses[i]);
            }
        }
        return statuses;
    }

    @Override
    public boolean exists(Path path) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        return getFileSystem(hadoopPath).exists(hadoopPath);
    }

    @Override
    public boolean delete(Path path, boolean recursive) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        return getFileSystem(hadoopPath).delete(hadoopPath, recursive);
    }

    @Override
    public boolean mkdirs(Path path) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = path(path);
        return getFileSystem(hadoopPath).mkdirs(hadoopPath);
    }

    @Override
    public boolean rename(Path src, Path dst) throws IOException {
        org.apache.hadoop.fs.Path hadoopSrc = path(src);
        org.apache.hadoop.fs.Path hadoopDst = path(dst);
        return getFileSystem(hadoopSrc).rename(hadoopSrc, hadoopDst);
    }

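    /**
     * Writes {@code content} to {@code path} in UTF-8, trying an atomic rename-based
     * overwrite first (see {@link #tryAtomicOverwriteViaRename}); if the underlying
     * FileSystem exposes no overwriting rename, this falls back to the default,
     * non-atomic {@link FileIO} implementation.
     */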
    @Override
    public void overwriteFileUtf8(Path path, String content) throws IOException {
        boolean success = tryAtomicOverwriteViaRename(path, content);
        if (!success) {
            FileIO.super.overwriteFileUtf8(path, content);
        }
    }

    private org.apache.hadoop.fs.Path path(Path path) {
        return new org.apache.hadoop.fs.Path(path.toUri());
    }

    private FileSystem getFileSystem(org.apache.hadoop.fs.Path path) throws IOException {
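        // Double-checked locking: 'fs' is volatile, so once created it is read without
        // synchronization, and the FileSystem is initialized at most once per instance.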
        if (fs == null) {
            synchronized (this) {
                if (fs == null) {
                    fs = createFileSystem(path);
                }
            }
        }
        return fs;
    }

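    /**
     * Creates the underlying Hadoop {@link FileSystem} for the given path. Protected
     * so that subclasses can override how the FileSystem is obtained.
     */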
    protected FileSystem createFileSystem(org.apache.hadoop.fs.Path path) throws IOException {
        return path.getFileSystem(hadoopConf.get());
    }

    private static class HadoopSeekableInputStream extends SeekableInputStream {

        /**
         * Minimum amount of bytes to skip forward before we issue a seek instead of discarding
         * read.
         *
         * <p>The current value is just a magic number. In the long run, this value could become
         * configurable, but for now it is a conservative, relatively small value that should bring
         * safe improvements for small skips (e.g. in reading meta data), that would hurt the most
         * with frequent seeks.
         *
         * <p>The optimal value depends on the DFS implementation and configuration plus the
         * underlying filesystem. For now, this number is chosen "big enough" to provide
         * improvements for smaller seeks, and "small enough" to avoid disadvantages over real
         * seeks. While the minimum should be the page size, a true optimum per system would be the
         * amount of bytes that can be consumed sequentially within the seek time. Unfortunately,
         * seek time is not constant and devices, OS, and DFS potentially also use read buffers and
         * read-ahead.
         */
        private static final int MIN_SKIP_BYTES = 1024 * 1024;

        private final FSDataInputStream in;

        private HadoopSeekableInputStream(FSDataInputStream in) {
            this.in = in;
        }

        @Override
        public void seek(long seekPos) throws IOException {
            // Some optimizations to avoid expensive seeks that some distributed FS
            // implementations perform even when they are not actually needed.
            long delta = seekPos - getPos();

            if (delta > 0L && delta <= MIN_SKIP_BYTES) {
                // Instead of a small forward seek, we skip over the gap
                skipFully(delta);
            } else if (delta != 0L) {
                // For larger gaps and backward seeks, we do a real seek
                forceSeek(seekPos);
            } // Do nothing if delta is zero.
        }

        @Override
        public long getPos() throws IOException {
            return in.getPos();
        }

        @Override
        public int read() throws IOException {
            return in.read();
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            return in.read(b, off, len);
        }

        @Override
        public void close() throws IOException {
            in.close();
        }

        /**
         * Positions the stream to the given location. In contrast to {@link #seek(long)}, this
         * method will always issue a "seek" command to the dfs and may not replace it by {@link
         * #skip(long)} for small seeks.
         *
         * <p>Notice that the underlying DFS implementation can still decide to do skip instead of
         * seek.
         *
         * @param seekPos the position to seek to.
         */
        public void forceSeek(long seekPos) throws IOException {
            in.seek(seekPos);
        }

        /**
         * Skips over a given amount of bytes in the stream.
         *
         * @param bytes the number of bytes to skip.
         */
        public void skipFully(long bytes) throws IOException {
            while (bytes > 0) {
                bytes -= in.skip(bytes);
            }
        }
    }

    private static class HadoopPositionOutputStream extends PositionOutputStream {

        private final FSDataOutputStream out;

        private HadoopPositionOutputStream(FSDataOutputStream out) {
            this.out = out;
        }

        @Override
        public long getPos() throws IOException {
            return out.getPos();
        }

        @Override
        public void write(int b) throws IOException {
            out.write(b);
        }

        @Override
        public void write(byte[] b) throws IOException {
            out.write(b);
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            out.write(b, off, len);
        }

        @Override
        public void flush() throws IOException {
            out.hflush();
        }

        @Override
        public void close() throws IOException {
            out.close();
        }
    }

    private static class HadoopFileStatus implements FileStatus {

        private final org.apache.hadoop.fs.FileStatus status;

        private HadoopFileStatus(org.apache.hadoop.fs.FileStatus status) {
            this.status = status;
        }

        @Override
        public long getLen() {
            return status.getLen();
        }

        @Override
        public boolean isDir() {
            return status.isDirectory();
        }

        @Override
        public Path getPath() {
            return new Path(status.getPath().toUri());
        }

        @Override
        public long getModificationTime() {
            return status.getModificationTime();
        }
    }

    // ============================== extra methods ===================================

    private transient volatile AtomicReference<Method> renameMethodRef;

    public boolean tryAtomicOverwriteViaRename(Path dst, String content) throws IOException {
        org.apache.hadoop.fs.Path hadoopDst = path(dst);
        FileSystem fs = getFileSystem(hadoopDst);
        if (renameMethodRef == null) {
            synchronized (this) {
                if (renameMethodRef == null) {
                    Method method;
                    // The rename(Path, Path, Options.Rename...) implementation in FileSystem is
                    // incorrect and not atomic; object stores like S3 and OSS do not override it.
                    // DistributedFileSystem and ViewFileSystem override the method as public and
                    // implement correct, atomic renaming.
                    try {
                        method = ReflectionUtils.getMethod(fs.getClass(), "rename", 3);
                    } catch (NoSuchMethodException e) {
                        method = null;
                    }
                    renameMethodRef = new AtomicReference<>(method);
                }
            }
        }

        Method renameMethod = renameMethodRef.get();
        if (renameMethod == null) {
            return false;
        }

        boolean renameDone = false;

        // write the content to a temporary path first
        Path tempPath = dst.createTempPath();
        org.apache.hadoop.fs.Path hadoopTemp = path(tempPath);
        try {
            try (PositionOutputStream out = newOutputStream(tempPath, false)) {
                OutputStreamWriter writer = new OutputStreamWriter(out, StandardCharsets.UTF_8);
                writer.write(content);
                writer.flush();
            }

            renameMethod.invoke(
                    fs, hadoopTemp, hadoopDst, new Options.Rename[] {Options.Rename.OVERWRITE});
            renameDone = true;
            // TODO: this is a workaround of HADOOP-16255 - remove this when HADOOP-16255 is
            // resolved
            tryRemoveCrcFile(hadoopTemp);
            return true;
        } catch (InvocationTargetException | IllegalAccessException e) {
            throw new IOException(e);
        } finally {
            if (!renameDone) {
                deleteQuietly(tempPath);
            }
        }
    }

    /** @throws IOException if a fatal exception occurs. Will try to ignore most exceptions. */
    @SuppressWarnings("CatchMayIgnoreException")
    private void tryRemoveCrcFile(org.apache.hadoop.fs.Path path) throws IOException {
        try {
            final org.apache.hadoop.fs.Path checksumFile =
                    new org.apache.hadoop.fs.Path(
                            path.getParent(), String.format(".%s.crc", path.getName()));
            if (fs.exists(checksumFile)) {
                // checksum file exists, deleting it
                fs.delete(checksumFile, true); // recursive=true
            }
        } catch (Throwable t) {
            if (t instanceof VirtualMachineError
                    || t instanceof ThreadDeath
                    || t instanceof LinkageError) {
                throw t;
            }
            // else, ignore - we are removing the crc file as "best-effort"
        }
    }
}