All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.commons.compress.archivers.tar.TarFile Maven / Gradle / Ivy

Go to download

Apache Commons Compress software defines an API for working with compression and archive formats. These include: bzip2, gzip, pack200, lzma, xz, Snappy, traditional Unix Compress, DEFLATE, DEFLATE64, LZ4, Brotli, Zstandard and ar, cpio, jar, tar, zip, dump, 7z, arj.

There is a newer version: 1.27.1
Show newest version
/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.BoundedArchiveInputStream;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.BoundedSeekableByteChannelInputStream;
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel;

/**
 * Provides random access to UNIX archives.
 *
 * @since 1.21
 */
public class TarFile implements Closeable {

    private final class BoundedTarEntryInputStream extends BoundedArchiveInputStream {

        private final SeekableByteChannel channel;

        private final TarArchiveEntry entry;

        private long entryOffset;

        private int currentSparseInputStreamIndex;

        BoundedTarEntryInputStream(final TarArchiveEntry entry, final SeekableByteChannel channel) throws IOException {
            super(entry.getDataOffset(), entry.getRealSize());
            if (channel.size() - entry.getSize() < entry.getDataOffset()) {
                throw new IOException("entry size exceeds archive size");
            }
            this.entry = entry;
            this.channel = channel;
        }

        @Override
        protected int read(final long pos, final ByteBuffer buf) throws IOException {
            if (entryOffset >= entry.getRealSize()) {
                return -1;
            }

            final int totalRead;
            if (entry.isSparse()) {
                totalRead = readSparse(entryOffset, buf, buf.limit());
            } else {
                totalRead = readArchive(pos, buf);
            }

            if (totalRead == -1) {
                if (buf.array().length > 0) {
                    throw new IOException("Truncated TAR archive");
                }
                setAtEOF(true);
            } else {
                entryOffset += totalRead;
                buf.flip();
            }
            return totalRead;
        }

        private int readArchive(final long pos, final ByteBuffer buf) throws IOException {
            channel.position(pos);
            return channel.read(buf);
        }

        private int readSparse(final long pos, final ByteBuffer buf, final int numToRead) throws IOException {
            // if there are no actual input streams, just read from the original archive
            final List entrySparseInputStreams = sparseInputStreams.get(entry.getName());
            if (entrySparseInputStreams == null || entrySparseInputStreams.isEmpty()) {
                return readArchive(entry.getDataOffset() + pos, buf);
            }

            if (currentSparseInputStreamIndex >= entrySparseInputStreams.size()) {
                return -1;
            }

            final InputStream currentInputStream = entrySparseInputStreams.get(currentSparseInputStreamIndex);
            final byte[] bufArray = new byte[numToRead];
            final int readLen = currentInputStream.read(bufArray);
            if (readLen != -1) {
                buf.put(bufArray, 0, readLen);
            }

            // if the current input stream is the last input stream,
            // just return the number of bytes read from current input stream
            if (currentSparseInputStreamIndex == entrySparseInputStreams.size() - 1) {
                return readLen;
            }

            // if EOF of current input stream is meet, open a new input stream and recursively call read
            if (readLen == -1) {
                currentSparseInputStreamIndex++;
                return readSparse(pos, buf, numToRead);
            }

            // if the rest data of current input stream is not long enough, open a new input stream
            // and recursively call read
            if (readLen < numToRead) {
                currentSparseInputStreamIndex++;
                final int readLenOfNext = readSparse(pos + readLen, buf, numToRead - readLen);
                if (readLenOfNext == -1) {
                    return readLen;
                }

                return readLen + readLenOfNext;
            }

            // if the rest data of current input stream is enough(which means readLen == len), just return readLen
            return readLen;
        }
    }

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    private final SeekableByteChannel archive;

    /**
     * The encoding of the tar file
     */
    private final ZipEncoding zipEncoding;

    private final LinkedList entries = new LinkedList<>();

    private final int blockSize;

    private final boolean lenient;

    private final int recordSize;

    private final ByteBuffer recordBuffer;

    // the global sparse headers, this is only used in PAX Format 0.X
    private final List globalSparseHeaders = new ArrayList<>();

    private boolean hasHitEOF;

    /**
     * The meta-data about the current entry
     */
    private TarArchiveEntry currEntry;

    // the global PAX header
    private Map globalPaxHeaders = new HashMap<>();

    private final Map> sparseInputStreams = new HashMap<>();

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content) throws IOException {
        this(new SeekableInMemoryByteChannel(content));
    }

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content, final boolean lenient) throws IOException {
        this(new SeekableInMemoryByteChannel(content), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param content  the content to use
     * @param encoding the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content, final String encoding) throws IOException {
        this(new SeekableInMemoryByteChannel(content), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive the file of the archive to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive) throws IOException {
        this(archive.toPath());
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive the file of the archive to use
     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive, final boolean lenient) throws IOException {
        this(archive.toPath(), lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive  the file of the archive to use
     * @param encoding the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive, final String encoding) throws IOException {
        this(archive.toPath(), encoding);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                    {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath, final boolean lenient) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @param encoding    the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath, final String encoding) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final SeekableByteChannel content) throws IOException {
        this(content, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive    the seekable byte channel to use
     * @param blockSize  the blocks size to use
     * @param recordSize the record size to use
     * @param encoding   the encoding to use
     * @param lenient    when set to true illegal values for group/userid, mode, device numbers and timestamp will be ignored and the fields set to
     *                   {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final SeekableByteChannel archive, final int blockSize, final int recordSize, final String encoding, final boolean lenient)
            throws IOException {
        this.archive = archive;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.recordBuffer = ByteBuffer.allocate(this.recordSize);
        this.blockSize = blockSize;
        this.lenient = lenient;

        TarArchiveEntry entry;
        while ((entry = getNextTarEntry()) != null) {
            entries.add(entry);
        }
    }

    /**
     * Update the current entry with the read pax headers
     *
     * @param headers       Headers read from the pax header
     * @param sparseHeaders Sparse headers read from pax header
     */
    private void applyPaxHeadersToCurrentEntry(final Map headers, final List sparseHeaders) throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * Build the input streams consisting of all-zero input streams and non-zero input streams. When reading from the non-zero input streams, the data is
     * actually read from the original input stream. The size of each input stream is introduced by the sparse headers.
     *
     * @implNote Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the 0 size input streams because they are
     *           meaningless.
     */
    private void buildSparseInputStreams() throws IOException {
        final List streams = new ArrayList<>();

        final List sparseHeaders = currEntry.getOrderedSparseHeaders();

        // Stream doesn't need to be closed at all as it doesn't use any resources
        final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); // NOSONAR
        // logical offset into the extracted entry
        long offset = 0;
        long numberOfZeroBytesInSparseEntry = 0;
        for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
            final long zeroBlockSize = sparseHeader.getOffset() - offset;
            if (zeroBlockSize < 0) {
                // sparse header says to move backwards inside the extracted entry
                throw new IOException("Corrupted struct sparse detected");
            }

            // only store the zero block if it is not empty
            if (zeroBlockSize > 0) {
                streams.add(new BoundedInputStream(zeroInputStream, zeroBlockSize));
                numberOfZeroBytesInSparseEntry += zeroBlockSize;
            }

            // only store the input streams with non-zero size
            if (sparseHeader.getNumbytes() > 0) {
                final long start = currEntry.getDataOffset() + sparseHeader.getOffset() - numberOfZeroBytesInSparseEntry;
                if (start + sparseHeader.getNumbytes() < start) {
                    // possible integer overflow
                    throw new IOException("Unreadable TAR archive, sparse block offset or length too big");
                }
                streams.add(new BoundedSeekableByteChannelInputStream(start, sparseHeader.getNumbytes(), archive));
            }

            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
        }

        sparseInputStreams.put(currEntry.getName(), streams);
    }

    @Override
    public void close() throws IOException {
        archive.close();
    }

    /**
     * This method is invoked once the end of the archive is hit, it tries to consume the remaining bytes under the assumption that the tool creating this
     * archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = archive.position() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            repositionForwardBy(blockSize - bytesReadOfLastBlock);
        }
    }

    /**
     * Gets all TAR Archive Entries from the TarFile
     *
     * @return All entries from the tar file
     */
    public List getEntries() {
        return new ArrayList<>(entries);
    }

    /**
     * Gets the input stream for the provided Tar Archive Entry.
     *
     * @param entry Entry to get the input stream from
     * @return Input stream of the provided entry
     * @throws IOException Corrupted TAR archive. Can't read entry.
     */
    public InputStream getInputStream(final TarArchiveEntry entry) throws IOException {
        try {
            return new BoundedTarEntryInputStream(entry, archive);
        } catch (final RuntimeException ex) {
            throw new IOException("Corrupted TAR archive. Can't read entry", ex);
        }
    }

    /**
     * Gets the next entry in this tar archive as long name data.
     *
     * @return The next entry in the archive as long name data, or null.
     * @throws IOException on error
     */
    private byte[] getLongNameData() throws IOException {
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length;
        try (InputStream in = getInputStream(currEntry)) {
            while ((length = in.read(smallBuf)) >= 0) {
                longName.write(smallBuf, 0, length);
            }
        }
        getNextTarEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            longNameData = Arrays.copyOf(longNameData, length);
        }
        return longNameData;
    }

    /**
     * Gets the next entry in this tar archive. This will skip to the end of the current entry, if there is one, and place the position of the channel at the
     * header of the next entry, and read the header and instantiate a new TarEntry from the header bytes and return that entry. If there are no more entries in
     * the archive, null will be returned to indicate that the end of the archive has been reached.
     *
     * @return The next TarEntry in the archive, or null if there is no next entry.
     * @throws IOException when reading the next TarEntry fails
     */
    private TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            // Skip to the end of the entry
            repositionForwardTo(currEntry.getDataOffset() + currEntry.getSize());
            throwExceptionIfPositionIsNotInArchive();
            skipRecordPadding();
        }

        final ByteBuffer headerBuf = getRecord();
        if (null == headerBuf) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            final long position = archive.position();
            currEntry = new TarArchiveEntry(globalPaxHeaders, headerBuf.array(), zipEncoding, lenient, position);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }

            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (final NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        return currEntry;
    }

    /**
     * Gets the next record in this tar archive. This will skip over any remaining data in the current entry, if there is one, and place the input stream at the
     * header of the next entry.
     *
     * 

* If there are no more entries in the archive, null will be returned to indicate that the end of the archive has been reached. At the same time the * {@code hasHitEOF} marker will be set to true. *

* * @return The next TarEntry in the archive, or null if there is no next entry. * @throws IOException when reading the next TarEntry fails */ private ByteBuffer getRecord() throws IOException { ByteBuffer headerBuf = readRecord(); setAtEOF(isEOFRecord(headerBuf)); if (isAtEOF() && headerBuf != null) { // Consume rest tryToConsumeSecondEOFRecord(); consumeRemainderOfLastBlock(); headerBuf = null; } return headerBuf; } protected final boolean isAtEOF() { return hasHitEOF; } private boolean isDirectory() { return currEntry != null && currEntry.isDirectory(); } private boolean isEOFRecord(final ByteBuffer headerBuf) { return headerBuf == null || ArchiveUtils.isArrayZero(headerBuf.array(), recordSize); } /** *

* For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes) may appear multi times, and they look like: * *

     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     * 
* *

* For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map * *

     * GNU.sparse.map
     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     * 
* *

* For PAX Format 1.X:
* The sparse map itself is stored in the file data block, preceding the actual file data. It consists of a series of decimal numbers delimited by newlines. * The map is padded with nulls to the nearest block boundary. The first number gives the number of entries in the map. Following are map entries, each one * consisting of two numbers giving the offset and size of the data block it describes. * * @throws IOException */ private void paxHeaders() throws IOException { List sparseHeaders = new ArrayList<>(); final Map headers; try (InputStream input = getInputStream(currEntry)) { headers = TarUtils.parsePaxHeaders(input, sparseHeaders, globalPaxHeaders, currEntry.getSize()); } // for 0.1 PAX Headers if (headers.containsKey(TarGnuSparseKeys.MAP)) { sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get(TarGnuSparseKeys.MAP))); } getNextTarEntry(); // Get the actual file entry if (currEntry == null) { throw new IOException("premature end of tar archive. Didn't find any entry after PAX header."); } applyPaxHeadersToCurrentEntry(headers, sparseHeaders); // for 1.0 PAX Format, the sparse map is stored in the file data block if (currEntry.isPaxGNU1XSparse()) { try (InputStream input = getInputStream(currEntry)) { sparseHeaders = TarUtils.parsePAX1XSparseHeaders(input, recordSize); } currEntry.setSparseHeaders(sparseHeaders); // data of the entry is after the pax gnu entry. 
So we need to update the data position once again currEntry.setDataOffset(currEntry.getDataOffset() + recordSize); } // sparse headers are all done reading, we need to build // sparse input streams using these sparse headers buildSparseInputStreams(); } private void readGlobalPaxHeaders() throws IOException { try (InputStream input = getInputStream(currEntry)) { globalPaxHeaders = TarUtils.parsePaxHeaders(input, globalSparseHeaders, globalPaxHeaders, currEntry.getSize()); } getNextTarEntry(); // Get the actual file entry if (currEntry == null) { throw new IOException("Error detected parsing the pax header"); } } /** * Adds the sparse chunks from the current entry to the sparse chunks, including any additional sparse entries following the current entry. * * @throws IOException when reading the sparse entry fails */ private void readOldGNUSparse() throws IOException { if (currEntry.isExtended()) { TarArchiveSparseEntry entry; do { final ByteBuffer headerBuf = getRecord(); if (headerBuf == null) { throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag."); } entry = new TarArchiveSparseEntry(headerBuf.array()); currEntry.getSparseHeaders().addAll(entry.getSparseHeaders()); currEntry.setDataOffset(currEntry.getDataOffset() + recordSize); } while (entry.isExtended()); } // sparse headers are all done reading, we need to build // sparse input streams using these sparse headers buildSparseInputStreams(); } /** * Read a record from the input stream and return the data. * * @return The record data or null if EOF has been hit. 
* @throws IOException if reading from the archive fails */ private ByteBuffer readRecord() throws IOException { recordBuffer.rewind(); final int readNow = archive.read(recordBuffer); if (readNow != recordSize) { return null; } return recordBuffer; } private void repositionForwardBy(final long offset) throws IOException { repositionForwardTo(archive.position() + offset); } private void repositionForwardTo(final long newPosition) throws IOException { final long currPosition = archive.position(); if (newPosition < currPosition) { throw new IOException("trying to move backwards inside of the archive"); } archive.position(newPosition); } protected final void setAtEOF(final boolean b) { hasHitEOF = b; } /** * The last record block should be written at the full size, so skip any additional space used to fill a record after an entry * * @throws IOException when skipping the padding of the record fails */ private void skipRecordPadding() throws IOException { if (!isDirectory() && currEntry.getSize() > 0 && currEntry.getSize() % recordSize != 0) { final long numRecords = currEntry.getSize() / recordSize + 1; final long padding = numRecords * recordSize - currEntry.getSize(); repositionForwardBy(padding); throwExceptionIfPositionIsNotInArchive(); } } /** * Checks if the current position of the SeekableByteChannel is in the archive. * * @throws IOException If the position is not in the archive */ private void throwExceptionIfPositionIsNotInArchive() throws IOException { if (archive.size() < archive.position()) { throw new IOException("Truncated TAR archive"); } } /** * Tries to read the next record resetting the position in the archive if it is not an EOF record. * *

* This is meant to protect against cases where a tar implementation has written only one EOF record when two are expected. Actually this won't help since a * non-conforming implementation likely won't fill full blocks consisting of - by default - ten records either so we probably have already read beyond the * archive anyway. *

* * @throws IOException if reading the record of resetting the position in the archive fails */ private void tryToConsumeSecondEOFRecord() throws IOException { boolean shouldReset = true; try { shouldReset = !isEOFRecord(readRecord()); } finally { if (shouldReset) { archive.position(archive.position() - recordSize); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy