org.apache.commons.compress.archivers.tar.TarFile

/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.BoundedArchiveInputStream;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.BoundedSeekableByteChannelInputStream;
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel;

/**
 * Provides random access to UNIX archives.
 *
 * @since 1.21
 */
public class TarFile implements Closeable {

    private final class BoundedTarEntryInputStream extends BoundedArchiveInputStream {

        private final SeekableByteChannel channel;

        private final TarArchiveEntry entry;

        private long entryOffset;

        private int currentSparseInputStreamIndex;

        BoundedTarEntryInputStream(final TarArchiveEntry entry, final SeekableByteChannel channel) throws IOException {
            super(entry.getDataOffset(), entry.getRealSize());
            if (channel.size() - entry.getSize() < entry.getDataOffset()) {
                throw new IOException("entry size exceeds archive size");
            }
            this.entry = entry;
            this.channel = channel;
        }

        @Override
        protected int read(final long pos, final ByteBuffer buf) throws IOException {
            if (entryOffset >= entry.getRealSize()) {
                return -1;
            }

            final int totalRead;
            if (entry.isSparse()) {
                totalRead = readSparse(entryOffset, buf, buf.limit());
            } else {
                totalRead = readArchive(pos, buf);
            }

            if (totalRead == -1) {
                if (buf.array().length > 0) {
                    throw new IOException("Truncated TAR archive");
                }
                setAtEOF(true);
            } else {
                entryOffset += totalRead;
                buf.flip();
            }
            return totalRead;
        }

        private int readArchive(final long pos, final ByteBuffer buf) throws IOException {
            channel.position(pos);
            return channel.read(buf);
        }

        private int readSparse(final long pos, final ByteBuffer buf, final int numToRead) throws IOException {
            // if there are no actual input streams, just read from the original archive
            final List<InputStream> entrySparseInputStreams = sparseInputStreams.get(entry.getName());
            if (entrySparseInputStreams == null || entrySparseInputStreams.isEmpty()) {
                return readArchive(entry.getDataOffset() + pos, buf);
            }

            if (currentSparseInputStreamIndex >= entrySparseInputStreams.size()) {
                return -1;
            }

            final InputStream currentInputStream = entrySparseInputStreams.get(currentSparseInputStreamIndex);
            final byte[] bufArray = new byte[numToRead];
            final int readLen = currentInputStream.read(bufArray);
            if (readLen != -1) {
                buf.put(bufArray, 0, readLen);
            }

            // if the current input stream is the last input stream,
            // just return the number of bytes read from current input stream
            if (currentSparseInputStreamIndex == entrySparseInputStreams.size() - 1) {
                return readLen;
            }

            // if EOF of the current input stream is met, open a new input stream and recursively call read
            if (readLen == -1) {
                currentSparseInputStreamIndex++;
                return readSparse(pos, buf, numToRead);
            }

            // if the remaining data of the current input stream is not long enough, open a new input stream
            // and recursively call read
            if (readLen < numToRead) {
                currentSparseInputStreamIndex++;
                final int readLenOfNext = readSparse(pos + readLen, buf, numToRead - readLen);
                if (readLenOfNext == -1) {
                    return readLen;
                }

                return readLen + readLenOfNext;
            }

            // if the remaining data of the current input stream is enough (i.e. readLen == numToRead), just return readLen
            return readLen;
        }
    }
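
    // Editorial note (not part of the Apache source): for a sparse entry, read() above
    // stitches together the zero-filled and on-disk regions prepared by
    // buildSparseInputStreams(), so callers see the entry's full "real" size even though
    // only the non-zero chunks are physically stored in the archive.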

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    private final SeekableByteChannel archive;

    /**
     * The encoding of the tar file
     */
    private final ZipEncoding zipEncoding;

    private final LinkedList<TarArchiveEntry> entries = new LinkedList<>();

    private final int blockSize;

    private final boolean lenient;

    private final int recordSize;

    private final ByteBuffer recordBuffer;

    // the global sparse headers; only used in PAX Format 0.X
    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();

    private boolean hasHitEOF;

    /**
     * The meta-data about the current entry
     */
    private TarArchiveEntry currEntry;

    // the global PAX header
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    private final Map<String, List<InputStream>> sparseInputStreams = new HashMap<>();

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content) throws IOException {
        this(new SeekableInMemoryByteChannel(content));
    }

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @param lenient when set to true, illegal values for group/userid, mode, device numbers and timestamp will be
     *                ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content, final boolean lenient) throws IOException {
        this(new SeekableInMemoryByteChannel(content), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param content  the content to use
     * @param encoding the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content, final String encoding) throws IOException {
        this(new SeekableInMemoryByteChannel(content), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive the file of the archive to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive) throws IOException {
        this(archive.toPath());
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive the file of the archive to use
     * @param lenient when set to true, illegal values for group/userid, mode, device numbers and timestamp will be
     *                ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive, final boolean lenient) throws IOException {
        this(archive.toPath(), lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive  the file of the archive to use
     * @param encoding the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive, final String encoding) throws IOException {
        this(archive.toPath(), encoding);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @param lenient     when set to true, illegal values for group/userid, mode, device numbers and timestamp will be
     *                    ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                    exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath, final boolean lenient) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @param encoding    the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath, final String encoding) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final SeekableByteChannel content) throws IOException {
        this(content, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive    the seekable byte channel to use
     * @param blockSize  the block size to use
     * @param recordSize the record size to use
     * @param encoding   the encoding to use
     * @param lenient    when set to true, illegal values for group/userid, mode, device numbers and timestamp will be
     *                   ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                   exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final SeekableByteChannel archive, final int blockSize, final int recordSize, final String encoding, final boolean lenient) throws IOException {
        this.archive = archive;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.recordBuffer = ByteBuffer.allocate(this.recordSize);
        this.blockSize = blockSize;
        this.lenient = lenient;

        TarArchiveEntry entry;
        while ((entry = getNextTarEntry()) != null) {
            entries.add(entry);
        }
    }

    /**
     * Update the current entry with the read pax headers
     * @param headers Headers read from the pax header
     * @param sparseHeaders Sparse headers read from pax header
     */
    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders)
        throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * Build the input streams consisting of all-zero input streams and non-zero input streams.
     * When reading from the non-zero input streams, the data is actually read from the original input stream.
     * The size of each input stream is given by the sparse headers.
     *
     * @implNote Some of the all-zero and non-zero input streams would have a size of 0. We DO NOT store
     *        these 0-size input streams because they are meaningless.
     */
    private void buildSparseInputStreams() throws IOException {
        final List<InputStream> streams = new ArrayList<>();

        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getOrderedSparseHeaders();

        // Stream doesn't need to be closed at all as it doesn't use any resources
        final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); //NOSONAR
        // logical offset into the extracted entry
        long offset = 0;
        long numberOfZeroBytesInSparseEntry = 0;
        for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
            final long zeroBlockSize = sparseHeader.getOffset() - offset;
            if (zeroBlockSize < 0) {
                // sparse header says to move backwards inside the extracted entry
                throw new IOException("Corrupted struct sparse detected");
            }

            // only store the zero block if it is not empty
            if (zeroBlockSize > 0) {
                streams.add(new BoundedInputStream(zeroInputStream, zeroBlockSize));
                numberOfZeroBytesInSparseEntry += zeroBlockSize;
            }

            // only store the input streams with non-zero size
            if (sparseHeader.getNumbytes() > 0) {
                final long start =
                    currEntry.getDataOffset() + sparseHeader.getOffset() - numberOfZeroBytesInSparseEntry;
                if (start + sparseHeader.getNumbytes() < start) {
                    // possible integer overflow
                    throw new IOException("Unreadable TAR archive, sparse block offset or length too big");
                }
                streams.add(new BoundedSeekableByteChannelInputStream(start, sparseHeader.getNumbytes(), archive));
            }

            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
        }

        sparseInputStreams.put(currEntry.getName(), streams);
    }
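
    // Worked example (editorial addition, not in the Apache source): given sparse headers
    // (offset=0, numbytes=1024) and (offset=4096, numbytes=512), the loop above produces:
    //   1. a BoundedSeekableByteChannelInputStream of 1024 bytes starting at the entry's data offset
    //   2. a BoundedInputStream serving 3072 zero bytes (the hole between logical offsets 1024 and 4096)
    //   3. a BoundedSeekableByteChannelInputStream of 512 bytes starting at dataOffset + 1024,
    //      since the stored chunks are laid out contiguously with the holes removed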

    @Override
    public void close() throws IOException {
        archive.close();
    }

    /**
     * This method is invoked once the end of the archive is hit; it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = archive.position() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            repositionForwardBy(blockSize - bytesReadOfLastBlock);
        }
    }
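
    // Worked example (editorial addition): with the default block size of 10240 bytes
    // (20 records of 512 bytes), a channel position that is 5632 bytes into the last
    // block leaves 10240 - 5632 = 4608 padding bytes, skipped in a single reposition.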

    /**
     * Get all TAR Archive Entries from the TarFile
     *
     * @return All entries from the tar file
     */
    public List<TarArchiveEntry> getEntries() {
        return new ArrayList<>(entries);
    }

    /**
     * Gets the input stream for the provided Tar Archive Entry.
     * @param entry Entry to get the input stream from
     * @return Input stream of the provided entry
     * @throws IOException Corrupted TAR archive. Can't read entry.
     */
    public InputStream getInputStream(final TarArchiveEntry entry) throws IOException {
        try {
            return new BoundedTarEntryInputStream(entry, archive);
        } catch (final RuntimeException ex) {
            throw new IOException("Corrupted TAR archive. Can't read entry", ex);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    private byte[] getLongNameData() throws IOException {
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length;
        try (final InputStream in = getInputStream(currEntry)) {
            while ((length = in.read(smallBuf)) >= 0) {
                longName.write(smallBuf, 0, length);
            }
        }
        getNextTarEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            longNameData = Arrays.copyOf(longNameData, length);
        }
        return longNameData;
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * to the end of the current entry, if there is one, place
     * the position of the channel at the header of the next
     * entry, read the header, instantiate a new TarArchiveEntry
     * from the header bytes, and return that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarArchiveEntry in the archive, or null if there is no next entry.
     * @throws IOException when reading the next TarEntry fails
     */
    private TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            // Skip to the end of the entry
            repositionForwardTo(currEntry.getDataOffset() + currEntry.getSize());
            throwExceptionIfPositionIsNotInArchive();
            skipRecordPadding();
        }

        final ByteBuffer headerBuf = getRecord();
        if (null == headerBuf) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            final long position = archive.position();
            currEntry = new TarArchiveEntry(globalPaxHeaders, headerBuf.array(), zipEncoding, lenient, position);
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }

            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (final NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        return currEntry;
    }
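
    // Editorial note (not part of the Apache source): special entries are resolved in order:
    // GNU long link and long name entries replace the link/entry name, a global PAX header
    // updates the defaults for all following entries, a local PAX header overrides the
    // current entry, and old-style GNU sparse headers trigger the sparse-stream setup.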

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has been
     * reached. At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header record as a ByteBuffer, or null if there is no next record.
     * @throws IOException when reading the next record fails
     */
    private ByteBuffer getRecord() throws IOException {
        ByteBuffer headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            // Consume rest
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    private boolean isEOFRecord(final ByteBuffer headerBuf) {
        return headerBuf == null || ArchiveUtils.isArrayZero(headerBuf.array(), recordSize);
    }

    /**
     * <p>
     * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
     * may appear multiple times, and they look like:
     * </p>
     * <pre>
     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     * </pre>
     *
     * <p>
     * For PAX Format 0.1, the sparse headers are stored in a single variable: GNU.sparse.map
     * </p>
     * <pre>
     * GNU.sparse.map
     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     * </pre>
     *
     * <p>
     * For PAX Format 1.X:
     * </p>
     * <p>
     * The sparse map itself is stored in the file data block, preceding the actual file data.
     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
     * giving the offset and size of the data block it describes.
     * </p>
     *
     * @throws IOException if reading or parsing the PAX headers fails
     */
    private void paxHeaders() throws IOException {
        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
        final Map<String, String> headers;
        try (final InputStream input = getInputStream(currEntry)) {
            headers = TarUtils.parsePaxHeaders(input, sparseHeaders, globalPaxHeaders, currEntry.getSize());
        }

        // for 0.1 PAX Headers
        if (headers.containsKey(TarGnuSparseKeys.MAP)) {
            sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get(TarGnuSparseKeys.MAP)));
        }
        getNextTarEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("premature end of tar archive. Didn't find any entry after PAX header.");
        }
        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);

        // for 1.0 PAX Format, the sparse map is stored in the file data block
        if (currEntry.isPaxGNU1XSparse()) {
            try (final InputStream input = getInputStream(currEntry)) {
                sparseHeaders = TarUtils.parsePAX1XSparseHeaders(input, recordSize);
            }
            currEntry.setSparseHeaders(sparseHeaders);
            // the data of the entry is after the pax gnu entry, so we need to update the data position once again
            currEntry.setDataOffset(currEntry.getDataOffset() + recordSize);
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    private void readGlobalPaxHeaders() throws IOException {
        try (InputStream input = getInputStream(currEntry)) {
            globalPaxHeaders = TarUtils.parsePaxHeaders(input, globalSparseHeaders, globalPaxHeaders, currEntry.getSize());
        }
        getNextTarEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("Error detected parsing the pax header");
        }
    }

    /**
     * Adds the sparse chunks from the current entry to the sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException when reading the sparse entry fails
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                final ByteBuffer headerBuf = getRecord();
                if (headerBuf == null) {
                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
                }
                entry = new TarArchiveSparseEntry(headerBuf.array());
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
                currEntry.setDataOffset(currEntry.getDataOffset() + recordSize);
            } while (entry.isExtended());
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }
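
    // Illustrative note (editorial addition): in PAX 0.1 format a header such as
    //   GNU.sparse.map=0,512,10240,512
    // describes two data chunks of 512 bytes at logical offsets 0 and 10240; paxHeaders()
    // above converts it into TarArchiveStructSparse entries before building sparse streams.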

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException if reading from the archive fails
     */
    private ByteBuffer readRecord() throws IOException {
        recordBuffer.rewind();
        final int readNow = archive.read(recordBuffer);
        if (readNow != recordSize) {
            return null;
        }
        return recordBuffer;
    }

    private void repositionForwardBy(final long offset) throws IOException {
        repositionForwardTo(archive.position() + offset);
    }

    private void repositionForwardTo(final long newPosition) throws IOException {
        final long currPosition = archive.position();
        if (newPosition < currPosition) {
            throw new IOException("trying to move backwards inside of the archive");
        }
        archive.position(newPosition);
    }

    protected final void setAtEOF(final boolean b) {
        hasHitEOF = b;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry.
     *
     * @throws IOException when skipping the padding of the record fails
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && currEntry.getSize() > 0 && currEntry.getSize() % recordSize != 0) {
            final long numRecords = (currEntry.getSize() / recordSize) + 1;
            final long padding = (numRecords * recordSize) - currEntry.getSize();
            repositionForwardBy(padding);
            throwExceptionIfPositionIsNotInArchive();
        }
    }

    /**
     * Checks if the current position of the SeekableByteChannel is in the archive.
     *
     * @throws IOException If the position is not in the archive
     */
    private void throwExceptionIfPositionIsNotInArchive() throws IOException {
        if (archive.size() < archive.position()) {
            throw new IOException("Truncated TAR archive");
        }
    }

    /**
     * Tries to read the next record, resetting the position in the
     * archive if it is not an EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - twenty records either so we probably have already read
     * beyond the archive anyway.</p>
     *
     * @throws IOException if reading the record or resetting the position in the archive fails
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset) {
                archive.position(archive.position() - recordSize);
            }
        }
    }
}
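
A minimal usage sketch of the class above (an editorial addition, not part of the Apache source). It opens a local archive, iterates over the entries collected by the constructor, and reads each file entry's bytes through getInputStream; the file name "archive.tar" is a placeholder.

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarFile;

public class TarFileExample {
    public static void main(final String[] args) throws IOException {
        // TarFile implements Closeable, so try-with-resources closes the underlying channel
        try (TarFile tarFile = new TarFile(new File("archive.tar"))) {
            for (final TarArchiveEntry entry : tarFile.getEntries()) {
                if (entry.isDirectory()) {
                    continue;
                }
                // Copy the entry's data into memory; sparse entries come back zero-filled
                final ByteArrayOutputStream content = new ByteArrayOutputStream();
                final byte[] buffer = new byte[8192];
                try (InputStream in = tarFile.getInputStream(entry)) {
                    int n;
                    while ((n = in.read(buffer)) != -1) {
                        content.write(buffer, 0, n);
                    }
                }
                System.out.println(entry.getName() + ": " + content.size() + " bytes");
            }
        }
    }
}

Unlike TarArchiveInputStream, TarFile reads the whole entry list up front and can hand out a stream for any entry on demand, at the cost of requiring a seekable source.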