/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */
package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.BoundedArchiveInputStream;
import org.apache.commons.compress.utils.BoundedInputStream;
import org.apache.commons.compress.utils.BoundedSeekableByteChannelInputStream;
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel;

/**
 * The TarFile provides random access to UNIX archives.
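 * <p>
 * A minimal usage sketch; the file name {@code archive.tar} is illustrative:
 * </p>
 * <pre>{@code
 * try (TarFile tarFile = new TarFile(new File("archive.tar"))) {
 *     for (TarArchiveEntry entry : tarFile.getEntries()) {
 *         try (InputStream input = tarFile.getInputStream(entry)) {
 *             // consume the entry's data from input
 *         }
 *     }
 * }
 * }</pre>
 *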
 * @since 1.21
 */
public class TarFile implements Closeable {

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    private final SeekableByteChannel archive;

    /**
     * The encoding of the tar file
     */
    private final ZipEncoding zipEncoding;

    private final LinkedList<TarArchiveEntry> entries = new LinkedList<>();

    private final int blockSize;

    private final boolean lenient;

    private final int recordSize;

    private final ByteBuffer recordBuffer;

    // the global sparse headers, this is only used in PAX Format 0.X
    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();

    private boolean hasHitEOF;

    /**
     * The meta-data about the current entry
     */
    private TarArchiveEntry currEntry;

    // the global PAX header
    private Map<String, String> globalPaxHeaders = new HashMap<>();

    private final Map<String, List<InputStream>> sparseInputStreams = new HashMap<>();

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content) throws IOException {
        this(new SeekableInMemoryByteChannel(content));
    }

    /**
     * Constructor for TarFile.
     *
     * @param content  the content to use
     * @param encoding the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content, final String encoding) throws IOException {
        this(new SeekableInMemoryByteChannel(content), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
     *                ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final byte[] content, final boolean lenient) throws IOException {
        this(new SeekableInMemoryByteChannel(content), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive the file of the archive to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive) throws IOException {
        this(archive.toPath());
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive  the file of the archive to use
     * @param encoding the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive, final String encoding) throws IOException {
        this(archive.toPath(), encoding);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive the file of the archive to use
     * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be
     *                ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final File archive, final boolean lenient) throws IOException {
        this(archive.toPath(), lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @param encoding    the encoding to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath, final String encoding) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, encoding, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archivePath the path of the archive to use
     * @param lenient     when set to true illegal values for group/userid, mode, device numbers and timestamp will be
     *                    ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                    exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final Path archivePath, final boolean lenient) throws IOException {
        this(Files.newByteChannel(archivePath), TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient);
    }

    /**
     * Constructor for TarFile.
     *
     * @param content the content to use
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final SeekableByteChannel content) throws IOException {
        this(content, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, false);
    }

    /**
     * Constructor for TarFile.
     *
     * @param archive    the seekable byte channel to use
     * @param blockSize  the block size to use
     * @param recordSize the record size to use
     * @param encoding   the encoding to use
     * @param lenient    when set to true illegal values for group/userid, mode, device numbers and timestamp will be
     *                   ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an
     *                   exception instead.
     * @throws IOException when reading the tar archive fails
     */
    public TarFile(final SeekableByteChannel archive, final int blockSize, final int recordSize, final String encoding, final boolean lenient) throws IOException {
        this.archive = archive;
        this.hasHitEOF = false;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.recordBuffer = ByteBuffer.allocate(this.recordSize);
        this.blockSize = blockSize;
        this.lenient = lenient;

        TarArchiveEntry entry;
        while ((entry = getNextTarEntry()) != null) {
            entries.add(entry);
        }
    }

    /**
     * Gets the next entry in this tar archive. This will skip
     * to the end of the current entry, if there is one, place
     * the position of the channel at the header of the next
     * entry, read the header, instantiate a new TarEntry from
     * the header bytes, and return that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarEntry in the archive, or null if there is no next entry.
     * @throws IOException when reading the next TarEntry fails
     */
    private TarArchiveEntry getNextTarEntry() throws IOException {
        if (isAtEOF()) {
            return null;
        }

        if (currEntry != null) {
            // Skip to the end of the entry
            repositionForwardTo(currEntry.getDataOffset() + currEntry.getSize());
            throwExceptionIfPositionIsNotInArchive();
            skipRecordPadding();
        }

        final ByteBuffer headerBuf = getRecord();
        if (null == headerBuf) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf.array(), zipEncoding, lenient, archive.position());
        } catch (final IllegalArgumentException e) {
            throw new IOException("Error detected parsing the header", e);
        }

        if (currEntry.isGNULongLinkEntry()) {
            final byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            final byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }

            // COMPRESS-509 : the name of directories should end with '/'
            final String name = zipEncoding.decode(longNameData);
            currEntry.setName(name);
            if (currEntry.isDirectory() && !name.endsWith("/")) {
                currEntry.setName(name + "/");
            }
        }

        if (currEntry.isGlobalPaxHeader()) { // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        try {
            if (currEntry.isPaxHeader()) { // Process Pax headers
                paxHeaders();
            } else if (!globalPaxHeaders.isEmpty()) {
                applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
            }
        } catch (NumberFormatException e) {
            throw new IOException("Error detected parsing the pax header", e);
        }

        if (currEntry.isOldGNUSparse()) { // Process sparse files
            readOldGNUSparse();
        }

        return currEntry;
    }

    /**
     * Adds the sparse chunks from the current entry to its list of sparse chunks,
     * including any additional sparse entries following the current entry.
     *
     * @throws IOException when reading the sparse entry fails
     */
    private void readOldGNUSparse() throws IOException {
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                final ByteBuffer headerBuf = getRecord();
                if (headerBuf == null) {
                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
                }
                entry = new TarArchiveSparseEntry(headerBuf.array());
                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
                currEntry.setDataOffset(currEntry.getDataOffset() + recordSize);
            } while (entry.isExtended());
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }

    /**
     * Build the input streams consisting of all-zero input streams and non-zero input streams.
     * When reading from the non-zero input streams, the data is actually read from the original input stream.
     * The size of each input stream is determined by the sparse headers.
     *
     * @implNote Some all-zero input streams and non-zero input streams have a size of 0.
     *        We DO NOT store the 0-size input streams because they are meaningless.
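     *        For an illustrative example, sparse headers {@code (offset=0, numbytes=1024)} and
     *        {@code (offset=8192, numbytes=512)} yield three streams: 1024 bytes of archive data,
     *        then 7168 bytes of zeros, then 512 more bytes of archive data.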
     */
    private void buildSparseInputStreams() throws IOException {
        final List<InputStream> streams = new ArrayList<>();

        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getOrderedSparseHeaders();

        // Stream doesn't need to be closed at all as it doesn't use any resources
        final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); //NOSONAR
        // logical offset into the extracted entry
        long offset = 0;
        long numberOfZeroBytesInSparseEntry = 0;
        for (TarArchiveStructSparse sparseHeader : sparseHeaders) {
            final long zeroBlockSize = sparseHeader.getOffset() - offset;
            if (zeroBlockSize < 0) {
                // sparse header says to move backwards inside of the extracted entry
                throw new IOException("Corrupted struct sparse detected");
            }

            // only store the zero block if it is not empty
            if (zeroBlockSize > 0) {
                streams.add(new BoundedInputStream(zeroInputStream, zeroBlockSize));
                numberOfZeroBytesInSparseEntry += zeroBlockSize;
            }

            // only store the input streams with non-zero size
            if (sparseHeader.getNumbytes() > 0) {
                final long start =
                    currEntry.getDataOffset() + sparseHeader.getOffset() - numberOfZeroBytesInSparseEntry;
                if (start + sparseHeader.getNumbytes() < start) {
                    // possible integer overflow
                    throw new IOException("Unreadable TAR archive, sparse block offset or length too big");
                }
                streams.add(new BoundedSeekableByteChannelInputStream(start, sparseHeader.getNumbytes(), archive));
            }

            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
        }

        sparseInputStreams.put(currEntry.getName(), streams);
    }

    /**
     * Updates the current entry with the PAX headers that have been read.
     * @param headers Headers read from the PAX header
     * @param sparseHeaders Sparse headers read from the PAX header
     */
    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders)
        throws IOException {
        currEntry.updateEntryFromPaxHeaders(headers);
        currEntry.setSparseHeaders(sparseHeaders);
    }

    /**
     * <p>
     * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
     * may appear multiple times, and they look like:
     * <pre>
     * GNU.sparse.size=size
     * GNU.sparse.numblocks=numblocks
     * repeat numblocks times
     *   GNU.sparse.offset=offset
     *   GNU.sparse.numbytes=numbytes
     * end repeat
     * </pre>
     *
     * <p>
     * For PAX Format 0.1, the sparse headers are stored in a single variable: GNU.sparse.map
     * <pre>
     * GNU.sparse.map
     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
     * </pre>
     *
     * <p>
     * For PAX Format 1.X:
     * <br>
     * The sparse map itself is stored in the file data block, preceding the actual file data.
     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
     * giving the offset and size of the data block it describes.
     *
     * @throws IOException if reading the PAX headers fails
     */
    private void paxHeaders() throws IOException {
        List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
        final Map<String, String> headers;
        try (final InputStream input = getInputStream(currEntry)) {
            headers = TarUtils.parsePaxHeaders(input, sparseHeaders, globalPaxHeaders, currEntry.getSize());
        }

        // for 0.1 PAX Headers
        if (headers.containsKey("GNU.sparse.map")) {
            sparseHeaders = new ArrayList<>(TarUtils.parseFromPAX01SparseHeaders(headers.get("GNU.sparse.map")));
        }
        getNextTarEntry(); // Get the actual file entry
        if (currEntry == null) {
            throw new IOException("premature end of tar archive. Didn't find any entry after PAX header.");
        }
        applyPaxHeadersToCurrentEntry(headers, sparseHeaders);

        // for 1.0 PAX Format, the sparse map is stored in the file data block
        if (currEntry.isPaxGNU1XSparse()) {
            try (final InputStream input = getInputStream(currEntry)) {
                sparseHeaders = TarUtils.parsePAX1XSparseHeaders(input, recordSize);
            }
            currEntry.setSparseHeaders(sparseHeaders);
            // data of the entry is after the pax gnu entry. So we need to update the data position once again
            currEntry.setDataOffset(currEntry.getDataOffset() + recordSize);
        }

        // sparse headers are all done reading, we need to build
        // sparse input streams using these sparse headers
        buildSparseInputStreams();
    }
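    // Illustrative example for the 0.1 format handled above: a GNU.sparse.map value of
    // "0,1024,8192,512" describes two data chunks, (offset=0, numbytes=1024) and
    // (offset=8192, numbytes=512), which TarUtils.parseFromPAX01SparseHeaders turns into
    // the corresponding TarArchiveStructSparse instances.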
    private void readGlobalPaxHeaders() throws IOException {
        try (InputStream input = getInputStream(currEntry)) {
            globalPaxHeaders = TarUtils.parsePaxHeaders(input, globalSparseHeaders, globalPaxHeaders, currEntry.getSize());
        }
        getNextTarEntry(); // Get the actual file entry

        if (currEntry == null) {
            throw new IOException("Error detected parsing the pax header");
        }
    }

    /**
     * Gets the next entry in this tar archive as long name data.
     *
     * @return The next entry in the archive as long name data, or null.
     * @throws IOException on error
     */
    private byte[] getLongNameData() throws IOException {
        final ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length;
        try (final InputStream in = getInputStream(currEntry)) {
            while ((length = in.read(smallBuf)) >= 0) {
                longName.write(smallBuf, 0, length);
            }
        }
        getNextTarEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }

        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            final byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry.
     *
     * @throws IOException when skipping the padding of the record fails
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && currEntry.getSize() > 0 && currEntry.getSize() % recordSize != 0) {
            final long numRecords = (currEntry.getSize() / recordSize) + 1;
            final long padding = (numRecords * recordSize) - currEntry.getSize();
            repositionForwardBy(padding);
            throwExceptionIfPositionIsNotInArchive();
        }
    }
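    // Illustrative arithmetic for the record padding skipped above: with the default 512-byte
    // record size, a 600-byte entry occupies two records (1024 bytes), so 424 bytes of padding
    // are skipped.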

    private void repositionForwardTo(final long newPosition) throws IOException {
        final long currPosition = archive.position();
        if (newPosition < currPosition) {
            throw new IOException("trying to move backwards inside of the archive");
        }
        archive.position(newPosition);
    }

    private void repositionForwardBy(final long offset) throws IOException {
        repositionForwardTo(archive.position() + offset);
    }

    /**
     * Checks if the current position of the SeekableByteChannel is in the archive.
     *
     * @throws IOException If the position is not in the archive
     */
    private void throwExceptionIfPositionIsNotInArchive() throws IOException {
        if (archive.size() < archive.position()) {
            throw new IOException("Truncated TAR archive");
        }
    }

    /**
     * Gets the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached. At the same time the {@code hasHitEOF} marker will be
     * set to true.
     *
     * @return The next record in the archive, or null if there is no next entry.
     * @throws IOException when reading the next record fails
     */
    private ByteBuffer getRecord() throws IOException {
        ByteBuffer headerBuf = readRecord();
        setAtEOF(isEOFRecord(headerBuf));
        if (isAtEOF() && headerBuf != null) {
            // Consume rest
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Tries to read the next record, resetting the position in the
     * archive if it is not an EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - ten records either, so we probably have already read
     * beyond the archive anyway.
     *
     * @throws IOException if reading the record or resetting the position in the archive fails
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset) {
                archive.position(archive.position() - recordSize);
            }
        }
    }

    /**
     * This method is invoked once the end of the archive is hit; it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        final long bytesReadOfLastBlock = archive.position() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            repositionForwardBy(blockSize - bytesReadOfLastBlock);
        }
    }
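    // Illustrative arithmetic for the block padding consumed above: with a block size of
    // 10240 bytes and the channel positioned at byte 5632, the remaining 4608 bytes of the
    // last block are skipped.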

    /**
     * Reads a record from the archive and returns the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException if reading from the archive fails
     */
    private ByteBuffer readRecord() throws IOException {
        recordBuffer.rewind();
        final int readNow = archive.read(recordBuffer);
        if (readNow != recordSize) {
            return null;
        }
        return recordBuffer;
    }

    /**
     * Gets all TAR Archive Entries from the TarFile.
     *
     * @return All entries from the tar file
     */
    public List<TarArchiveEntry> getEntries() {
        return new ArrayList<>(entries);
    }

    private boolean isEOFRecord(final ByteBuffer headerBuf) {
        return headerBuf == null || ArchiveUtils.isArrayZero(headerBuf.array(), recordSize);
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    protected final void setAtEOF(final boolean b) {
        hasHitEOF = b;
    }

    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Gets the input stream for the provided Tar Archive Entry.
     *
     * @param entry Entry to get the input stream from
     * @return Input stream of the provided entry
     * @throws IOException Corrupted TAR archive. Can't read entry.
     */
    public InputStream getInputStream(final TarArchiveEntry entry) throws IOException {
        try {
            return new BoundedTarEntryInputStream(entry, archive);
        } catch (RuntimeException ex) {
            throw new IOException("Corrupted TAR archive. Can't read entry", ex);
        }
    }

    @Override
    public void close() throws IOException {
        archive.close();
    }

    private final class BoundedTarEntryInputStream extends BoundedArchiveInputStream {

        private final SeekableByteChannel channel;

        private final TarArchiveEntry entry;

        private long entryOffset;

        private int currentSparseInputStreamIndex;

        BoundedTarEntryInputStream(final TarArchiveEntry entry, final SeekableByteChannel channel) throws IOException {
            super(entry.getDataOffset(), entry.getRealSize());
            if (channel.size() - entry.getSize() < entry.getDataOffset()) {
                throw new IOException("entry size exceeds archive size");
            }
            this.entry = entry;
            this.channel = channel;
        }

        @Override
        protected int read(final long pos, final ByteBuffer buf) throws IOException {
            if (entryOffset >= entry.getRealSize()) {
                return -1;
            }

            final int totalRead;
            if (entry.isSparse()) {
                totalRead = readSparse(entryOffset, buf, buf.limit());
            } else {
                totalRead = readArchive(pos, buf);
            }

            if (totalRead == -1) {
                if (buf.array().length > 0) {
                    throw new IOException("Truncated TAR archive");
                }
                setAtEOF(true);
            } else {
                entryOffset += totalRead;
                buf.flip();
            }
            return totalRead;
        }

        private int readSparse(final long pos, final ByteBuffer buf, final int numToRead) throws IOException {
            // if there are no actual input streams, just read from the original archive
            final List<InputStream> entrySparseInputStreams = sparseInputStreams.get(entry.getName());
            if (entrySparseInputStreams == null || entrySparseInputStreams.isEmpty()) {
                return readArchive(entry.getDataOffset() + pos, buf);
            }

            if (currentSparseInputStreamIndex >= entrySparseInputStreams.size()) {
                return -1;
            }

            final InputStream currentInputStream = entrySparseInputStreams.get(currentSparseInputStreamIndex);
            final byte[] bufArray = new byte[numToRead];
            final int readLen = currentInputStream.read(bufArray);
            if (readLen != -1) {
                buf.put(bufArray, 0, readLen);
            }

            // if the current input stream is the last input stream,
            // just return the number of bytes read from current input stream
            if (currentSparseInputStreamIndex == entrySparseInputStreams.size() - 1) {
                return readLen;
            }

            // if EOF of current input stream is met, open a new input stream and recursively call read
            if (readLen == -1) {
                currentSparseInputStreamIndex++;
                return readSparse(pos, buf, numToRead);
            }

            // if the rest of the data in the current input stream is not long enough, open a new
            // input stream and recursively call read
            if (readLen < numToRead) {
                currentSparseInputStreamIndex++;
                final int readLenOfNext = readSparse(pos + readLen, buf, numToRead - readLen);
                if (readLenOfNext == -1) {
                    return readLen;
                }
                return readLen + readLenOfNext;
            }

            // if the rest of the data in the current input stream is enough (readLen == numToRead), just return readLen
            return readLen;
        }

        private int readArchive(final long pos, final ByteBuffer buf) throws IOException {
            channel.position(pos);
            return channel.read(buf);
        }
    }
}



