// All downloads are free. Search and download functionality uses the official Maven repository.

// net.sansa_stack.hadoop.core.SeekableSourceOverSplit (Maven / Gradle / Ivy)

package net.sansa_stack.hadoop.core;


import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.aksw.commons.io.buffer.array.ArrayOps;
import org.aksw.commons.io.buffer.array.BufferOverReadableChannel;
import org.aksw.commons.io.hadoop.SeekableInputStream;
import org.aksw.commons.io.hadoop.SeekableInputStreams;
import org.aksw.commons.io.input.ReadableChannel;
import org.aksw.commons.io.input.ReadableChannelWithConditionalBound;
import org.aksw.commons.io.input.ReadableChannels;
import org.aksw.commons.io.input.SeekableReadableChannel;
import org.aksw.commons.io.input.SeekableReadableChannelBase;
import org.aksw.commons.io.input.SeekableReadableChannelSource;
import org.aksw.commons.io.input.SeekableReadableChannelWithLimit;
import org.aksw.commons.io.input.SeekableReadableChannels;
import org.aksw.commons.util.lock.LockUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Seekable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.primitives.Ints;

import net.sansa_stack.hadoop.util.DeferredSeekablePushbackInputStream;

public class SeekableSourceOverSplit
    implements SeekableReadableChannelSource, Closeable
{
    private static final Logger logger = LoggerFactory.getLogger(SeekableSourceOverSplit.class);
    	
    /**
     * Closes the data suppliers of the head, tail and postamble buffers (when present and still open),
     * then the debuffered head channel (when present) and finally the base stream.
     *
     * Every resource is attempted even if closing an earlier one fails. The first failure is
     * rethrown once all attempts were made; later failures are attached to it as suppressed exceptions.
     *
     * @throws IOException if closing one of the underlying resources failed with an I/O error
     */
    @Override
    public void close() throws IOException {
        Exception failure = null;
        failure = closeSupplier(headBuffer, failure);
        failure = closeSupplier(tailBuffer, failure);
        failure = closeSupplier(postambleBuffer, failure);
        if (debufferedHead != null) {
            failure = closeResource(debufferedHead, failure);
        }
        failure = closeResource(baseStream, failure);

        if (failure instanceof IOException) {
            throw (IOException) failure;
        } else if (failure instanceof RuntimeException) {
            throw (RuntimeException) failure;
        }
    }

    /** Closes the buffer's data supplier if there is one and it is still open; returns the updated pending failure. */
    private static Exception closeSupplier(BufferOverReadableChannel buffer, Exception pending) {
        ReadableChannel supplier = buffer.getDataSupplier();
        boolean needsClose = supplier != null && supplier.isOpen();
        return needsClose ? closeResource(supplier, pending) : pending;
    }

    /** Closes the resource; a failure becomes the pending failure or is added to it as suppressed. */
    private static Exception closeResource(Closeable resource, Exception pending) {
        try {
            resource.close();
        } catch (IOException | RuntimeException e) {
            if (pending == null) {
                return e;
            }
            pending.addSuppressed(e);
        }
        return pending;
    }
    // protected SeekableReadableChannel base;

    /** The total number of bytes that need to be read from base until the split boundary is reached.
     * A value of -1 indicates unknown. For non-encoded streams this is simply the length of the split. */
    // protected long knownDecodedDataLength; // [] = new long[]{ isEncoded ? -1 : splitLength };

    // The underlying channel. It is closed last by close(), and create() uses it as the data
    // supplier of the tail buffer.
    // The head stream (the data supplier of headBuffer) has a conditional bound at the split end.
    protected ReadableChannel baseStream;

    // headBuffer is the buffer with index 0 and tailBuffer the one with index 1
    // (see getBufferByIndexUnsafe).
    protected BufferOverReadableChannel headBuffer;
    protected BufferOverReadableChannel tailBuffer;

    /**
     * The postamble buffer (index 2) is only served if a limit is set via {@link Channel#setLimit(long)}.
     * If no limit is set then the remainder of the stream is consumed, which is assumed to include the postamble.
     */
    protected BufferOverReadableChannel postambleBuffer;


    // Assigned by Channel.debufferHead(). A non-null value marks the "debuffered" state, in which
    // getBufferByIndex(0) and newReadableChannel() throw.
    protected SeekableReadableChannel debufferedHead;


    /**
     * Maps the absolute start offset of each region to the index of the buffer that serves it
     * (0 = head, 1 = tail, 2 = postamble).
     * A later stream with the same offset overrides a prior one (implies that the prior one was empty).
     */
    protected NavigableMap<Long, Integer> posToIndex = new TreeMap<>();

    /**
     * Maps byte offsets (starting from zero) to block offsets of the underlying stream;
     * {@code null} if the underlying stream is not based on blocks.
     */
    protected NavigableMap<Long, Long> absPosToBlockOffset = null;

//    protected boolean isEndReached = false;


    /**
     * Looks up the block offset recorded for the greatest byte offset that is less than or equal
     * to the given position.
     *
     * @param pos a byte offset (in the coordinate space of the keys of the block map)
     * @return the block offset of the matching entry
     * @throws IllegalStateException if no block map is available (the stream is not based on blocks)
     * @throws IllegalArgumentException if no block is recorded at or before {@code pos}
     */
    public long getBlockForPos(long pos) {
        if (absPosToBlockOffset == null) {
            throw new IllegalStateException("No block offsets available because the underlying stream is not based on blocks");
        }
        Map.Entry<Long, Long> entry = absPosToBlockOffset.floorEntry(pos);
        if (entry == null) {
            throw new IllegalArgumentException("No block recorded at or before position " + pos);
        }
        return entry.getValue();
    }

//    public boolean isEndReached() {
//    	return isEndReached;
//    }

    /**
     * Computes the size known so far: the start offset of the last registered region plus the
     * known data size of the buffer serving that region.
     *
     * @return the start offset of the last region plus its buffer's known data size
     */
    public long getKnownSize() {
        Map.Entry<Long, Integer> lastRegion = posToIndex.lastEntry();
        // The unsafe lookup is used on purpose: this must also work while the head is debuffered.
        BufferOverReadableChannel lastBuffer = getBufferByIndexUnsafe(lastRegion.getValue());
        return lastRegion.getKey() + lastBuffer.getKnownDataSize();
    }

    /** If true then the headStream can no longer be used. */
    // protected boolean isHeadDebuffered;

    /**
     * Creates a source from pre-built buffers and registers the head region at offset 0.
     *
     * @param baseStream the underlying channel; closed by {@link #close()}
     * @param headBuffer the buffer served as region index 0
     * @param tailBuffer the buffer served as region index 1
     * @param postambleBuffer the buffer served as region index 2
     * @param absPosToBlockOffset map of byte offsets to block offsets, or {@code null} if the
     *        underlying stream is not based on blocks
     */
    public SeekableSourceOverSplit(
            ReadableChannel baseStream,
            BufferOverReadableChannel headBuffer,
            BufferOverReadableChannel tailBuffer,
            BufferOverReadableChannel postambleBuffer,
            NavigableMap<Long, Long> absPosToBlockOffset) {
        this.baseStream = baseStream;
        this.headBuffer = headBuffer;
        this.tailBuffer = tailBuffer;
        this.postambleBuffer = postambleBuffer;
        this.absPosToBlockOffset = absPosToBlockOffset;
        // The head buffer (index 0) always starts at absolute offset 0.
        this.posToIndex.put(0L, 0);
    }

    /**
     * @return null if the underlying stream is not based on blocks; otherwise a map of byte-offsets (starting from zero) to block offsets
     */
    public NavigableMap<Long, Long> getAbsPosToBlockOffset() {
        return absPosToBlockOffset;
    }

    /**
     * Returns the buffer of the region that starts exactly at the given offset.
     *
     * @param baseOffset the start offset of a registered region
     * @return the buffer for that region (may be null for an index without a buffer)
     * @throws IllegalStateException if no region starts at {@code baseOffset}, or if the head
     *         buffer is requested while in debuffered state
     */
    protected BufferOverReadableChannel getBufferByBaseOffset(long baseOffset) {
        Integer index = posToIndex.get(baseOffset);
        if (index == null) {
            // Previously this surfaced as a NullPointerException from auto-unboxing.
            throw new IllegalStateException("No buffer region starts at offset " + baseOffset);
        }
        return getBufferByIndex(index);
    }


    /**
     * Returns the buffer for the given index, refusing to hand out the head buffer (index 0)
     * once the head has been debuffered.
     *
     * @param index 0 = head, 1 = tail, 2 = postamble
     * @return the buffer, or null for any other index
     * @throws IllegalStateException if index 0 is requested while in debuffered state
     */
    protected BufferOverReadableChannel getBufferByIndex(int index) {
        boolean headRequestedWhileDebuffered = index == 0 && debufferedHead != null;
        if (headRequestedWhileDebuffered) {
            throw new IllegalStateException("Should never be called if in debuffered state");
        }
        BufferOverReadableChannel buffer = getBufferByIndexUnsafe(index);
        return buffer;
    }


    /**
     * Maps a buffer index to its buffer without checking the debuffered state.
     *
     * @param index 0 = head, 1 = tail, 2 = postamble
     * @return the matching buffer, or null for any other index
     */
    protected BufferOverReadableChannel getBufferByIndexUnsafe(int index) {
        if (index == 0) {
            return headBuffer;
        } else if (index == 1) {
            return tailBuffer;
        } else if (index == 2) {
            return postambleBuffer;
        }
        return null;
    }


    /**
     * Registers the tail region directly after the head region, i.e. at the head's start offset
     * plus the head buffer's known data size.
     *
     * @throws IllegalStateException if the last registered region is not the head region, if the
     *         head buffer's data supplier has not been consumed yet, or if the head is debuffered
     */
    protected void setupTailBuffer() {
        // posToIndex always contains at least the head entry added by the constructor.
        Map.Entry<Long, Integer> lastRegion = posToIndex.lastEntry();
        long currentOffset = lastRegion.getKey();
        int currentIndex = lastRegion.getValue();

        if (currentIndex != 0) {
            throw new IllegalStateException("Method may only be called during reads from the head buffer");
        }

        int nextIndex = currentIndex + 1;
        BufferOverReadableChannel nextBuffer = getBufferByIndex(nextIndex);
        if (nextBuffer == null) {
            return;
        }

        BufferOverReadableChannel currentBuffer = getBufferByIndex(currentIndex);
        // Sanity check: the start of the next region is only known once the current one is exhausted.
        if (!currentBuffer.isDataSupplierConsumed()) {
            throw new IllegalStateException("Attempt to set up the next buffer although the current one has not been exhausted.");
        }

        long nextOffset = currentOffset + currentBuffer.getKnownDataSize();
        posToIndex.put(nextOffset, nextIndex);
    }

    /** @return the buffer serving the head region (index 0) */
    public BufferOverReadableChannel getHeadBuffer() {
        return this.headBuffer;
    }

    /** @return the buffer serving the tail region (index 1) */
    public BufferOverReadableChannel getTailBuffer() {
        return this.tailBuffer;
    }

    /**
     * Creates a source over a stream that is not encoded. The head stream is the
     * (close-shielded) input wrapped with a limit of {@code splitPoint}; no block map is recorded.
     *
     * @param in the seekable input stream
     * @param splitPoint the limit passed to the head stream
     * @param postambleBytes the bytes that back the postamble buffer
     * @return the new source
     */
    public static SeekableSourceOverSplit createForNonEncodedStream(SeekableInputStream in, long splitPoint, byte[] postambleBytes) {
        SeekableReadableChannel wrappedInput = SeekableInputStreams.wrap(in);
        // The shield keeps a close of the head stream from closing the shared input.
        SeekableReadableChannel limitedHead =
                new SeekableReadableChannelWithLimit<>(SeekableReadableChannels.closeShield(wrappedInput), splitPoint);

        return create(wrappedInput, limitedHead, postambleBytes, null);
    }
    /**
     * Creates a source over a block-encoded stream.
     *
     * While data is read, every change of the position reported by {@code inn} is recorded in
     * a map from the number of bytes read so far to the newly reported position. The head stream
     * ends as soon as the position of the wrapped stream is at or beyond {@code splitPoint}.
     *
     * @param inn the seekable input stream; its {@code position()} is consulted around every read
     * @param splitPoint the position at or beyond which the head stream reports its end
     * @param postambleBytes the bytes that back the postamble buffer
     * @return the new source, carrying the recorded block map
     */
    public static SeekableSourceOverSplit createForBlockEncodedStream(SeekableInputStream inn, long splitPoint, byte[] postambleBytes) {
        NavigableMap<Long, Long> absPosToBlockOffset = new TreeMap<>();

        // Not ideal to use the position without a guaranteed prior read
        absPosToBlockOffset.put(0L, inn.position());

        logger.debug("Detected first block in encoded stream at offset: {}", absPosToBlockOffset);

        // Wrap the input stream such that the position always refers to the next byte being read
        InputStream in1 = new DeferredSeekablePushbackInputStream(inn) {
            // Number of bytes handed out so far; used as the key for newly detected blocks
            protected long readCount = 0;

            @Override
            protected int readInternal(byte[] b, int off, int len) throws IOException {
                long before = inn.position();
                int result = super.readInternal(b, off, len);
                long after = inn.position();

                // A changed position is taken as the start of a new block
                if (after != before) {
                    absPosToBlockOffset.put(readCount, after);
                }
                if (result > 0) {
                    readCount += result;
                }
                return result;
            }
        };

        // We need the position() functionality of the baseStream - but we won't be using its seeking capabilities
        SeekableReadableChannel baseStream = SeekableInputStreams.wrap(SeekableInputStreams.create(in1, (Seekable)in1));

        // Wrap the stream that when reading past the split point any data is buffered with the tailBuffer
        ReadableChannel headStream = new ReadableChannelWithConditionalBound<>(baseStream,
            self -> {
                long pos = baseStream.position();
                boolean isEof = pos >= splitPoint;
                if (isEof) {
                    logger.debug("Found first block after split {} at {}", splitPoint, pos);
                }
                return isEof;
            });

        return create(baseStream, ReadableChannels.closeShield(headStream), postambleBytes, absPosToBlockOffset);
    }

    /**
     * Builds the head, tail and postamble buffers and assembles the source from them.
     *
     * @param baseStream the underlying channel; also the data supplier of the tail buffer
     * @param headStream the data supplier of the head buffer
     * @param postambleBytes the bytes that back the postamble buffer
     * @param blockOffsetToAbsPos passed through as the source's block map (may be null)
     * @return the new source
     */
    protected static SeekableSourceOverSplit create(
            ReadableChannel baseStream, ReadableChannel headStream, byte[] postambleBytes, NavigableMap blockOffsetToAbsPos) {
        BufferOverReadableChannel head = BufferOverReadableChannel.createForBytes(headStream, 8192);
        BufferOverReadableChannel tail = BufferOverReadableChannel.createForBytes(baseStream, 8192);

        ReadableChannel postambleChannel = ReadableChannels.wrap(new ByteArrayInputStream(postambleBytes));
        BufferOverReadableChannel postamble = BufferOverReadableChannel.createForBytes(postambleChannel, 8192);

        return new SeekableSourceOverSplit(baseStream, head, tail, postamble, blockOffsetToAbsPos);
    }

    /**
     * Returns the start offset of the tail region (index 1), which equals the size of the head.
     *
     * @return the offset at which the tail region was registered
     * @throws IllegalStateException if no tail region has been registered yet
     */
    public long getHeadSize() {
        // Entries are visited in ascending offset order, so the first match wins.
        for (Map.Entry<Long, Integer> region : posToIndex.entrySet()) {
            if (region.getValue() == 1) {
                return region.getKey();
            }
        }
        throw new IllegalStateException("Head size not yet detected");
    }

    /**
     * Opens a new channel that starts on the head buffer at region offset 0, with no pending
     * position request and no transition action.
     *
     * @return a new channel over this source
     * @throws RuntimeException if the head has already been debuffered
     */
    @Override
    public Channel newReadableChannel() throws IOException {
        boolean alreadyDebuffered = debufferedHead != null;
        if (alreadyDebuffered) {
            throw new RuntimeException("Already debuffered");
        }

        // The util method BufferOverReadableChannel.newBufferedChannel cannot be used here
        // because the channel it returns is not seekable.
        return new Channel(headBuffer.newReadableChannel(), 0, -1, null);
    }

//    @Override
//    public SeekableReadableChannel newReadableChannel(long offset) throws IOException {
//        return ne
//    }

    /**
     * @return the sum of the known data sizes of the head and the tail buffer
     */
    @Override
    public long size() throws IOException {
        long headSize = headBuffer.getKnownDataSize();
        long tailSize = tailBuffer.getKnownDataSize();
        return headSize + tailSize;
    }

    /** @return the array operations for byte arrays ({@code ArrayOps.BYTE}) */
    @Override
    public ArrayOps getArrayOps() {
        ArrayOps byteOps = ArrayOps.BYTE;
        return byteOps;
    }

    class Channel
        extends SeekableReadableChannelBase
    {
        // The channel that read() currently delegates to (a channel over one of the buffers,
        // or the debuffered head).
        protected SeekableReadableChannel currentStream;
        // protected boolean isHeadStream;

        // The offset at which the currentStream starts
        // protected int currentStreamId;
        protected long currentStreamOffset;

        // Position requested via position(long) that has not been applied yet.
        // A negative value means there is no pending request (applyPosition resets it to -1).
        protected long requestedPos;

        // protected long limitPos;

        // Optional callback run by transition() after read() has moved past the exhausted head stream.
        protected Runnable transitionAction;

        // read() holds the read lock; closeActual() takes the write lock and debufferHead()
        // requires the write lock to be held.
        protected ReentrantReadWriteLock rwl = new ReentrantReadWriteLock();

        /**
         * @param currentStream the channel to read from initially
         * @param currentStreamOffset the absolute offset at which {@code currentStream} starts
         * @param requestedPos a pending absolute position, or a negative value for none
         * @param transitionAction callback for the head-to-tail transition; may be null
         */
        public Channel(SeekableReadableChannel currentStream, long currentStreamOffset, long requestedPos, Runnable transitionAction) {
            this.transitionAction = transitionAction;
            this.requestedPos = requestedPos;
            this.currentStreamOffset = currentStreamOffset;
            this.currentStream = currentStream;
        }

        /** @return the lock whose read side guards read() and whose write side guards closing and debuffering */
        public ReadWriteLock getReadWriteLock() {
            return this.rwl;
        }

        /** True iff the next call to read() reads from the head stream */
        public boolean isHeadStream() {
            int regionIndex = posToIndex.get(currentStreamOffset);
            if (regionIndex != 0) {
                return false;
            }

            // Being positioned exactly at the end of the head stream does not count:
            // in that case read() has already registered the tail offset (and run the
            // transition action), so more than one region is known.
            return posToIndex.size() == 1;
        }

        /** @return true once the enclosing source holds a debuffered head channel */
        protected boolean isDebuffered() {
            boolean debuffered = debufferedHead != null;
            return debuffered;
        }

        /**
         * Stops buffering the head stream: from the current position on, data is read from the
         * not-yet-consumed part of the head buffer followed directly by the head buffer's
         * original data supplier.
         *
         * Nothing happens if the next read would not come from the head stream
         * (see {@link #isHeadStream()}).
         *
         * @throws IllegalStateException if the channel's write lock is not locked
         * @throws RuntimeException if the head is already debuffered, or if opening a channel over
         *         the buffered data fails
         */
        public void debufferHead() {
            if (!rwl.isWriteLocked()) {
                throw new IllegalStateException("Debuffering requires the channel's write lock to be locked");
            }

            if (isDebuffered()) {
                throw new RuntimeException("Already debuffered");
            }

            if (isHeadStream()) {
                long pos = position();
                long bufferSize = headBuffer.getKnownDataSize();
                ReadableChannel bufferChannel;

                // Detach the supplier from the buffer; from here on it is read directly.
                ReadableChannel headDataSupplier = headBuffer.getDataSupplier();
                headBuffer.setDataSupplier(null); // TODO Set an always failing one because it should no longer be used.
                try {
                    // Already buffered data at or after pos still has to be served first
                    bufferChannel = pos < bufferSize
                            ? headBuffer.getBuffer().newReadableChannel(pos)
                            : null;
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }

                ReadableChannel debuffered = bufferChannel == null
                        ? headDataSupplier
                        : ReadableChannels.concat(Arrays.asList(bufferChannel, headDataSupplier));

                // The resulting channel starts at pos
                debufferedHead = SeekableReadableChannels.wrapForwardSeekable(debuffered, pos);
                IOUtils.closeQuietly(currentStream);
                currentStream = debufferedHead;
            }
            // BufferOverReadableChannel.newBufferedChannel(headBuffer);
        }

        /**
         * Creates a new channel over a clone of the current stream. The clone starts with this
         * channel's effective position as its pending position and shares the transition action.
         *
         * @throws RuntimeException wrapping any exception raised while cloning
         */
        @Override
        public SeekableReadableChannel cloneObject() {
            try {
                long effectivePos = position();
                SeekableReadableChannel streamCopy = currentStream.cloneObject();
                return new Channel(streamCopy, currentStreamOffset, effectivePos, transitionAction);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * @return the pending requested position if there is one (non-negative), otherwise the
         *         absolute position derived from the current stream
         */
        @Override
        public long position() {
            if (requestedPos >= 0) {
                return requestedPos;
            }
            return getInternalPosition();
        }

        /**
         * Records the requested absolute position. It is only applied on the next read().
         *
         * @param pos the absolute position to read from next
         */
        @Override
        public void position(long pos) {
            requestedPos = pos;
        }


        /**
         * Applies the pending {@code requestedPos}: selects the region that contains it, switches
         * {@code currentStream} to that region's buffer if needed and positions the stream.
         * Afterwards {@code requestedPos} is reset to -1 (no pending request).
         *
         * If the requested position lies beyond the data of the head buffer and the head's data
         * supplier is consumed, the tail region is registered and the lookup is repeated. If no
         * further region becomes available, the stream is positioned at the end of the known data.
         *
         * @throws IOException if closing, opening or loading a stream fails
         */
        protected void applyPosition() throws IOException {
            // Absolute position before any seeking; computed once, outside the loop
            long currentAbsPos = getInternalPosition();

            while (true) {
                // Region whose start offset is the greatest one not exceeding requestedPos
                long requestedBaseOffset = posToIndex.floorKey(requestedPos);
                Integer requestedIndex = posToIndex.get(requestedBaseOffset);

                long requiredAdditionalBytes = requestedPos - currentAbsPos;
                long currentRelPos = requestedPos - requestedBaseOffset;

                if (requestedIndex == 0 && isDebuffered()) {
                    // In debuffered state the head is served by currentStream itself (no buffer lookup)
                    // logger.debug("Debuffered stream pos:" + currentStream.position());
                    // logger.debug("Requested pos: " + currentRelPos);
                    currentStream.position(currentRelPos);
                    break;
                    // currentStreamOffset = 0;
                } else {

                    BufferOverReadableChannel requestedBuffer = getBufferByBaseOffset(requestedBaseOffset);
                    if (requestedBaseOffset != currentStreamOffset) {
                        // The request falls into another region: replace the stream
                        currentStream.close();
                        // currentStream = BufferOverReadableChannel.newBufferedChannel(currentBuffer);
                        currentStream = requestedBuffer.newReadableChannel();
                        currentStreamOffset = requestedBaseOffset;
                    }

                    if (requiredAdditionalBytes > 0) {
                        // Forward seek - presumably this makes the buffer load data up to the
                        // relative target position; confirm against BufferOverReadableChannel.loadFully
                        // TODO Make loadFully accept a long argument
                        requestedBuffer.loadFully(Ints.checkedCast(currentRelPos), true);
                    }

                    long knownDataSize = requestedBuffer.getKnownDataSize();
                    // The target is usable if it is inside the known data, or exactly at its end
                    // while the supplier may still deliver more
                    if (currentRelPos < knownDataSize || (currentRelPos == knownDataSize && !requestedBuffer.isDataSupplierConsumed())) {
                        currentStream.position(currentRelPos);
                        break;
                    } else {
                        // The target is at or beyond the end of this buffer's data
                        int currentStreamIdx = posToIndex.get(currentStreamOffset);
                        if (currentStreamIdx == 0 && requestedBuffer.isDataSupplierConsumed()) {
                            // The head is exhausted, so the start of the tail region is now known
                            setupTailBuffer();
                        }

                        long nextRequestedBaseOffset = posToIndex.floorKey(requestedPos);
                        if (requestedBaseOffset == nextRequestedBaseOffset) {
                            // No later region covers the target: stay at the end of the known data
                            currentStream.position(knownDataSize);
                            break;
                        } else {
                            // A later region now covers the target: resolve again
                            continue;
                        }
                    }
                }
            }

            requestedPos = -1;
        }

        /**
         * @return the absolute position: the start offset of the current stream plus the
         *         stream's own position
         */
        protected long getInternalPosition() {
            return currentStreamOffset + currentStream.position();
        }

        /**
         * Registers the postamble region (index 2) at the given absolute position.
         *
         * @param newLimitPos the offset at which the postamble region starts
         * @throws IllegalStateException unless the highest registered region index is exactly 1
         */
        void setLimit(long newLimitPos) {
            // Highest buffer index registered so far; -1 if there is none
            int highestIndex = -1;
            boolean first = true;
            for (int regionIndex : posToIndex.values()) {
                if (first || regionIndex > highestIndex) {
                    highestIndex = regionIndex;
                    first = false;
                }
            }

            if (highestIndex != 1) {
                throw new IllegalStateException("Limit can only be set once and only if data has been read from the tail region");
            }
            posToIndex.put(newLimitPos, 2);
        }

        /**
         * Reads up to {@code length} bytes into {@code array} starting at index {@code position}.
         *
         * A pending position request is applied first. A single call never reads across the
         * start of the next registered region. When the current stream is exhausted, the channel
         * moves on to the next region; if the exhausted stream was the head stream, the tail
         * region is registered at that point and the transition action is run.
         *
         * @param array the destination array
         * @param position the index in {@code array} at which to store the first byte
         * @param length the maximum number of bytes to read
         * @return the number of bytes read, 0 if {@code length} is 0, or -1 if no further region
         *         could be entered
         * @throws IOException if reading or repositioning fails
         */
        @Override
        public int read(byte[] array, int position, int length) throws IOException {
            int result;
            if (length == 0) {
                result = 0;
            } else {
                Lock readLock = rwl.readLock();
                readLock.lock();
                try {
                    while (true) {
                        if (requestedPos >= 0) {
                            applyPosition();
                        }

                        // Cap the request at the start of the next registered region
                        int l = adjustLength(length);
                        if (l <= 0) {
                            // Positioned at a region boundary: re-request the current position so
                            // that applyPosition() switches to the region starting there
                            long p = position();
                            position(p);
                            continue;
                        } else {
                            result = currentStream.read(array, position, l);
                            if (result == -1) {
                                boolean ihs = isHeadStream();
                                // Remember the stream to detect below whether a switch took place
                                Object cs = currentStream;
                                long currentSize = ihs && isDebuffered()
                                        ? currentStream.position() - currentStreamOffset
                                        : getBufferByBaseOffset(currentStreamOffset).getKnownDataSize();

//                                long csPos = currentStream.position();
//                                long currentSize = csPos - currentStreamOffset; // getBufferByBaseOffset(currentStreamOffset).getKnownDataSize();


                                boolean exhaustedHeadStream = ihs; // We exhausted the stream by reading -1 -  && headBuffer.isDataSupplierConsumed(); // isHeadStream();
                                // Request the position directly after the exhausted region
                                long newPos = currentStreamOffset + currentSize;
                                position(newPos);
                                if (exhaustedHeadStream) {
                                    // The tail region starts where the head ended
                                    posToIndex.put(newPos, 1);
                                    // setupTailBuffer();
                                }
                                applyPosition();

                                // If we did not move to a new stream then we reached the end
                                if (currentStream == cs) {
                                    break;
                                }

                                if (exhaustedHeadStream) {
                                    // currentStream.close();
                                    // isHeadStream = false;
                                    // currentStream = tailBuffer.newReadableChannel();

                                    transition();
                                }
                                continue;
                                // l = adjustLength(length);
                                // result = l <= 0 ? (length > 0 ? -1 : 0) : currentStream.read(array, position, l);
                            }
                            // requestedPos = -1; // getInternalPosition();
                        }
                        break;
                    }
                } finally {
                    readLock.unlock();
                }
            }
            // Intentionally empty: leftover hook for end-of-stream handling
            if (result == -1) {
                // isEndReached = true;
                // System.out.println("EOF reached");
            }

            return result;
        }


        /**
         * Caps a requested read length so that a read does not cross the start of the next
         * registered region.
         *
         * @param length the requested number of bytes
         * @return {@code length} if no region starts after the current one; otherwise the smaller
         *         of {@code length} and the distance from the current position to the next
         *         region's start (saturated to the int range)
         */
        public int adjustLength(int length) {
            Long nextRegionStart = posToIndex.higherKey(currentStreamOffset);
            if (nextRegionStart == null) {
                return length;
            }

            long remaining = nextRegionStart - position();
            return Math.min(length, Ints.saturatedCast(remaining));
        }

        /** @return the array operations for byte arrays ({@code ArrayOps.BYTE}) */
        @Override
        public ArrayOps getArrayOps() {
            ArrayOps byteOps = ArrayOps.BYTE;
            return byteOps;
        }

        /**
         * Closes the current stream and then runs the superclass close logic, both while holding
         * the channel's write lock.
         */
        @Override
        protected void closeActual() throws Exception {
            Lock exclusiveLock = rwl.writeLock();
            LockUtils.runWithLock(exclusiveLock, () -> {
                currentStream.close();
                super.closeActual();
            });
        }

        /**
         * @param transitionAction the callback to run on the head-to-tail transition; null disables it
         */
        public void setTransitionAction(Runnable transitionAction) {
            Runnable action = transitionAction;
            this.transitionAction = action;
        }

        /** Runs the transition action, if one is set. */
        protected void transition() {
            if (transitionAction == null) {
                return;
            }
            transitionAction.run();
        }

        /** @return the source this channel was created from */
        SeekableSourceOverSplit getEnclosingInstance() {
            SeekableSourceOverSplit owner = SeekableSourceOverSplit.this;
            return owner;
        }
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy