package com.codetaco.funnel.publisher;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.ParseException;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codetaco.funnel.App;
import com.codetaco.funnel.Funnel;
import com.codetaco.funnel.FunnelDataPublisher;
import com.codetaco.funnel.aggregation.Aggregate;
import com.codetaco.funnel.columns.ColumnWriter;
import com.codetaco.funnel.parameters.DuplicateDisposition;
import com.codetaco.funnel.parameters.FunnelContext;
import com.codetaco.funnel.provider.FileSource;
import com.codetaco.funnel.provider.RandomAccessInputSource;
import com.codetaco.funnel.segment.SourceProxyRecord;
/**
* Base class shared by all funnel publishers. It buffers output, re-reads
* each record's original bytes from the input source, detects duplicate
* keys, and drives aggregation; concrete subclasses supply the output
* target via {@link #openOutput(FunnelContext)}.
*
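* <p>
* A minimal subclass sketch (hypothetical names; the real publishers in this
* package may differ):
* </p>
*
* <pre>{@code
* class StdoutPublisher extends AbstractPublisher {
*     StdoutPublisher(FunnelContext context) throws ParseException, IOException {
*         super(context);
*     }
*
*     // overrides AbstractPublisher.openOutput
*     void openOutput(FunnelContext context) throws IOException {
*         // any DataOutput works; the inherited writer field is the target
*         writer = new java.io.DataOutputStream(System.out);
*     }
* }
* }</pre>
*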
* @author Chris DeGreef [email protected]
*/
public abstract class AbstractPublisher implements FunnelDataPublisher, ColumnWriter
{
private static final Logger logger = LoggerFactory.getLogger(AbstractPublisher.class);
static final int WriteBufferSize = 1 << 15;
FunnelContext context;
DataOutput writer; // concrete output target, assigned by openOutput()
SourceProxyRecord previousItem; // most recently published record, used for duplicate detection
byte[] previousOriginalBytes; // original bytes of previousItem
RandomAccessInputSource originalFile; // random-access source of the original record bytes
byte[] originalBytes; // buffer holding the current record's original bytes
byte[] writeBuffer; // backing array for the output buffer
ByteBuffer bb; // wraps writeBuffer; its position tracks bytes awaiting flush
long writeCount;
long duplicateCount;
/**
* Constructor for AbstractPublisher. Selects the input source, opens the
* output, and allocates the write buffer.
*
* @param _context a {@link com.codetaco.funnel.parameters.FunnelContext}
* object for this run.
* @throws java.text.ParseException if initialization fails.
* @throws java.io.IOException if the input or output cannot be opened.
*/
public AbstractPublisher(final FunnelContext _context) throws ParseException, IOException
{
context = _context;
initialize();
writeBuffer = new byte[WriteBufferSize];
bb = ByteBuffer.wrap(writeBuffer, 0, WriteBufferSize);
logger.debug("write buffer size is {} bytes", WriteBufferSize);
}
/** {@inheritDoc} */
@Override
public void close() throws Exception
{
if (context.isAggregating() && previousItem != null)
{
/*
* Write the last aggregation to disk.
*/
formatOutputAndWrite(previousItem, previousOriginalBytes);
}
if (bb.position() != 0)
flushWritesToDisk();
originalFile.close();
context.outputCounters(duplicateCount, writeCount);
if (duplicateCount > 0)
logger.debug("{} duplicate rows", Funnel.ByteFormatter.format(duplicateCount));
logger.debug("{} rows written", Funnel.ByteFormatter.format(writeCount));
}
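/**
* Drain the buffered bytes to the underlying DataOutput and reset the
* buffer position so subsequent writes start at the beginning of the
* buffer.
*/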
void flushWritesToDisk() throws IOException
{
writer.write(bb.array(), 0, bb.position());
bb.position(0);
}
/**
* Format one record and write it to the output, then reset the aggregation
* state for the next key. Only the write counter and the aggregate reset
* are handled at this level; subclasses are expected to extend this with
* the actual formatting.
*
* @param item the proxy record being published
* @param rawData the record's original bytes
* @throws Exception if formatting or writing fails
*/
void formatOutputAndWrite(final SourceProxyRecord item, final byte[] rawData)
throws Exception
{
writeCount++;
/*
* Prepare the aggregations for the next set of data.
*/
Aggregate.reset(context);
}
/** {@inheritDoc} */
@Override
public long getDuplicateCount()
{
return duplicateCount;
}
/** {@inheritDoc} */
@Override
public long getWriteCount()
{
return writeCount;
}
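/*
* Select the source of original record bytes (the in-memory cache when the
* input is cached or comes from SYSIN, otherwise the file itself), open the
* output, and zero the counters.
*/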
private void initialize() throws ParseException, IOException
{
if (context.isCacheInput() || context.isSysin())
originalFile = context.inputCache;
else
originalFile = new FileSource(context);
try
{
openOutput(context);
} catch (final IOException e)
{
App.abort(-1, e);
}
writeCount = duplicateCount = 0;
}
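/*
* Re-read this record's original bytes from the input source into the
* originalBytes buffer, using the location and size captured on the proxy
* record.
*/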
void loadOriginalBytes(final int originalFileNumber, final SourceProxyRecord item)
throws IOException
{
originalFile.read(originalFileNumber, originalBytes, item.originalLocation, item.originalSize);
}
/**
* Write a line separator. This is a no-op at the abstract level; subclasses
* that produce line-oriented output are expected to override it.
*
* @throws IOException if the separator cannot be written
*/
void newLine() throws IOException
{
// New lines mean nothing at the abstract level.
}
/** {@inheritDoc} */
@Override
public void openInput() throws ParseException
{
try
{
originalFile.open();
} catch (final IOException e)
{
App.abort(-1, e);
}
}
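/**
* Open the concrete output target and assign {@link #writer}. Each
* publisher subclass supplies its own implementation.
*
* @param _context the current funnel context
* @throws IOException if the output cannot be opened
*/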
abstract void openOutput(final FunnelContext _context) throws IOException;
/** {@inheritDoc} */
@Override
public boolean publish(final SourceProxyRecord item, final long phase) throws Exception
{
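/*
* Overall flow: fetch the record's original bytes, compare against the
* previous record to detect out-of-order input and duplicates, then either
* aggregate, drop, or write according to the context settings.
*/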
/*
* Capture the original file number before clearing it on the proxy; it
* must not be lost because it is needed to read the original data.
*/
final int originalFileNumber = item.originalInputFileIndex;
item.originalInputFileIndex = 0;
int comparison = 0;
loadOriginalBytes(originalFileNumber, item);
item.getFunnelContext().columnHelper
.loadColumnsFromBytes(originalBytes, item.originalSize, item.getOriginalRecordNumber());
if (previousItem != null)
{
/*
* check to see if this item is in order, return false if not. The
* originalRecordNumber is only used to order duplicates. At this
* point it should not be used for comparisons since we want to make
* sure we know a duplicate has been found.
*/
comparison = previousItem.compareTo(item, false);
if (comparison > 0)
return false;
if (comparison == 0)
{
/*
* A duplicate record has been found.
*/
if (context.isAggregating())
{
/*
* Rather than write anything during an aggregation run we
* just aggregate until the key changes.
*/
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
return true;
}
duplicateCount++;
if (DuplicateDisposition.FirstOnly == context.getDuplicateDisposition()
|| DuplicateDisposition.LastOnly == context.getDuplicateDisposition())
{
/*
* The sort places the duplicate we want to retain first, and a
* record only counts as a duplicate once that retained copy has
* been seen, so every subsequent duplicate can simply be ignored.
*/
return true;
}
} else if (context.isUserSpecifiedOrder() && context.isAggregating())
{
/*
* With user-specified ordering there is no orderBy key to change, so
* aggregates operate over the entire file.
*/
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
return true;
}
} else
{
publishHeader();
if (context.isAggregating())
{
/*
* Never write the first record when aggregating. Wait until the
* key changes.
*/
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
previousOriginalBytes = Arrays.copyOf(originalBytes, item.originalSize);
previousItem = item;
return true;
}
}
if (context.isAggregating())
{
/*
* We must reload the previous values into the columns since the new
* set of records has already started.
*/
item.getFunnelContext().columnHelper
.loadColumnsFromBytes(previousOriginalBytes, previousItem.originalSize, previousItem
.getOriginalRecordNumber());
formatOutputAndWrite(previousItem, previousOriginalBytes);
/*
* Now reload the newest record into the columns for processing.
*/
item.getFunnelContext().columnHelper
.loadColumnsFromBytes(originalBytes, item.originalSize, item.getOriginalRecordNumber());
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
} else
formatOutputAndWrite(item, originalBytes);
/*
* Release the previous proxy record so its instance can be reused.
*/
if (previousItem != null)
previousItem.release();
previousItem = item;
previousOriginalBytes = Arrays.copyOf(originalBytes, item.originalSize);
return true;
}
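/*
* Emit the configured header once, before the first record is written.
*/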
void publishHeader() throws IOException
{
if (context.headerOutHelper.isWaitingToWrite())
{
context.headerOutHelper.format(context, this);
newLine();
}
}
/** {@inheritDoc} */
@Override
public void reset() throws IOException, ParseException
{
initialize();
if (previousItem != null)
{
previousItem.release();
previousItem = null;
}
}
/** {@inheritDoc} */
@Override
public void write(final byte[] sourceBytes, final int offset, final int length) throws IOException
{
if (length + bb.position() >= WriteBufferSize)
flushWritesToDisk();
if (length > WriteBufferSize)
{
/*
* The data is larger than the whole buffer; write it straight through
* rather than overflowing the ByteBuffer.
*/
writer.write(sourceBytes, offset, length);
return;
}
bb.put(sourceBytes, offset, length);
}
}