package com.codetaco.funnel.publisher;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.ParseException;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codetaco.funnel.App;
import com.codetaco.funnel.Funnel;
import com.codetaco.funnel.FunnelDataPublisher;
import com.codetaco.funnel.aggregation.Aggregate;
import com.codetaco.funnel.columns.ColumnWriter;
import com.codetaco.funnel.parameters.DuplicateDisposition;
import com.codetaco.funnel.parameters.FunnelContext;
import com.codetaco.funnel.provider.FileSource;
import com.codetaco.funnel.provider.RandomAccessInputSource;
import com.codetaco.funnel.segment.SourceProxyRecord;
/**
* Base class shared by all funnel publishers. It buffers output, re-reads
* each record's original bytes from the input source, detects duplicate
* keys, and drives aggregation; concrete subclasses supply the output
* target via {@link #openOutput(FunnelContext)}.
*
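* <p>
* A minimal subclass sketch (hypothetical names; the real publishers in this
* package may differ):
* </p>
*
* <pre>{@code
* class StdoutPublisher extends AbstractPublisher {
*     StdoutPublisher(FunnelContext context) throws ParseException, IOException {
*         super(context);
*     }
*
*     // overrides AbstractPublisher.openOutput
*     void openOutput(FunnelContext context) throws IOException {
*         // any DataOutput works; the inherited writer field is the target
*         writer = new java.io.DataOutputStream(System.out);
*     }
* }
* }</pre>
*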
* @author Chris DeGreef [email protected]
*/
public abstract class AbstractPublisher implements FunnelDataPublisher, ColumnWriter
{
private static final Logger logger = LoggerFactory.getLogger(AbstractPublisher.class);
static final int WriteBufferSize = 1 << 15;
FunnelContext context;
DataOutput writer; // concrete output target, assigned by openOutput()
SourceProxyRecord previousItem; // most recently published record, used for duplicate detection
byte[] previousOriginalBytes; // original bytes of previousItem
RandomAccessInputSource originalFile; // random-access source of the original record bytes
byte[] originalBytes; // buffer holding the current record's original bytes
byte[] writeBuffer; // backing array for the output buffer
ByteBuffer bb; // wraps writeBuffer; its position tracks bytes awaiting flush
long writeCount;
long duplicateCount;
/**
* Constructor for AbstractPublisher. Selects the input source, opens the
* output, and allocates the write buffer.
*
* @param _context a {@link com.codetaco.funnel.parameters.FunnelContext}
* object for this run.
* @throws java.text.ParseException if initialization fails.
* @throws java.io.IOException if the input or output cannot be opened.
*/
public AbstractPublisher(final FunnelContext _context) throws ParseException, IOException
{
context = _context;
initialize();
writeBuffer = new byte[WriteBufferSize];
bb = ByteBuffer.wrap(writeBuffer, 0, WriteBufferSize);
logger.debug("write buffer size is {} bytes", WriteBufferSize);
}
/** {@inheritDoc} */
@Override
public void close() throws Exception
{
if (context.isAggregating() && previousItem != null)
{
/*
* Write the last aggregation to disk.
*/
formatOutputAndWrite(previousItem, previousOriginalBytes);
}
if (bb.position() != 0)
flushWritesToDisk();
originalFile.close();
context.outputCounters(duplicateCount, writeCount);
if (duplicateCount > 0)
logger.debug("{} duplicate rows", Funnel.ByteFormatter.format(duplicateCount));
logger.debug("{} rows written", Funnel.ByteFormatter.format(writeCount));
}
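/**
* Drain the buffered bytes to the underlying DataOutput and reset the
* buffer position so subsequent writes start at the beginning of the
* buffer.
*/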
void flushWritesToDisk() throws IOException
{
writer.write(bb.array(), 0, bb.position());
bb.position(0);
}
/**
* Format one record and write it to the output, then reset the aggregation
* state for the next key. Only the write counter and the aggregate reset
* are handled at this level; subclasses are expected to extend this with
* the actual formatting.
*
* @param item the proxy record being published
* @param rawData the record's original bytes
* @throws Exception if formatting or writing fails
*/
void formatOutputAndWrite(final SourceProxyRecord item, final byte[] rawData)
throws Exception
{
writeCount++;
/*
* Prepare the aggregations for the next set of data.
*/
Aggregate.reset(context);
}
/** {@inheritDoc} */
@Override
public long getDuplicateCount()
{
return duplicateCount;
}
/** {@inheritDoc} */
@Override
public long getWriteCount()
{
return writeCount;
}
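/*
* Select the source of original record bytes (the in-memory cache when the
* input is cached or comes from SYSIN, otherwise the file itself), open the
* output, and zero the counters.
*/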
private void initialize() throws ParseException, IOException
{
if (context.isCacheInput() || context.isSysin())
originalFile = context.inputCache;
else
originalFile = new FileSource(context);
try
{
openOutput(context);
} catch (final IOException e)
{
App.abort(-1, e);
}
writeCount = duplicateCount = 0;
}
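/*
* Re-read this record's original bytes from the input source into the
* originalBytes buffer, using the location and size captured on the proxy
* record.
*/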
void loadOriginalBytes(final int originalFileNumber, final SourceProxyRecord item)
throws IOException
{
originalFile.read(originalFileNumber, originalBytes, item.originalLocation, item.originalSize);
}
/**
* Write a line separator. This is a no-op at the abstract level; subclasses
* that produce line-oriented output are expected to override it.
*
* @throws IOException if the separator cannot be written
*/
void newLine() throws IOException
{
// New lines mean nothing at the abstract level.
}
/** {@inheritDoc} */
@Override
public void openInput() throws ParseException
{
try
{
originalFile.open();
} catch (final IOException e)
{
App.abort(-1, e);
}
}
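/**
* Open the concrete output target and assign {@link #writer}. Each
* publisher subclass supplies its own implementation.
*
* @param _context the current funnel context
* @throws IOException if the output cannot be opened
*/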
abstract void openOutput(final FunnelContext _context) throws IOException;
/** {@inheritDoc} */
@Override
public boolean publish(final SourceProxyRecord item, final long phase) throws Exception
{
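/*
* Overall flow: fetch the record's original bytes, compare against the
* previous record to detect out-of-order input and duplicates, then either
* aggregate, drop, or write according to the context settings.
*/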
/*
* Capture the original file number before clearing it on the proxy; it
* must not be lost because it is needed to read the original data.
*/
final int originalFileNumber = item.originalInputFileIndex;
item.originalInputFileIndex = 0;
int comparison = 0;
loadOriginalBytes(originalFileNumber, item);
item.getFunnelContext().columnHelper
.loadColumnsFromBytes(originalBytes, item.originalSize, item.getOriginalRecordNumber());
if (previousItem != null)
{
/*
* check to see if this item is in order, return false if not. The
* originalRecordNumber is only used to order duplicates. At this
* point it should not be used for comparisons since we want to make
* sure we know a duplicate has been found.
*/
comparison = previousItem.compareTo(item, false);
if (comparison > 0)
return false;
if (comparison == 0)
{
/*
* A duplicate record has been found.
*/
if (context.isAggregating())
{
/*
* Rather than write anything during an aggregation run we
* just aggregate until the key changes.
*/
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
return true;
}
duplicateCount++;
if (DuplicateDisposition.FirstOnly == context.getDuplicateDisposition()
|| DuplicateDisposition.LastOnly == context.getDuplicateDisposition())
{
/*
* The sort places the duplicate we want to retain first, and a
* record only counts as a duplicate once that retained copy has
* been seen, so every subsequent duplicate can simply be ignored.
*/
return true;
}
} else if (context.isUserSpecifiedOrder() && context.isAggregating())
{
/*
* With user-specified ordering there is no orderBy key to change, so
* aggregates operate over the entire file.
*/
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
return true;
}
} else
{
publishHeader();
if (context.isAggregating())
{
/*
* Never write the first record when aggregating. Wait until the
* key changes.
*/
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
previousOriginalBytes = Arrays.copyOf(originalBytes, item.originalSize);
previousItem = item;
return true;
}
}
if (context.isAggregating())
{
/*
* We must reload the previous values into the columns since the new
* set of records has already started.
*/
item.getFunnelContext().columnHelper
.loadColumnsFromBytes(previousOriginalBytes, previousItem.originalSize, previousItem
.getOriginalRecordNumber());
formatOutputAndWrite(previousItem, previousOriginalBytes);
/*
* Now reload the newest record into the columns for processing.
*/
item.getFunnelContext().columnHelper
.loadColumnsFromBytes(originalBytes, item.originalSize, item.getOriginalRecordNumber());
Aggregate.aggregate(context, item.originalSize, item.getOriginalRecordNumber());
} else
formatOutputAndWrite(item, originalBytes);
/*
* Release the previous proxy record so its instance can be reused.
*/
if (previousItem != null)
previousItem.release();
previousItem = item;
previousOriginalBytes = Arrays.copyOf(originalBytes, item.originalSize);
return true;
}
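/*
* Emit the configured header once, before the first record is written.
*/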
void publishHeader() throws IOException
{
if (context.headerOutHelper.isWaitingToWrite())
{
context.headerOutHelper.format(context, this);
newLine();
}
}
/** {@inheritDoc} */
@Override
public void reset() throws IOException, ParseException
{
initialize();
if (previousItem != null)
{
previousItem.release();
previousItem = null;
}
}
/** {@inheritDoc} */
@Override
public void write(final byte[] sourceBytes, final int offset, final int length) throws IOException
{
if (length + bb.position() >= WriteBufferSize)
flushWritesToDisk();
if (length > WriteBufferSize)
{
/*
* The data is larger than the whole buffer; write it straight through
* rather than overflowing the ByteBuffer.
*/
writer.write(sourceBytes, offset, length);
return;
}
bb.put(sourceBytes, offset, length);
}
}