All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.codetaco.funnel.Funnel Maven / Gradle / Ivy

There is a newer version: 3.0.5
Show newest version
package com.codetaco.funnel;

import java.io.IOException;
import java.text.DecimalFormat;
import java.text.ParseException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codetaco.argument.type.WildFiles;
import com.codetaco.funnel.parameters.FunnelContext;
import com.codetaco.funnel.provider.FunnelInternalNodeProvider;
import com.codetaco.funnel.segment.SegmentedPublisherAndProvider;
import com.codetaco.funnel.segment.SourceProxyRecord;

/**
 * This class provides the controller of the funnel (sort / merge) processing.
 * The main loop (Funnel#sort ) is located in this class but most of the
 * processing is encapsulated in the classes related to the providers and
 * publishers.
 * 

* A {@link com.codetaco.funnel.FunnelDataProvider} provides rows in an unsorted * order. These rows are deposited into the top of the funnel. The funnel is * funnel#shake and the rows drip out the bottom, one at a time, in sorted * order. As they exit the bottom of the funnel the rows are handed off to a * {@link com.codetaco.funnel.FunnelDataPublisher} . The publisher is * responsible for writing the sorted rows to the output destination. *

* The funnel tip is index 0 in the array. The largest row has the largest * indexes decreasing from left to right. * *

 * 6   5   4   3
 *   2       1
 *       0
 * 
*

* The entire funnel is populated by instances of * {@link com.codetaco.funnel.FunnelItem}. These are static and hold the current * state of the specific node in the funnel. Data is assigned to these nodes * throughout the process. Every FunnelItem has a FunnelDataProvider associated * with it. The top level of the funnel is assign a provider that typically gets * its rows from an original data source if in the first pass or a segment on * every other pass. *

* The maximum size of the funnel top is specified in a power of 2. For * instance, 16 would yield a top level of 65536 rows. The input provider will * lay down data rows across this top level and then pause until all of them * exit the bottom. This "phase" produces a single segment during the first * pass. If there are more than 65k rows then the input provider will lay down * another top row for sorting. This is done in phases like this so that it is * guaranteed to produce a full top row of sorted items in each phase. (A phase * is known to be a population of the top rows and the resulting segment that is * created. A pass is known to be an iteration over the entire data source that * may produce multiple segments.) *

* Funnel is also a tag sort. Tags are ripped off of the row when the input is * provided. The sort is then on this * {@link com.codetaco.funnel.segment.SourceProxyRecord} in order to reduce * memory requirements and disk work file size (space and IO time). * * * @author Chris DeGreef [email protected] */ public class Funnel { /** Constant SYS_RECORDSIZE="recordsize" */ public static final String SYS_RECORDSIZE = "recordsize"; /** Constant SYS_RECORDNUMBER="recordnumber" */ public static final String SYS_RECORDNUMBER = "recordnumber"; static Logger logger = LoggerFactory.getLogger(Funnel.class.getName()); /** Constant ByteFormatter */ static final public DecimalFormat ByteFormatter = new DecimalFormat("###,###,###,###"); /** * The maximum depth is the highest (number of levels) the funnel can be. * This is somewhat arbitrary and can be increased if there is sufficient * memory. */ static final public int MAXIMUM_DEPTH = 16; /** * the depth that will be used if none is provided on the constructor. */ static final public int DEFAULT_DEPTH = 6; /** * Sort an input stream into an output stream according to the command line * arguments. A {@link com.codetaco.funnel.parameters.FunnelContext} handles * the parsing of this command line and encapsulates all of the sorting * requirements. A {@link com.codetaco.funnel.orderby.KeyHelper} is created * for handling the creation of keys from each row that is provided. * Finally, a set of {@link com.codetaco.funnel.FunnelDataProvider} / * {@link com.codetaco.funnel.FunnelDataPublisher} instances are created and * the Funnel#sort process is started. * * @param args a {@link java.lang.String} object. * @throws Exception if any. * @param cfg a {@link com.codetaco.funnel.AppContext} object. * @return a {@link com.codetaco.funnel.parameters.FunnelContext} object. * @throws java.lang.Throwable if any. */ static public FunnelContext sort(final AppContext cfg, final String... args) throws Throwable { FunnelContext context = null; try { context = new FunnelContext(cfg, args); if (context.isUsageRun()) { logger.info("usage only"); return context; } if (context.isVersion()) { logger.info("version check only"); return context; } if (context.isSyntaxOnly()) { logger.info("syntax check complete, processing stopped"); System.out.println("Syntax only run - OK"); return context; } final Funnel funnel = new Funnel(context); if (context.isMultisourceInput() && context.isInPlaceSort()) { /* * The replace option with multiple input files causes each file * to be sorted on its own. */ final WildFiles inputFiles = context.getInputFiles(); for (int fIdx = 0; fIdx < inputFiles.files().size(); fIdx++) { context.setInputFiles(new WildFiles(inputFiles.files().get(fIdx))); if (fIdx != 0) /* * All subsequent processing after the first must issue * a reset after the file has been swapped out to the * next one. */ funnel.reset(); funnel.process(); } } else /* * All other combinations of options can be handled with a * single call to the process. */ funnel.process(); logger.info("Counters input({}) selected({}) duplicates({}) output({})", context .getRecordCount(), context.getRecordCount() - context.getUnselectedCount(), context .getDuplicateCount(), context.getWriteCount()); logger.debug("{} funnel nodes, {} rows per phase", funnel.items.length, funnel.maxSorted); logger.debug("{} source proxies cached in core", SourceProxyRecord.AvailableInstances.size()); logger.debug("{} available processors", Runtime.getRuntime().availableProcessors()); logger.debug("memory used({}) free({}) total({}) max({})", ByteFormatter .format(Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) .trim(), ByteFormatter.format(Runtime.getRuntime().freeMemory()).trim(), ByteFormatter .format(Runtime.getRuntime().totalMemory()) .trim(), ByteFormatter.format(Runtime.getRuntime().maxMemory()).trim()); } catch (final Exception e) { if (logger.isDebugEnabled()) logger.error(e.getMessage(), e); else logger.error(e.getMessage()); throw e; } finally { logger.info("================= END ===================="); } return context; } final FunnelContext context; /** * The entry row (where data enters the funnel) is the top row. The indexing * of the funnel is in a 1 dimensional array where the 0th element is the * exit of the funnel. entryRowStart is the index of the first FunnelItem in * the top row. */ final int entryRowStart; /** * The entry row (where data enters the funnel) is the top row. The indexing * of the funnel is in a 1 dimensional array where the 0th element is the * exit of the funnel. entryRowEnd is the index of the last FunnelItem in * the top row. */ final int entryRowEnd; /** * Items is the funnel. Index 0 is the exit point. */ private final FunnelItem[] items; /** * This is the size of the top row. It is the maximum number of rows that * can be sorted into a segment during the 1st pass of the data. */ final int maxSorted; /** * Create the memory layout for the sort. Compute the important indexes once * and store them so that time is not wasted in powerful math functions * while the sort is in progress. * * @param _context a {@link com.codetaco.funnel.parameters.FunnelContext} * object. */ public Funnel(final FunnelContext _context) { assert _context.getDepth() <= MAXIMUM_DEPTH : "depth can not exceed " + MAXIMUM_DEPTH; assert _context.getDepth() > 0 : "depth must be > 0"; context = _context; items = new FunnelItem[(1 << _context.getDepth()) - 1]; entryRowStart = (1 << _context.getDepth()) - 2; entryRowEnd = (1 << (_context.getDepth() - 1)) - 1; maxSorted = 1 << (_context.getDepth() - 1); } /** * Computes the left of two nodes that provide input for the provided node's * index. * * @param winnersCircle * @return */ FunnelItem contestantOne(final int winnersCircle) { final int c = winnersCircle * 2 + 2; if (c > entryRowStart) return null; return getItems()[c]; } /** * Computes the right of two nodes that provide input for the provided * node's index. * * @param winnersCircle * @return */ FunnelItem contestantTwo( final int winnersCircle) { final int c = winnersCircle * 2 + 1; if (c > entryRowStart) return null; return getItems()[c]; } /** *

* entryRowEnd. *

* * @return a int. */ public int entryRowEnd() { return entryRowEnd; } /** *

* entryRowStart. *

* * @return a int. */ public int entryRowStart() { return entryRowStart; } FunnelItem[] getItems() { return items; } /** * Set each node in the funnel to initial values. This is done between * passes. * * @param phase */ void initializePhase( final long phase) { for (final FunnelItem item : getItems()) { item.setEndOfData(false); item.setData(null); /* * This should cause any provider to give up a row. */ item.setPhase(-1); } } /** * The maximum is the size of the entry row in the funnel. And this just * applies to the first pass. * * @return a int. */ public int maximumGuaranteedNumberOfSortableItems() { return maxSorted; } /** * The funnel is made up of only instances of {@link FunnelItem}. The * parameter {@link FunnelDataProvider} is attached to each of the top row * (entry row) nodes. When the node needs a data row it will ask the * provider for one. The funnel is populated with new instances of * FunnelItem on the first time only. Every other pass only resets the * FunnelItem. *

* Other than the entry row, all other nodes in the funnel are attached with * a {@link FunnelInternalNodeProvider}. This queries the two nodes above it * to determine which one is next in sorted order. This provider is the * essence of the sorting process. * * @param provider */ void populateFunnel( final FunnelDataProvider provider) { /* * Apply data provider to the top of the funnel. */ for (int tr = entryRowStart(); tr >= entryRowEnd(); tr--) { if (getItems()[tr] != null) getItems()[tr].reset(); else getItems()[tr] = new FunnelItem(); provider.attachTo(getItems()[tr]); } /* * Apply the specific node provider for all other funnel nodes. They are * the same in sorting and in merging. So only create them once. */ for (int tr = entryRowEnd() - 1; tr >= 0; tr--) if (getItems()[tr] != null) getItems()[tr].reset(); else { getItems()[tr] = new FunnelItem(); new FunnelInternalNodeProvider(this, contestantOne(tr), contestantTwo(tr)).attachTo(getItems()[tr]); } } /** * The top row is primed when the funnel is empty. These are the next data * rows that will be sorted. The top row is pre-filled and then allowed to * be completely sorted before any more data is allowed in the funnel. At * which time the top row will be primed again. This keeps a predictable * size to each segment as it is created. * * @throws ParseException */ void primeTopRow(final long phase) throws IOException, ParseException { for (int tr = entryRowStart(); tr >= entryRowEnd(); tr--) getItems()[tr].next(phase); } /** * This is the main loop of Funnel sorting. It creates a funnel. Attaches * the provider to the top row and a publisher to the exit point. It * iterates over the data one pass at a time. The first pass pulls in all of * the data from the original source. The last pass moves all of the sorted * data to the specified output by using the publisher. If the sort is more * than a one pass sort (the number of rows exceeds the size of the entry * level of the funnel) a segmenting provider / publisher keeps work files * for the interim passes. * * @throws Exception */ void process() throws Exception { assert context.provider != null : "provider must not be null"; assert context.publisher != null : "publisher must not be null"; int passCount = 0; SegmentedPublisherAndProvider segmentationHandler = null; FunnelDataProvider passProvider; FunnelDataPublisher passPublisher = null; long passOneRowCount = 0; long passStartMS = 0; long passInitializedMS = 0; long passEndMS = 0; long passStartNano = 0; long passEndNano = 0; while (passPublisher != context.publisher) { passStartMS = System.currentTimeMillis(); passStartNano = System.nanoTime(); passCount++; if (segmentationHandler != null) { /* * every time except the first pass */ segmentationHandler.actAsProvider(); segmentationHandler.openInput(); passProvider = segmentationHandler; } else /* * the first pass only */ passProvider = context.provider; if (passProvider.maximumNumberOfRows() > maximumGuaranteedNumberOfSortableItems()) { /* * Every time except the last pass */ segmentationHandler = new SegmentedPublisherAndProvider(context); // segmentationHandler.open(); passPublisher = segmentationHandler; } else /* * Last pass only */ passPublisher = context.publisher; populateFunnel(passProvider); passPublisher.openInput(); passInitializedMS = System.currentTimeMillis(); long phase = 1; FunnelItem item; while (true) { item = shake(phase); if (item == null) { phase++; initializePhase(phase); primeTopRow(phase); item = shake(phase); if (item == null) break; } final boolean inorder = passPublisher.publish(item.getData(), phase); if (!inorder && passPublisher == context.publisher) throw new Exception("Sort failure. Check provider max rows (" + context.provider.maximumNumberOfRows() + ") and power (" + context.getDepth() + ")."); } passProvider.close(); passPublisher.close(); passEndMS = System.currentTimeMillis(); passEndNano = System.nanoTime(); if (passOneRowCount == 0) passOneRowCount = passProvider.actualNumberOfRows(); logger.debug("pass({}) init({}ms) io({}ms) {}({}) phases({})", passCount, passInitializedMS - passStartMS, passEndMS - passInitializedMS, (passCount == 1 ? "rows" : "segments"), passProvider.actualNumberOfRows(), phase - 1); } if (passOneRowCount > 0) { final long perRowMS = (passEndMS - passStartMS) / passOneRowCount; final long perRowNano = (passEndNano - passStartNano) / passOneRowCount; if (perRowMS > 0) logger.debug("perRow({}ms) rowsPerSecond({})", perRowMS, 1000L / perRowNano); else logger.debug("perRow({} nano) rowsPerSecond({})", perRowNano, 1000000000L / perRowNano); if (context.comparisonCounter > 0) logger.debug("Average comparison count per input row: {}. Total comparisons = {}", context.comparisonCounter / passOneRowCount, context.comparisonCounter); } } private void reset() throws IOException, ParseException { for (final FunnelItem item : getItems()) item.reset(); /* * If reset is being called it is because of multiple input files with * the --replace option. As of this time that is the only reason so * there is no checking done here. */ context.setOutputFile(context.getInputFile(context.inputFileIndex())); context.reset(); } /** * Shake the funnel so another data row drops down into the exit point of * the funnel. Obviously, {@link FunnelItem#next()} is a recursive call. It * starts at the exit point and works its way up to the top level of the * funnel. * * @return the FunnelItem at the exit point of the funnel. Or null if the * funnel is currently empty. * @throws ParseException */ FunnelItem shake(final long phase) throws IOException, ParseException { if (!getItems()[0].next(phase)) return null; return getItems()[0]; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy