/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.compress;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.random.Well1024a;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.lops.MMTSJ.MMTSJType;
import org.apache.sysml.lops.MapMultChain.ChainType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.compress.ColGroup.CompressionType;
import org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator;
import org.apache.sysml.runtime.compress.estim.CompressedSizeInfo;
import org.apache.sysml.runtime.compress.estim.SizeEstimatorFactory;
import org.apache.sysml.runtime.compress.utils.ConverterUtils;
import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
import org.apache.sysml.runtime.controlprogram.caching.CacheBlock;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType;
import org.apache.sysml.runtime.controlprogram.parfor.stat.Timing;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.Builtin.BuiltinCode;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.ReduceAll;
import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysml.runtime.instructions.cp.ScalarObject;
import org.apache.sysml.runtime.matrix.data.CTableMap;
import org.apache.sysml.runtime.matrix.data.LibMatrixBincell;
import org.apache.sysml.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.MatrixValue;
import org.apache.sysml.runtime.matrix.data.RandomMatrixGenerator;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue;
import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.COVOperator;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.runtime.matrix.operators.QuaternaryOperator;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
import org.apache.sysml.runtime.matrix.operators.UnaryOperator;
import org.apache.sysml.runtime.util.IndexRange;
/**
* Experimental version of MatrixBlock that allows a compressed internal
* representation.
*/
public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
{
private static final long serialVersionUID = 7319972089143154057L;
//internal configuration
public static final boolean TRANSPOSE_INPUT = true;
public static final boolean MATERIALIZE_ZEROS = false;
public static final long MIN_PAR_AGG_THRESHOLD = 16*1024*1024; //16MB
public static final boolean INVESTIGATE_ESTIMATES = false;
private static final boolean LDEBUG = false; //local debug flag
private static final Log LOG = LogFactory.getLog(CompressedMatrixBlock.class.getName());
static {
	// for internal debugging only: raise the package log level when the
	// compile-time LDEBUG flag is enabled
	if( LDEBUG ) {
		Logger.getLogger("org.apache.sysml.runtime.compress")
			.setLevel((Level) Level.DEBUG);
	}
}

// column groups backing the compressed representation;
// null until compress() has been called successfully
protected ArrayList<ColGroup> _colGroups = null;

// statistics collected during the last compress() call; null before compression
protected CompressionStatistics _stats = null;
/**
 * Default constructor; creates an empty (-1 x -1) sparse block.
 * Required by the {@link Externalizable} contract for deserialization.
 */
public CompressedMatrixBlock() {
super(-1, -1, true);
}
/**
 * Main constructor for building a block from scratch. The block starts out
 * uncompressed; call {@link #compress()} to build the compressed form.
 *
 * @param rl
 *            number of rows in the block
 * @param cl
 *            number of columns
 * @param sparse
 *            true if the UNCOMPRESSED representation of the block should be
 *            sparse
 */
public CompressedMatrixBlock(int rl, int cl, boolean sparse) {
super(rl, cl, sparse);
}
/**
 * "Copy" constructor to populate this compressed block with the
 * uncompressed contents of a conventional block. Does not compress
 * the block. Note that this is a SHALLOW copy: the sparse/dense block
 * of the input is shared, not duplicated (a deep copy happens later
 * during compression, which avoids an unnecessary intermediate copy).
 *
 * @param mb matrix block
 */
public CompressedMatrixBlock(MatrixBlock mb) {
super(mb.getNumRows(), mb.getNumColumns(), mb.isInSparseFormat());
//shallow copy (deep copy on compression, prevents unnecessary copy)
if( isInSparseFormat() )
sparseBlock = mb.getSparseBlock();
else
denseBlock = mb.getDenseBlock();
nonZeros = mb.getNonZeros();
}
/**
 * Obtain the column groups.
 *
 * @return the column groups constructed by the compression process,
 *         or null if this block has not been compressed yet
 */
public ArrayList<ColGroup> getColGroups() {
	return _colGroups;
}
/**
 * Obtain whether this block is in compressed form or not.
 *
 * @return true if this block is in compressed form; false if the block has
 *         not yet been compressed
 */
public boolean isCompressed() {
	return _colGroups != null;
}
/**
 * Indicates whether the compressed representation consists of exactly one
 * column group that is itself stored uncompressed.
 *
 * @return true iff there is a single, uncompressed column group
 */
public boolean isSingleUncompressedGroup(){
	if( _colGroups == null || _colGroups.size() != 1 )
		return false;
	return _colGroups.get(0) instanceof ColGroupUncompressed;
}
/**
 * Initializes the (empty) list of column groups; called at the start of
 * the compression phase that populates {@code _colGroups}.
 */
private void allocateColGroupList() {
	_colGroups = new ArrayList<ColGroup>();
}
/**
 * Emptiness check that is aware of the compressed representation: for a
 * compressed block, emptiness is decided via the non-zero count; otherwise
 * the check is delegated to the uncompressed superclass logic.
 */
@Override
public boolean isEmptyBlock(boolean safe) {
	if( isCompressed() )
		return (_colGroups == null || getNonZeros()==0);
	return super.isEmptyBlock(safe);
}
/**
 * Compress the contents of this matrix block. After compression, the
 * uncompressed data is discarded. Attempts to update this block after
 * calling this method currently result in INCORRECT BEHAVIOR, something
 * which should be fixed if we move ahead with this compression strategy.
 *
 * Delegates to {@link #compress(int)} with a single thread.
 *
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public void compress()
throws DMLRuntimeException
{
//default sequential execution
compress(1);
}
/**
* Compress block.
*
* @param k number of threads
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public void compress(int k)
throws DMLRuntimeException
{
//check for redundant compression
if( isCompressed() ){
throw new DMLRuntimeException("Redundant compression, block already compressed.");
}
Timing time = new Timing(true);
_stats = new CompressionStatistics();
// SAMPLE-BASED DECISIONS:
// Decisions such as testing if a column is amenable to bitmap
// compression or evaluating co-coding potentionls are made based on a
// subset of the rows. For large datasets, sampling might take a
// significant amount of time. So, we generate only one sample and use
// it for the entire compression process.
//prepare basic meta data and deep copy / transpose input
final int numRows = getNumRows();
final int numCols = getNumColumns();
final boolean sparse = isInSparseFormat();
final double sp = OptimizerUtils.getSparsity(numRows, numCols, getNonZeros());
MatrixBlock rawblock = !TRANSPOSE_INPUT ? new MatrixBlock(this) :
LibMatrixReorg.transpose(this, new MatrixBlock(numCols, numRows, sparse), k);
//construct sample-based size estimator
CompressedSizeEstimator bitmapSizeEstimator =
SizeEstimatorFactory.getSizeEstimator(rawblock, numRows);
// The current implementation of this method is written for correctness,
// not for performance or for minimal use of temporary space.
// We start with a full set of columns.
HashSet remainingCols = new HashSet();
for (int i = 0; i < numCols; i++)
remainingCols.add(i);
// PHASE 1: Classify columns by compression type
// We start by determining which columns are amenable to bitmap compression
double uncompressedColumnSize = getUncompressedSize(numRows, 1, sp);
// information about the bitmap amenable columns
List bitmapCols = new ArrayList();
List uncompressedCols = new ArrayList();
List colsCards = new ArrayList();
List compressedSizes = new ArrayList();
HashMap compressionRatios = new HashMap();
// Classify columns according to ration (size uncompressed / size compressed),
// where a column is compressible if ratio > 1.
CompressedSizeInfo[] sizeInfos = (k > 1) ?
computeCompressedSizeInfos(bitmapSizeEstimator, numCols, k) :
computeCompressedSizeInfos(bitmapSizeEstimator, numCols);
for (int col = 0; col < numCols; col++) {
long compressedSize = sizeInfos[col].getMinSize();
double compRatio = uncompressedColumnSize / compressedSize;
if (compRatio > 1) {
bitmapCols.add(col);
compressionRatios.put(col, compRatio);
colsCards.add(sizeInfos[col].getEstCarinality());
compressedSizes.add(compressedSize);
}
else
uncompressedCols.add(col);
}
_stats.timePhase1 = time.stop();
if( LOG.isDebugEnabled() ) {
LOG.debug("Compression statistics:");
LOG.debug("--compression phase 1: "+_stats.timePhase1);
}
// PHASE 2: Grouping columns
// Divide the bitmap columns into column groups.
List bitmapColGrps = PlanningCoCoder.findCocodesByPartitioning(
bitmapSizeEstimator, bitmapCols, colsCards, compressedSizes, numRows,
isInSparseFormat() ? sp : 1, k);
_stats.timePhase2 = time.stop();
if( LOG.isDebugEnabled() )
LOG.debug("--compression phase 2: "+_stats.timePhase2);
if( INVESTIGATE_ESTIMATES ) {
double est = 0;
for( int[] groupIndices : bitmapColGrps )
est += bitmapSizeEstimator.estimateCompressedColGroupSize(groupIndices).getMinSize();
est += uncompressedCols.size() * uncompressedColumnSize;
_stats.estSize = est;
}
// PHASE 3: Compress and correct sample-based decisions
ColGroup[] colGroups = (k > 1) ?
compressColGroups(rawblock, bitmapSizeEstimator, compressionRatios, numRows, sp, bitmapColGrps, k) :
compressColGroups(rawblock, bitmapSizeEstimator, compressionRatios, numRows, sp, bitmapColGrps);
allocateColGroupList();
for( int j=0; j list = new ArrayList(remainingCols);
ColGroupUncompressed ucgroup = new ColGroupUncompressed(list, rawblock);
_colGroups.add(ucgroup);
}
_stats.size = estimateCompressedSizeInMemory();
_stats.ratio= estimateSizeInMemory() / _stats.size;
//final cleanup (discard uncompressed block)
rawblock.cleanupBlock(true, true);
this.cleanupBlock(true, true);
_stats.timePhase4 = time.stop();
if( LOG.isDebugEnabled() ) {
LOG.debug("--compression phase 4: "+_stats.timePhase4);
LOG.debug("--num col groups: "+_colGroups.size());
LOG.debug("--compressed size: "+_stats.size);
LOG.debug("--compression ratio: "+_stats.ratio);
}
}
public CompressionStatistics getCompressionStatistics() {
return _stats;
}
private static CompressedSizeInfo[] computeCompressedSizeInfos(CompressedSizeEstimator estim, int clen) {
CompressedSizeInfo[] ret = new CompressedSizeInfo[clen];
for( int col=0; col tasks = new ArrayList();
for( int col=0; col> rtask = pool.invokeAll(tasks);
ArrayList ret = new ArrayList();
for( Future lrtask : rtask )
ret.add(lrtask.get());
pool.shutdown();
return ret.toArray(new CompressedSizeInfo[0]);
}
catch(Exception ex) {
throw new DMLRuntimeException(ex);
}
}
private static ColGroup[] compressColGroups(MatrixBlock in, CompressedSizeEstimator estim, HashMap compRatios, int rlen, double sp, List groups)
{
ColGroup[] ret = new ColGroup[groups.size()];
for( int i=0; i compRatios, int rlen, double sp, List groups, int k)
throws DMLRuntimeException
{
try {
ExecutorService pool = Executors.newFixedThreadPool( k );
ArrayList tasks = new ArrayList();
for( int[] colIndexes : groups )
tasks.add(new CompressTask(in, estim, compRatios, rlen, sp, colIndexes));
List> rtask = pool.invokeAll(tasks);
ArrayList
ret = new ArrayList
();
for( Future
lrtask : rtask )
ret.add(lrtask.get());
pool.shutdown();
return ret.toArray(new ColGroup[0]);
}
catch(Exception ex) {
throw new DMLRuntimeException(ex);
}
}
/**
 * Compresses one column group based on its exact (full-data) bitmap. If the
 * observed compression ratio is not beneficial, the least-compressible column
 * (by the phase-1 sample ratio) is dropped and the group is re-evaluated,
 * until the group pays off or no columns remain.
 *
 * @param in transposed input block (column-wise access)
 * @param estim sample-based size estimator
 * @param compRatios per-column compression ratios from phase 1
 * @param rlen number of rows of the original (untransposed) matrix
 * @param sp sparsity of the input
 * @param colIndexes columns of this group
 * @return the compressed column group (RLE or OLE, whichever is smaller),
 *         or null if the group should fall back to uncompressed storage
 */
private static ColGroup compressColGroup(MatrixBlock in, CompressedSizeEstimator estim, HashMap<Integer, Double> compRatios, int rlen, double sp, int[] colIndexes)
{
	int[] allGroupIndices = null;
	int allColsCount = colIndexes.length;
	CompressedSizeInfo sizeInfo;
	// The compression type is decided based on a full bitmap since it
	// will be reused for the actual compression step.
	UncompressedBitmap ubm = null;
	PriorityQueue<CompressedColumn> compRatioPQ = null;
	boolean skipGroup = false;
	while (true)
	{
		//exact big list and observe compression ratio
		ubm = BitmapEncoder.extractBitmap(colIndexes, in);
		sizeInfo = estim.estimateCompressedColGroupSize(ubm);
		double compRatio = getUncompressedSize(rlen, colIndexes.length, sp) / sizeInfo.getMinSize();
		if( compRatio > 1 ) {
			break; // we have a good group
		}

		// modify the group
		if (compRatioPQ == null) {
			// first modification: remember original indices and order columns
			// by their sampled compression ratio (least compressible first)
			allGroupIndices = colIndexes.clone();
			compRatioPQ = new PriorityQueue<CompressedColumn>();
			for (int i = 0; i < colIndexes.length; i++)
				compRatioPQ.add(new CompressedColumn(i, compRatios.get(colIndexes[i])));
		}

		// index in allGroupIndices; -1 marks a removed column
		int removeIx = compRatioPQ.poll().colIx;
		allGroupIndices[removeIx] = -1;
		allColsCount--;
		if (allColsCount == 0) {
			skipGroup = true;
			break;
		}
		colIndexes = new int[allColsCount];
		// copying the values that do not equal -1
		int ix = 0;
		for(int col : allGroupIndices)
			if (col != -1)
				colIndexes[ix++] = col;
	}

	//add group to uncompressed fallback
	if( skipGroup )
		return null;

	//create compressed column group (pick the smaller encoding)
	long rleSize = sizeInfo.getRLESize();
	long oleSize = sizeInfo.getOLESize();
	if( rleSize < oleSize )
		return new ColGroupRLE(colIndexes, rlen, ubm);
	else
		return new ColGroupOLE(colIndexes, rlen, ubm);
}
/**
 * Compute a conservative estimate of the uncompressed size of a column group.
 *
 * @param rlen row length
 * @param clen column length
 * @param sparsity the sparsity
 * @return estimate of uncompressed size of column group
 */
private static double getUncompressedSize(int rlen, int clen, double sparsity) {
	//we estimate the uncompressed size as 8 * nnz in order to cover both
	//sparse and dense with moderate underestimation (which is conservative as
	//it is biased towards uncompressed columns)
	//note: 8.0 forces floating-point evaluation; the previous int expression
	//8*rlen*clen overflowed for large matrices (rlen*clen > 2^28)
	return 8.0 * rlen * clen * sparsity;
}
/**
* Decompress block.
*
* @return a new uncompressed matrix block containing the contents of this
* block
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public MatrixBlock decompress() throws DMLRuntimeException
{
//early abort for not yet compressed blocks
if( !isCompressed() )
return new MatrixBlock(this);
Timing time = new Timing(true);
//preallocation sparse rows to avoid repeated reallocations
MatrixBlock ret = new MatrixBlock(getNumRows(), getNumColumns(), isInSparseFormat(), getNonZeros());
if( ret.isInSparseFormat() ) {
int[] rnnz = new int[rlen];
for (ColGroup grp : _colGroups)
grp.countNonZerosPerRow(rnnz, 0, rlen);
ret.allocateSparseRowsBlock();
SparseBlock rows = ret.getSparseBlock();
for( int i=0; i tasks = new ArrayList();
for( int i=0; i> rtasks = pool.invokeAll(tasks);
pool.shutdown();
for( Future