Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
hex.tree.xgboost.matrix.SparseMatrixFactory Maven / Gradle / Ivy
package hex.tree.xgboost.matrix;
import hex.DataInfo;
import ai.h2o.xgboost4j.java.DMatrix;
import ai.h2o.xgboost4j.java.XGBoostError;
import water.H2O;
import water.LocalMR;
import water.MrFun;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;
import java.util.Arrays;
import java.util.Objects;
import static hex.tree.xgboost.matrix.MatrixFactoryUtils.setResponseAndWeightAndOffset;
import static hex.tree.xgboost.matrix.MatrixFactoryUtils.setResponseWeightAndOffset;
import static water.MemoryManager.*;
/*
- truly sparse matrix - no categoricals
- collect all nonzeros column by column (in parallel), then stitch together into final data structures
*/
public class SparseMatrixFactory {
public static MatrixLoader.DMatrixProvider csr(
Frame frame, int[] chunksIds, Vec weightsVec, Vec offsetsVec, Vec responseVec, // for setupLocal
DataInfo di, float[] resp, float[] weights, float[] offsets
) {
SparseMatrixDimensions sparseMatrixDimensions = calculateCSRMatrixDimensions(frame, chunksIds, weightsVec, di);
SparseMatrix sparseMatrix = allocateCSRMatrix(sparseMatrixDimensions);
int actualRows = initializeFromChunkIds(
frame, chunksIds, weightsVec, offsetsVec,
di, sparseMatrix, sparseMatrixDimensions,
responseVec, resp, weights, offsets);
return toDMatrix(sparseMatrix, sparseMatrixDimensions, actualRows, di.fullN(), resp, weights, offsets);
}
public static DMatrix csr(
Chunk[] chunks, int weight, int respIdx, int offsetIdx, // for MR task
DataInfo di, float[] resp, float[] weights, float[] offsets
) throws XGBoostError {
SparseMatrixDimensions sparseMatrixDimensions = calculateCSRMatrixDimensions(chunks, di, weight);
SparseMatrix sparseMatrix = allocateCSRMatrix(sparseMatrixDimensions);
int actualRows = initializeFromChunks(
chunks, weight,
di, sparseMatrix._rowHeaders, sparseMatrix._sparseData, sparseMatrix._colIndices,
respIdx, resp, weights, offsetIdx, offsets);
return toDMatrix(sparseMatrix, sparseMatrixDimensions, actualRows, di.fullN(), resp, weights, offsets).get();
}
public static class SparseDMatrixProvider extends MatrixLoader.DMatrixProvider {
private long[][] rowHeaders;
private int[][] colIndices;
private float[][] sparseData;
private DMatrix.SparseType csr;
private int shape;
private long nonZeroElementsCount;
public SparseDMatrixProvider(
long[][] rowHeaders,
int[][] colIndices,
float[][] sparseData,
DMatrix.SparseType csr,
int shape,
long nonZeroElementsCount,
int actualRows,
float[] response,
float[] weights,
float[] offsets
) {
super(actualRows, response, weights, offsets);
this.rowHeaders = rowHeaders;
this.colIndices = colIndices;
this.sparseData = sparseData;
this.csr = csr;
this.shape = shape;
this.nonZeroElementsCount = nonZeroElementsCount;
}
@Override
public DMatrix makeDMatrix() throws XGBoostError {
return new DMatrix(rowHeaders, colIndices, sparseData, csr, shape, (int) actualRows + 1, nonZeroElementsCount);
}
@Override
public void print(int nrow) {
NestedArrayPointer r = new NestedArrayPointer();
NestedArrayPointer d = new NestedArrayPointer();
long elemIndex = 0;
r.increment();
for (int i = 0; i < (nrow > 0 ? nrow : actualRows); i++) {
System.out.print(i + ":\t");
long rowEnd = r.get(rowHeaders);
r.increment();
for (; elemIndex < rowEnd; elemIndex++) {
System.out.print(d.get(colIndices) + ":" + d.get(sparseData) + "\t");
d.increment();
}
System.out.print(response[i]);
System.out.println();
}
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (!super.equals(o)) return false;
SparseDMatrixProvider that = (SparseDMatrixProvider) o;
return shape == that.shape &&
nonZeroElementsCount == that.nonZeroElementsCount &&
Arrays.deepEquals(rowHeaders, that.rowHeaders) &&
Arrays.deepEquals(colIndices, that.colIndices) &&
Arrays.deepEquals(sparseData, that.sparseData) &&
csr == that.csr;
}
@Override
public int hashCode() {
int result = Objects.hash(super.hashCode(), csr, shape, nonZeroElementsCount);
result = 31 * result + Arrays.hashCode(rowHeaders);
result = 31 * result + Arrays.hashCode(colIndices);
result = 31 * result + Arrays.hashCode(sparseData);
return result;
}
}
public static SparseDMatrixProvider toDMatrix(
SparseMatrix sm, SparseMatrixDimensions smd, int actualRows, int shape, float[] resp, float[] weights, float[] offsets) {
return new SparseDMatrixProvider(
sm._rowHeaders, sm._colIndices, sm._sparseData, DMatrix.SparseType.CSR, shape, smd._nonZeroElementsCount,
actualRows, resp, weights, offsets
);
}
static class NestedArrayPointer {
int _row, _col;
public NestedArrayPointer() {
}
public NestedArrayPointer(long pos) {
this._row = (int) (pos / SparseMatrix.MAX_DIM);
this._col = (int) (pos % SparseMatrix.MAX_DIM);
}
void increment() {
_col++;
if(_col == SparseMatrix.MAX_DIM){
_col = 0;
_row++;
}
}
void set(long[][] dest, long val) {
dest[_row][_col] = val;
}
void set(float[][] dest, float val) {
dest[_row][_col] = val;
}
void set(int[][] dest, int val) {
dest[_row][_col] = val;
}
void setAndIncrement(long[][] dest, long val) {
set(dest, val);
increment();
}
public long get(long[][] dest) {
return dest[_row][_col];
}
public int get(int[][] dest) {
return dest[_row][_col];
}
public float get(float[][] dest) {
return dest[_row][_col];
}
}
public static int initializeFromChunkIds(
Frame frame, int[] chunks, Vec weightsVec, Vec offsetsVec, DataInfo di,
SparseMatrix matrix, SparseMatrixDimensions dimensions,
Vec respVec, float[] resp, float[] weights, float[] offsets
) {
InitializeCSRMatrixFromChunkIdsMrFun fun = new InitializeCSRMatrixFromChunkIdsMrFun(
frame, chunks, weightsVec, offsetsVec, di, matrix, dimensions, respVec, resp, weights, offsets
);
H2O.submitTask(new LocalMR(fun, chunks.length)).join();
return ArrayUtils.sum(fun._actualRows);
}
private static class InitializeCSRMatrixFromChunkIdsMrFun extends MrFun {
Frame _frame;
int[] _chunks;
Vec _weightVec;
Vec _offsetsVec;
DataInfo _di;
SparseMatrix _matrix;
SparseMatrixDimensions _dims;
Vec _respVec;
float[] _resp;
float[] _weights;
float[] _offsets;
// OUT
int[] _actualRows;
InitializeCSRMatrixFromChunkIdsMrFun(
Frame frame, int[] chunks, Vec weightVec, Vec offsetVec, DataInfo di,
SparseMatrix matrix, SparseMatrixDimensions dimensions,
Vec respVec, float[] resp, float[] weights, float[] offsets
) {
_actualRows = new int[chunks.length];
_frame = frame;
_chunks = chunks;
_weightVec = weightVec;
_offsetsVec = offsetVec;
_di = di;
_matrix = matrix;
_dims = dimensions;
_respVec = respVec;
_resp = resp;
_weights = weights;
_offsets = offsets;
}
@Override
protected void map(int chunkIdx) {
int chunk = _chunks[chunkIdx];
long nonZeroCount = _dims._precedingNonZeroElementsCounts[chunkIdx];
int rwRow = _dims._precedingRowCounts[chunkIdx];
NestedArrayPointer rowHeaderPointer = new NestedArrayPointer(rwRow);
NestedArrayPointer dataPointer = new NestedArrayPointer(nonZeroCount);
Chunk weightChunk = _weightVec != null ? _weightVec.chunkForChunkIdx(chunk) : null;
Chunk offsetChunk = _offsetsVec != null ? _offsetsVec.chunkForChunkIdx(chunk) : null;
Chunk respChunk = _respVec.chunkForChunkIdx(chunk);
Chunk[] featChunks = new Chunk[_frame.vecs().length];
for (int i = 0; i < featChunks.length; i++) {
featChunks[i] = _frame.vecs()[i].chunkForChunkIdx(chunk);
}
for(int i = 0; i < respChunk._len; i++) {
if (weightChunk != null && weightChunk.atd(i) == 0) continue;
rowHeaderPointer.setAndIncrement(_matrix._rowHeaders, nonZeroCount);
_actualRows[chunkIdx]++;
for (int j = 0; j < _di._cats; j++) {
dataPointer.set(_matrix._sparseData, 1);
if (featChunks[j].isNA(i)) {
dataPointer.set(_matrix._colIndices, _di.getCategoricalId(j, Float.NaN));
} else {
dataPointer.set(_matrix._colIndices, _di.getCategoricalId(j, featChunks[j].at8(i)));
}
dataPointer.increment();
nonZeroCount++;
}
for (int j = 0; j < _di._nums; j++) {
float val = (float) featChunks[_di._cats + j].atd(i);
if (val != 0) {
dataPointer.set(_matrix._sparseData, val);
dataPointer.set(_matrix._colIndices, _di._catOffsets[_di._catOffsets.length - 1] + j);
dataPointer.increment();
nonZeroCount++;
}
}
rwRow = setResponseWeightAndOffset(weightChunk, offsetChunk, respChunk, _resp, _weights, _offsets, rwRow, i);
}
rowHeaderPointer.set(_matrix._rowHeaders, nonZeroCount);
}
}
private static int initializeFromChunks(
Chunk[] chunks, int weight, DataInfo di, long[][] rowHeaders, float[][] data, int[][] colIndex,
int respIdx, float[] resp, float[] weights, int offsetIdx, float[] offsets
) {
int actualRows = 0;
int nonZeroCount = 0;
int rwRow = 0;
NestedArrayPointer rowHeaderPointer = new NestedArrayPointer();
NestedArrayPointer dataPointer = new NestedArrayPointer();
for (int i = 0; i < chunks[0].len(); i++) {
if (weight != -1 && chunks[weight].atd(i) == 0) continue;
actualRows++;
rowHeaderPointer.setAndIncrement(rowHeaders, nonZeroCount);
for (int j = 0; j < di._cats; j++) {
dataPointer.set(data, 1); //one-hot encoding
if (chunks[j].isNA(i)) {
dataPointer.set(colIndex, di.getCategoricalId(j, Float.NaN));
} else {
dataPointer.set(colIndex, di.getCategoricalId(j, chunks[j].at8(i)));
}
dataPointer.increment();
nonZeroCount++;
}
for (int j = 0; j < di._nums; j++) {
float val = (float) chunks[di._cats + j].atd(i);
if (val != 0) {
dataPointer.set(data, val);
dataPointer.set(colIndex, di._catOffsets[di._catOffsets.length - 1] + j);
dataPointer.increment();
nonZeroCount++;
}
}
rwRow = setResponseAndWeightAndOffset(chunks, respIdx, weight, offsetIdx, resp, weights, offsets, rwRow, i);
}
rowHeaderPointer.set(rowHeaders, nonZeroCount);
return actualRows;
}
/**
* Creates a {@link SparseMatrix} object with pre-instantiated backing arrays for row-oriented compression schema (CSR).
* All backing arrays are allocated using MemoryManager.
*
* @param sparseMatrixDimensions Dimensions of a sparse matrix
* @return An instance of {@link SparseMatrix} with pre-allocated backing arrays.
*/
public static SparseMatrix allocateCSRMatrix(SparseMatrixDimensions sparseMatrixDimensions) {
// Number of rows in non-zero elements matrix
final int dataRowsNumber = (int) (sparseMatrixDimensions._nonZeroElementsCount / SparseMatrix.MAX_DIM);
final int dataLastRowSize = (int)(sparseMatrixDimensions._nonZeroElementsCount % SparseMatrix.MAX_DIM);
//Number of rows in matrix with row indices
final int rowIndicesRowsNumber = (int)(sparseMatrixDimensions._rowHeadersCount / SparseMatrix.MAX_DIM);
final int rowIndicesLastRowSize = (int)(sparseMatrixDimensions._rowHeadersCount % SparseMatrix.MAX_DIM);
// Number of rows in matrix with column indices of sparse matrix non-zero elements
// There is one column index per each non-zero element, no need to recalculate.
final int colIndicesRowsNumber = dataRowsNumber;
final int colIndicesLastRowSize = dataLastRowSize;
// Sparse matrix elements (non-zero elements)
float[][] sparseData = new float[dataLastRowSize == 0 ? dataRowsNumber : dataRowsNumber + 1][];
int iterationLimit = dataLastRowSize == 0 ? sparseData.length : sparseData.length - 1;
for (int sparseDataRow = 0; sparseDataRow < iterationLimit; sparseDataRow++) {
sparseData[sparseDataRow] = malloc4f(SparseMatrix.MAX_DIM);
}
if (dataLastRowSize > 0) {
sparseData[sparseData.length - 1] = malloc4f(dataLastRowSize);
}
// Row indices
long[][] rowIndices = new long[rowIndicesLastRowSize == 0 ? rowIndicesRowsNumber : rowIndicesRowsNumber + 1][];
iterationLimit = rowIndicesLastRowSize == 0 ? rowIndices.length : rowIndices.length - 1;
for (int rowIndicesRow = 0; rowIndicesRow < iterationLimit; rowIndicesRow++) {
rowIndices[rowIndicesRow] = malloc8(SparseMatrix.MAX_DIM);
}
if (rowIndicesLastRowSize > 0) {
rowIndices[rowIndices.length - 1] = malloc8(rowIndicesLastRowSize);
}
// Column indices
int[][] colIndices = new int[colIndicesLastRowSize == 0 ? colIndicesRowsNumber : colIndicesRowsNumber + 1][];
iterationLimit = colIndicesLastRowSize == 0 ? colIndices.length : colIndices.length - 1;
for (int colIndicesRow = 0; colIndicesRow < iterationLimit; colIndicesRow++) {
colIndices[colIndicesRow] = malloc4(SparseMatrix.MAX_DIM);
}
if (colIndicesLastRowSize > 0) {
colIndices[colIndices.length - 1] = malloc4(colIndicesLastRowSize);
}
// Wrap backing arrays into a SparseMatrix object and return them
return new SparseMatrix(sparseData, rowIndices, colIndices);
}
protected static SparseMatrixDimensions calculateCSRMatrixDimensions(Chunk[] chunks, DataInfo di, int weightColIndex){
int[] nonZeroElementsCounts = new int[1];
int[] rowIndicesCounts = new int[1];
for (int i = 0; i < chunks[0].len(); i++) {
// Rows with zero weights are going to be ignored
if (weightColIndex != -1 && chunks[weightColIndex].atd(i) == 0) continue;
rowIndicesCounts[0]++;
nonZeroElementsCounts[0] += di._cats;
for (int j = 0; j < di._nums; j++) {
double val = chunks[di._cats + j].atd(i);
if (val != 0) {
nonZeroElementsCounts[0]++;
}
}
}
return new SparseMatrixDimensions(nonZeroElementsCounts, rowIndicesCounts);
}
public static SparseMatrixDimensions calculateCSRMatrixDimensions(Frame f, int[] chunkIds, Vec w, DataInfo di) {
CalculateCSRMatrixDimensionsMrFun fun = new CalculateCSRMatrixDimensionsMrFun(f, di, w, chunkIds);
H2O.submitTask(new LocalMR(fun, chunkIds.length)).join();
return new SparseMatrixDimensions(fun._nonZeroElementsCounts, fun._rowIndicesCounts);
}
private static class CalculateCSRMatrixDimensionsMrFun extends MrFun {
private Frame _f;
private DataInfo _di;
private Vec _w;
private int[] _chunkIds;
// OUT
private int[] _rowIndicesCounts;
private int[] _nonZeroElementsCounts;
CalculateCSRMatrixDimensionsMrFun(Frame f, DataInfo di, Vec w, int[] chunkIds) {
_f = f;
_di = di;
_w = w;
_chunkIds = chunkIds;
_rowIndicesCounts = new int[chunkIds.length];
_nonZeroElementsCounts = new int[chunkIds.length];
}
@Override
protected void map(int i) {
final int cidx = _chunkIds[i];
int rowIndicesCount = 0;
int nonZeroElementsCount = 0;
if (_di._nums == 0) {
if (_w == null) {
// no weights and only categoricals => sizing is trivial
rowIndicesCount = _f.anyVec().chunkForChunkIdx(cidx)._len;
nonZeroElementsCount = rowIndicesCount * _di._cats;
} else {
Chunk ws = _w.chunkForChunkIdx(cidx);
int nzWeights = 0;
for (int r = 0; r < ws._len; r++)
if (ws.atd(r) != 0) {
nzWeights++;
}
rowIndicesCount += nzWeights;
nonZeroElementsCount += nzWeights * _di._cats;
}
} else {
Chunk[] cs = new Chunk[_di._nums];
for (int c = 0; c < cs.length; c++) {
cs[c] = _f.vec(_di._cats + c).chunkForChunkIdx(cidx);
}
Chunk ws = _w != null ? _w.chunkForChunkIdx(cidx) : null;
for (int r = 0; r < cs[0]._len; r++) {
if (ws != null && ws.atd(r) == 0) continue;
rowIndicesCount++;
nonZeroElementsCount += _di._cats;
for (int j = 0; j < _di._nums; j++) {
if (cs[j].atd(r) != 0) {
nonZeroElementsCount++;
}
}
}
}
_rowIndicesCounts[i] = rowIndicesCount;
_nonZeroElementsCounts[i] = nonZeroElementsCount;
}
}
}