
// smile.data.SparseDatasetImpl (Maven / Gradle / Ivy)
/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.data;
import java.util.*;
import java.util.stream.Stream;
import smile.util.SparseArray;
/**
* List of Lists sparse matrix format. LIL stores one list per row,
* where each entry stores a column index and value. Typically, these
* entries are kept sorted by column index for faster lookup.
* This format is good for incremental matrix construction.
*
* LIL is typically used to construct the matrix. Once the matrix is
* constructed, it is typically converted to a format, such as Harwell-Boeing
* column-compressed sparse matrix format, which is more efficient for matrix
* operations.
*
* @author Haifeng Li
*/
class SparseDatasetImpl implements SparseDataset {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SparseDatasetImpl.class);
/**
* The data objects.
*/
private final ArrayList> instances;
/**
* The number of nonzero entries.
*/
private int n;
/**
* The number of columns.
*/
private final int ncol;
/**
* The number of nonzero entries in each column.
*/
private int[] colSize;
/**
* Constructor.
* @param data The sample instances.
*/
public SparseDatasetImpl(Collection> data) {
this(data, 1 + data.stream().flatMapToInt(instance -> instance.x().indexStream()).max().orElse(0));
}
/**
* Constructor.
* @param data The sample instances.
* @param ncol The number of columns.
*/
public SparseDatasetImpl(Collection> data, int ncol) {
this.instances = new ArrayList<>(data);
this.ncol = ncol;
colSize = new int[ncol];
for (var instance : data) {
var x = instance.x();
x.sort(); // sort array index into ascending order.
int i = -1; // index of previous element
for (SparseArray.Entry e : x) {
if (e.index() < 0) {
throw new IllegalArgumentException(String.format("Negative index of nonzero element: %d", e.index()));
}
if (e.index() == i) {
logger.warn("Ignore duplicated indices: {} in {}", e.index(), x);
} else {
if (ncol <= e.index()) {
ncol = e.index() + 1;
int[] newColSize = new int[3 * ncol / 2];
System.arraycopy(colSize, 0, newColSize, 0, colSize.length);
colSize = newColSize;
}
colSize[e.index()]++;
n++;
i = e.index();
}
}
}
}
@Override
public int size() {
return instances.size();
}
@Override
public int nz() {
return n;
}
@Override
public int nz(int j) {
return colSize[j];
}
@Override
public int ncol() {
return ncol;
}
@Override
public SampleInstance get(int i) {
return instances.get(i);
}
@Override
public Stream> stream() {
return instances.stream();
}
@Override
public Iterator> iterator() {
return instances.iterator();
}
}