// org.tribuo.math.la.SparseVector Maven / Gradle / Ivy
/*
* Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tribuo.math.la;
import org.tribuo.Dataset;
import org.tribuo.Example;
import org.tribuo.Feature;
import org.tribuo.ImmutableFeatureMap;
import org.tribuo.Output;
import org.tribuo.math.util.VectorNormalizer;
import org.tribuo.util.IntDoublePair;
import org.tribuo.util.Util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.function.DoubleUnaryOperator;
import java.util.stream.Collectors;
/**
* A sparse vector. Stored as a sorted array of indices and an array of values.
*
* Uses binary search to look up a specific index, so it's usually faster to
* use the iterator to iterate the values.
*
* This vector has immutable indices. It cannot get new indices after construction,
* and will throw {@link IllegalArgumentException} if such an operation is tried.
*/
public class SparseVector implements SGDVector {
private static final long serialVersionUID = 1L;
private final int[] shape;
protected final int[] indices;
protected final double[] values;
private final int size;
/**
 * Package-private fast-path constructor.
 * <p>
 * The supplied arrays are stored directly: they are neither copied nor
 * checked for ordering, so the new vector shares them with the caller.
 *
 * @param size The dimension of this vector.
 * @param indices The indices of the active elements.
 * @param values The values of the active elements.
 */
SparseVector(int size, int[] indices, double[] values) {
    this.indices = indices;
    this.values = values;
    this.size = size;
    this.shape = new int[]{size};
}
/**
 * Copy constructor producing a deep copy of {@code other}.
 * <p>
 * The active elements are read through {@code other}'s iterator and
 * written into freshly allocated arrays.
 *
 * @param other The SparseVector to copy.
 */
private SparseVector(SparseVector other) {
    this.size = other.size;
    this.shape = new int[]{size};
    int activeCount = other.numActiveElements();
    this.indices = new int[activeCount];
    this.values = new double[activeCount];
    int position = 0;
    for (VectorTuple element : other) {
        this.indices[position] = element.index;
        this.values[position] = element.value;
        position++;
    }
}
public SparseVector(int size, int[] indices, double value) {
this.indices = Arrays.copyOf(indices,indices.length);
this.values = new double[indices.length];
Arrays.fill(this.values,value);
this.size = size;
this.shape = new int[]{size};
}
/**
* Builds a {@link SparseVector} from an {@link Example}.
*
* Used in training and inference.
*
* Throws {@link IllegalArgumentException} if the Example contains NaN-valued features.
* @param example The example to convert.
* @param featureInfo The feature information, used to calculate the dimension of this SparseVector.
* @param addBias Add a bias feature.
* @param The type parameter of the {@code example}.
* @return A SparseVector representing the example's features.
*/
public static > SparseVector createSparseVector(Example example, ImmutableFeatureMap featureInfo, boolean addBias) {
int size;
int numFeatures = example.size();
if (addBias) {
size = featureInfo.size() + 1;
numFeatures++;
} else {
size = featureInfo.size();
}
int[] tmpIndices = new int[numFeatures];
double[] tmpValues = new double[numFeatures];
int i = 0;
int prevIdx = -1;
for (Feature f : example) {
int index = featureInfo.getID(f.getName());
if (index > prevIdx){
prevIdx = index;
tmpIndices[i] = index;
tmpValues[i] = f.getValue();
if (Double.isNaN(tmpValues[i])) {
throw new IllegalArgumentException("Example contained a NaN feature, " + f.toString());
}
i++;
} else if (index > -1) {
//
// Collision, deal with it.
int collisionIdx = Arrays.binarySearch(tmpIndices,0,i,index);
if (collisionIdx < 0) {
//
// Collision but not present in tmpIndices
// move data and bump i
collisionIdx = - (collisionIdx + 1);
System.arraycopy(tmpIndices,collisionIdx,tmpIndices,collisionIdx+1,i-collisionIdx);
System.arraycopy(tmpValues,collisionIdx,tmpValues,collisionIdx+1,i-collisionIdx);
tmpIndices[collisionIdx] = index;
tmpValues[collisionIdx] = f.getValue();
if (Double.isNaN(tmpValues[collisionIdx])) {
throw new IllegalArgumentException("Example contained a NaN feature, " + f.toString());
}
i++;
} else {
//
// Collision present in tmpIndices
// add the values.
tmpValues[collisionIdx] += f.getValue();
if (Double.isNaN(tmpValues[collisionIdx])) {
throw new IllegalArgumentException("Example contained a NaN feature, " + f.toString());
}
}
}
}
if (addBias) {
tmpIndices[i] = size - 1;
tmpValues[i] = 1.0;
i++;
}
return new SparseVector(size,Arrays.copyOf(tmpIndices,i),Arrays.copyOf(tmpValues,i));
}
/**
 * Defensively copies the input, sorting the (index, value) pairs by index.
 * <p>
 * Duplicate indices are not detected or merged by this method.
 *
 * @param dimension The dimension of this vector.
 * @param indices The indices of the non-zero elements.
 * @param values The values of the non-zero elements.
 * @return A SparseVector encapsulating the indices and values.
 * @throws IllegalArgumentException If the arrays are not the same length, if any index is negative,
 *                                  or if any index is greater than or equal to {@code dimension}.
 */
public static SparseVector createSparseVector(int dimension, int[] indices, double[] values) {
    if (indices.length != values.length) {
        throw new IllegalArgumentException("Indices and values must be the same length, found indices.length = " + indices.length + " and values.length = " + values.length);
    }
    if (indices.length == 0) {
        return new SparseVector(dimension, indices, values);
    }

    // Sort indices and values together by pairing them up.
    IntDoublePair[] pairArray = new IntDoublePair[indices.length];
    for (int i = 0; i < pairArray.length; i++) {
        pairArray[i] = new IntDoublePair(indices[i], values[i]);
    }
    Arrays.sort(pairArray, IntDoublePair.pairIndexComparator());

    int[] newIndices = new int[indices.length];
    double[] newValues = new double[values.length];
    for (int i = 0; i < pairArray.length; i++) {
        newIndices[i] = pairArray[i].index;
        newValues[i] = pairArray[i].value;
    }

    // After sorting, the extremes sit at the ends of the array.
    int minIndex = newIndices[0];
    int maxIndex = newIndices[newIndices.length - 1];
    if (minIndex < 0) {
        throw new IllegalArgumentException("Indices must be non-negative, found index = " + minIndex);
    }
    // Indices are zero-based, so the largest valid index is dimension - 1.
    if (dimension <= maxIndex) {
        throw new IllegalArgumentException("Number of dimensions must be greater than the maximum index, dimensions = " + dimension + ", max index = " + maxIndex);
    }
    return new SparseVector(dimension, newIndices, newValues);
}
/**
 * Builds a SparseVector from a map of index to value.
 *
 * @param dimension The dimension of this vector.
 * @param indexMap The map from indices to values.
 * @return A SparseVector whose active elements are the map entries, sorted by index.
 * @throws IllegalArgumentException If any index is negative, or greater than or equal to {@code dimension}.
 */
public static SparseVector createSparseVector(int dimension, Map<Integer, Double> indexMap) {
    if (indexMap.isEmpty()) {
        return new SparseVector(dimension, new int[0], new double[0]);
    }

    List<Map.Entry<Integer, Double>> sortedEntries = indexMap.entrySet()
            .stream()
            .sorted(Map.Entry.comparingByKey())
            .collect(Collectors.toList());

    int[] indices = new int[sortedEntries.size()];
    double[] values = new double[sortedEntries.size()];
    int i = 0;
    for (Map.Entry<Integer, Double> entry : sortedEntries) {
        indices[i] = entry.getKey();
        values[i] = entry.getValue();
        i++;
    }

    // Entries are sorted by key, so the extremes sit at the ends of the array.
    int minIndex = indices[0];
    int maxIndex = indices[indices.length - 1];
    if (minIndex < 0) {
        throw new IllegalArgumentException("Indices must be non-negative, found index = " + minIndex);
    }
    // Indices are zero-based, so the largest valid index is dimension - 1.
    if (dimension <= maxIndex) {
        throw new IllegalArgumentException("Number of dimensions must be greater than the maximum index, dimensions = " + dimension + ", max index = " + maxIndex);
    }
    return new SparseVector(dimension, indices, values);
}
/**
* Returns a copy of this vector, created through the private copy constructor.
* @return A new SparseVector with its own index and value arrays.
*/
@Override
public SparseVector copy() {
return new SparseVector(this);
}
/**
* Returns the shape of this vector, a single-element array holding the dimension.
* The internal array is returned directly, not a copy.
* @return The shape array.
*/
@Override
public int[] getShape() {
return shape;
}
/**
* Reshaping is not supported for sparse vectors.
* @param newShape The requested shape (unused).
* @return Never returns normally.
* @throws UnsupportedOperationException Always.
*/
@Override
public Tensor reshape(int[] newShape) {
throw new UnsupportedOperationException("Reshape not supported on sparse Tensors.");
}
/**
* Returns the dimension of this vector, as supplied at construction.
* This is not the number of stored elements; see {@link #numActiveElements()}.
* @return The dimension.
*/
@Override
public int size() {
return size;
}
/**
* Returns the number of explicitly stored elements, i.e. the length of the value array.
* @return The number of active elements.
*/
@Override
public int numActiveElements() {
return values.length;
}
/**
 * Equals is defined mathematically, that is two SGDVectors are equal iff they have the same
 * dimension, the same indices and the same values at those indices.
 * <p>
 * The dimension is compared first because {@link #hashCode()} includes it; without that check
 * two vectors of different dimension could compare equal while hashing differently.
 * The active elements are then compared pairwise, in iteration order, using
 * {@code VectorTuple.equals}.
 *
 * @param other Object to compare against.
 * @return True if this vector and the other vector have the same dimension and contain the same
 *         values in the same order.
 */
@Override
public boolean equals(Object other) {
    if (this == other) {
        return true;
    }
    if (!(other instanceof SGDVector)) {
        return false;
    }
    SGDVector otherVector = (SGDVector) other;
    if (otherVector.size() != size) {
        return false;
    }
    Iterator<VectorTuple> ourItr = iterator();
    Iterator<VectorTuple> otherItr = otherVector.iterator();
    while (ourItr.hasNext() && otherItr.hasNext()) {
        VectorTuple ourTuple = ourItr.next();
        VectorTuple otherTuple = otherItr.next();
        if (!ourTuple.equals(otherTuple)) {
            return false;
        }
    }
    // If one of the iterators still has elements then they are not the same.
    return !(ourItr.hasNext() || otherItr.hasNext());
}
/**
 * Hashes the dimension, the index array and the value array.
 *
 * @return The hash code.
 */
@Override
public int hashCode() {
    int sizeHash = Objects.hash(size);
    int indexHash = Arrays.hashCode(indices);
    int valueHash = Arrays.hashCode(values);
    return 31 * (31 * sizeHash + indexHash) + valueHash;
}
/**
 * Adds {@code other} to this vector, producing a new {@link SGDVector}.
 * If {@code other} is a {@link SparseVector} then the returned vector is also
 * a {@link SparseVector} otherwise it's a {@link DenseVector}.
 * <p>
 * For the sparse case the result's active indices are the union of both vectors' indices.
 *
 * @param other The vector to add.
 * @return A new {@link SGDVector} where each element value = this.get(i) + other.get(i).
 * @throws IllegalArgumentException If the dimensions differ, or {@code other} is neither dense nor sparse.
 */
@Override
public SGDVector add(SGDVector other) {
    if (other.size() != size) {
        throw new IllegalArgumentException("Can't add two vectors of different dimension, this = " + size + ", other = " + other.size());
    }
    if (other instanceof DenseVector) {
        // Addition commutes, so delegate to the dense implementation.
        return other.add(this);
    } else if (other instanceof SparseVector) {
        Map<Integer, Double> values = new HashMap<>();
        for (VectorTuple tuple : this) {
            values.put(tuple.index, tuple.value);
        }
        for (VectorTuple tuple : other) {
            values.merge(tuple.index, tuple.value, Double::sum);
        }
        return createSparseVector(size, values);
    } else {
        throw new IllegalArgumentException("Vector other is not dense or sparse.");
    }
}
/**
 * Subtracts {@code other} from this vector, producing a new {@link SGDVector}.
 * If {@code other} is a {@link SparseVector} then the returned vector is also
 * a {@link SparseVector} otherwise it's a {@link DenseVector}.
 * <p>
 * In the dense case every index that is not active in this vector contributes an implicit
 * zero, so the result there is {@code 0 - other.get(i)}.
 *
 * @param other The vector to subtract.
 * @return A new {@link SGDVector} where each element value = this.get(i) - other.get(i).
 * @throws IllegalArgumentException If the dimensions differ, or {@code other} is neither dense nor sparse.
 */
@Override
public SGDVector subtract(SGDVector other) {
    if (other.size() != size) {
        throw new IllegalArgumentException("Can't subtract two vectors of different dimension, this = " + size + ", other = " + other.size());
    }
    if (other instanceof DenseVector) {
        DenseVector output = ((DenseVector) other).copy();
        // Start from (0 - other) everywhere, which is the answer wherever this vector is zero.
        output.foreachInPlace(a -> 0.0 - a);
        // Then overwrite the positions where this vector has an explicit value.
        for (VectorTuple tuple : this) {
            output.set(tuple.index, tuple.value - other.get(tuple.index));
        }
        return output;
    } else if (other instanceof SparseVector) {
        Map<Integer, Double> values = new HashMap<>();
        for (VectorTuple tuple : this) {
            values.put(tuple.index, tuple.value);
        }
        for (VectorTuple tuple : other) {
            values.merge(tuple.index, -tuple.value, Double::sum);
        }
        return createSparseVector(size, values);
    } else {
        throw new IllegalArgumentException("Vector other is not dense or sparse.");
    }
}
/**
 * For every index that is active in this vector, adds {@code f(other[index])} to the stored value.
 * <p>
 * With a {@link SparseVector} argument only indices active in both vectors are updated.
 * With a {@link DenseVector} argument every active index of this vector is updated.
 * A vector with no active elements is left untouched.
 *
 * @param other The tensor to read values from.
 * @param f The function applied to each value from {@code other} before it is added.
 * @throws IllegalArgumentException If the dimensions differ.
 * @throws IllegalStateException If {@code other} is neither a SparseVector nor a DenseVector.
 */
@Override
public void intersectAndAddInPlace(Tensor other, DoubleUnaryOperator f) {
    if (other instanceof SparseVector) {
        SparseVector otherVec = (SparseVector) other;
        if (otherVec.size() != size) {
            throw new IllegalArgumentException("Can't intersect two vectors of different dimension, this = " + size + ", other = " + otherVec.size());
        }
        int[] otherIndices = otherVec.indices;
        double[] otherValues = otherVec.values;
        // Both index arrays are sorted, so walk them in lock-step.
        int i = 0;
        int j = 0;
        while (i < indices.length && j < otherIndices.length) {
            if (indices[i] == otherIndices[j]) {
                values[i] += f.applyAsDouble(otherValues[j]);
                i++;
                j++;
            } else if (indices[i] < otherIndices[j]) {
                i++;
            } else {
                j++;
            }
        }
    } else if (other instanceof DenseVector) {
        DenseVector otherVec = (DenseVector) other;
        if (otherVec.size() != size) {
            throw new IllegalArgumentException("Can't intersect two vectors of different dimension, this = " + size + ", other = " + otherVec.size());
        }
        for (int i = 0; i < indices.length; i++) {
            values[i] += f.applyAsDouble(otherVec.get(indices[i]));
        }
    } else {
        throw new IllegalStateException("Unknown Tensor subclass " + other.getClass().getCanonicalName() + " for input");
    }
}
/**
 * For every index that is active in this vector, multiplies the stored value by {@code f(other[index])}.
 * <p>
 * With a {@link SparseVector} argument only indices active in both vectors are updated;
 * values at indices missing from {@code other} are left unchanged.
 * With a {@link DenseVector} argument every active index of this vector is updated.
 * A vector with no active elements is left untouched.
 *
 * @param other The tensor to read values from.
 * @param f The function applied to each value from {@code other} before it is multiplied in.
 * @throws IllegalArgumentException If the dimensions differ, or {@code other} is neither a
 *                                  SparseVector nor a DenseVector.
 */
@Override
public void hadamardProductInPlace(Tensor other, DoubleUnaryOperator f) {
    if (other instanceof SparseVector) {
        SparseVector otherVec = (SparseVector) other;
        if (otherVec.size() != size) {
            throw new IllegalArgumentException("Can't hadamard product two vectors of different dimension, this = " + size + ", other = " + otherVec.size());
        }
        int[] otherIndices = otherVec.indices;
        double[] otherValues = otherVec.values;
        // Both index arrays are sorted, so walk them in lock-step.
        int i = 0;
        int j = 0;
        while (i < indices.length && j < otherIndices.length) {
            if (indices[i] == otherIndices[j]) {
                values[i] *= f.applyAsDouble(otherValues[j]);
                i++;
                j++;
            } else if (indices[i] < otherIndices[j]) {
                i++;
            } else {
                j++;
            }
        }
    } else if (other instanceof DenseVector) {
        DenseVector otherVec = (DenseVector) other;
        if (otherVec.size() != size) {
            throw new IllegalArgumentException("Can't hadamard product two vectors of different dimension, this = " + size + ", other = " + otherVec.size());
        }
        for (int i = 0; i < indices.length; i++) {
            values[i] *= f.applyAsDouble(otherVec.get(indices[i]));
        }
    } else {
        throw new IllegalArgumentException("Invalid Tensor subclass " + other.getClass().getCanonicalName() + " for input");
    }
}
/**
 * Replaces every stored value with the result of applying {@code f} to it.
 * Implicit zeros are not visited.
 *
 * @param f The function to apply to each active value.
 */
@Override
public void foreachInPlace(DoubleUnaryOperator f) {
    Arrays.setAll(values, position -> f.applyAsDouble(values[position]));
}
/**
 * Returns a new vector whose active values are this vector's values multiplied by
 * {@code coefficient}. This vector is not modified.
 *
 * @param coefficient The scaling factor.
 * @return A new SparseVector with copied indices and scaled values.
 */
@Override
public SparseVector scale(double coefficient) {
    double[] scaled = new double[values.length];
    for (int k = 0; k < scaled.length; k++) {
        scaled[k] = values[k] * coefficient;
    }
    int[] indexCopy = indices.clone();
    return new SparseVector(size, indexCopy, scaled);
}
/**
 * Adds {@code value} to the element stored at {@code index}.
 *
 * @param index The index to update; it must already be active in this vector.
 * @param value The amount to add.
 * @throws IllegalArgumentException If {@code index} is not one of the active indices.
 */
@Override
public void add(int index, double value) {
    int position = Arrays.binarySearch(indices, index);
    if (position >= 0) {
        values[position] += value;
        return;
    }
    throw new IllegalArgumentException("SparseVector cannot have new elements added.");
}
/**
 * Computes the dot product of this vector and {@code other}.
 * <p>
 * For a sparse argument only indices active in both vectors contribute.
 * For a dense argument each active element of this vector is multiplied by the
 * corresponding element of {@code other}.
 *
 * @param other The vector to dot with.
 * @return The dot product.
 * @throws IllegalArgumentException If the dimensions differ, or {@code other} is neither a
 *                                  SparseVector nor a DenseVector.
 */
@Override
public double dot(SGDVector other) {
    if (other.size() != size) {
        throw new IllegalArgumentException("Can't dot two vectors of different lengths, this = " + size + ", other = " + other.size());
    } else if (other instanceof SparseVector) {
        SparseVector otherVec = (SparseVector) other;
        int[] otherIndices = otherVec.indices;
        double[] otherValues = otherVec.values;
        double score = 0.0;
        // Both index arrays are sorted, so walk them in lock-step.
        int i = 0;
        int j = 0;
        while (i < indices.length && j < otherIndices.length) {
            if (indices[i] == otherIndices[j]) {
                score += values[i] * otherValues[j];
                i++;
                j++;
            } else if (indices[i] < otherIndices[j]) {
                i++;
            } else {
                j++;
            }
        }
        return score;
    } else if (other instanceof DenseVector) {
        double score = 0.0;
        for (int i = 0; i < indices.length; i++) {
            score += other.get(indices[i]) * values[i];
        }
        return score;
    } else {
        throw new IllegalArgumentException("Unknown vector subclass " + other.getClass().getCanonicalName() + " for input");
    }
}
/**
 * Generates the outer product of this vector with another {@link SparseVector}.
 * <p>
 * The result has one row per dimension of this vector. A row whose index is active in this
 * vector holds {@code other} scaled by the stored value; every other row is an empty
 * SparseVector.
 *
 * @param other A vector.
 * @return A {@link DenseSparseMatrix} representing the outer product.
 * @throws IllegalArgumentException If {@code other} is a {@link DenseVector} or an unknown subclass.
 */
@Override
public Matrix outer(SGDVector other) {
    if (other instanceof DenseVector) {
        throw new IllegalArgumentException("sparse.outer(dense) is currently not implemented.");
    }
    if (!(other instanceof SparseVector)) {
        throw new IllegalArgumentException("Unknown vector subclass " + other.getClass().getCanonicalName() + " for input");
    }
    //This horrible mess is why there should be a sparse-sparse matrix type.
    SparseVector otherVec = (SparseVector) other;
    SparseVector[] rows = new SparseVector[size];
    int row = 0;
    for (VectorTuple element : this) {
        // Pad the gap before this active index with empty rows.
        for (; row < element.index; row++) {
            rows[row] = new SparseVector(other.size(), new int[0], new double[0]);
        }
        rows[element.index] = otherVec.scale(element.value);
        row++;
    }
    // Pad the remaining rows after the last active index.
    for (; row < rows.length; row++) {
        rows[row] = new SparseVector(other.size(), new int[0], new double[0]);
    }
    //TODO this is suboptimal if there are lots of missing rows.
    return new DenseSparseMatrix(rows);
}
/**
 * Sums the stored values.
 *
 * @return The sum of the active elements.
 */
@Override
public double sum() {
    double total = 0.0;
    for (double v : values) {
        total += v;
    }
    return total;
}
/**
 * Computes the Euclidean (L2) norm from the stored values.
 *
 * @return The square root of the sum of squared active values.
 */
@Override
public double twoNorm() {
    double sumOfSquares = 0.0;
    for (double v : values) {
        sumOfSquares += v * v;
    }
    return Math.sqrt(sumOfSquares);
}
/**
 * Computes the L1 norm from the stored values.
 *
 * @return The sum of the absolute values of the active elements.
 */
@Override
public double oneNorm() {
    double total = 0.0;
    for (double v : values) {
        total += Math.abs(v);
    }
    return total;
}
/**
 * Looks up the value at {@code index} using a binary search over the active indices.
 *
 * @param index The index to read.
 * @return The stored value, or 0 if the index is not active.
 */
@Override
public double get(int index) {
    int position = Arrays.binarySearch(indices, index);
    return (position >= 0) ? values[position] : 0;
}
/**
 * Overwrites the value stored at {@code index}.
 *
 * @param index The index to update; it must already be active in this vector.
 * @param value The new value.
 * @throws IllegalArgumentException If {@code index} is not one of the active indices.
 */
@Override
public void set(int index, double value) {
    int position = Arrays.binarySearch(indices, index);
    if (position >= 0) {
        values[position] = value;
        return;
    }
    throw new IllegalArgumentException("SparseVector cannot have new elements added.");
}
/**
 * Finds the index of the largest stored value.
 * <p>
 * Only active elements are inspected; implicit zeros are not considered. Ties keep the
 * earliest position. With no active elements the lookup of {@code indices[0]} fails with an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @return The vector index of the maximum active value.
 */
@Override
public int indexOfMax() {
    int bestPosition = 0;
    double bestValue = Double.NEGATIVE_INFINITY;
    int position = 0;
    for (double candidate : values) {
        if (candidate > bestValue) {
            bestValue = candidate;
            bestPosition = position;
        }
        position++;
    }
    return indices[bestPosition];
}
/**
 * Finds the largest stored value. Implicit zeros are not considered.
 *
 * @return The maximum active value, or {@link Double#NEGATIVE_INFINITY} if nothing exceeds it
 *         (for example when there are no active elements).
 */
@Override
public double maxValue() {
    double best = Double.NEGATIVE_INFINITY;
    for (double candidate : values) {
        if (candidate > best) {
            best = candidate;
        }
    }
    return best;
}
/**
 * Finds the smallest stored value. Implicit zeros are not considered.
 *
 * @return The minimum active value, or {@link Double#POSITIVE_INFINITY} if nothing is below it
 *         (for example when there are no active elements).
 */
@Override
public double minValue() {
    double best = Double.POSITIVE_INFINITY;
    for (double candidate : values) {
        if (candidate < best) {
            best = candidate;
        }
    }
    return best;
}
/**
 * Generates an array of the indices that are active in this vector
 * but are not present in {@code other}.
 *
 * @param other The vector to compare.
 * @return An array of indices that are active only in this vector, in ascending order.
 */
public int[] difference(SparseVector other) {
    List<Integer> onlyHere = new ArrayList<>();
    int[] otherIndices = other.indices;
    // Both index arrays are sorted, so walk them in lock-step.
    int i = 0;
    int j = 0;
    while (i < indices.length && j < otherIndices.length) {
        if (indices[i] == otherIndices[j]) {
            i++;
            j++;
        } else if (indices[i] < otherIndices[j]) {
            onlyHere.add(indices[i]);
            i++;
        } else {
            j++;
        }
    }
    // Anything left in this vector lies beyond the last index of other.
    for (; i < indices.length; i++) {
        onlyHere.add(indices[i]);
    }
    return Util.toPrimitiveInt(onlyHere);
}
/**
 * Generates an array of the indices that are active in both this
 * vector and {@code other}.
 *
 * @param other The vector to intersect.
 * @return An array of indices that are active in both vectors, in ascending order.
 */
public int[] intersection(SparseVector other) {
    List<Integer> shared = new ArrayList<>();
    int[] otherIndices = other.indices;
    // Both index arrays are sorted, so walk them in lock-step.
    int i = 0;
    int j = 0;
    while (i < indices.length && j < otherIndices.length) {
        if (indices[i] == otherIndices[j]) {
            shared.add(indices[i]);
            i++;
            j++;
        } else if (indices[i] < otherIndices[j]) {
            i++;
        } else {
            j++;
        }
    }
    return Util.toPrimitiveInt(shared);
}
/**
* Normalization is not supported for sparse vectors.
* @param normalizer The normalizer (unused).
* @throws IllegalStateException Always.
*/
@Override
public void normalize(VectorNormalizer normalizer) {
throw new IllegalStateException("Can't normalize a sparse array");
}
/**
* Computes the Euclidean distance to {@code other} by delegating to
* {@link #distance(SGDVector, DoubleUnaryOperator, DoubleUnaryOperator)} with a
* squaring transform and a square-root normalizer.
* @param other The other vector.
* @return The Euclidean distance.
*/
@Override
public double euclideanDistance(SGDVector other) {
return distance(other,(double a) -> a*a, Math::sqrt);
}
/**
* Computes the L1 distance to {@code other} by delegating to
* {@link #distance(SGDVector, DoubleUnaryOperator, DoubleUnaryOperator)} with an
* absolute-value transform and an identity normalizer.
* @param other The other vector.
* @return The L1 distance.
*/
@Override
public double l1Distance(SGDVector other) {
return distance(other,Math::abs,DoubleUnaryOperator.identity());
}
/**
 * Computes a distance between this vector and {@code other}.
 * <p>
 * The two vectors are walked together through their iterators. For an index present in both,
 * {@code transformFunc} is applied to the difference of the values; for an index present in
 * only one, it is applied to that value alone. The results are summed and the sum is passed
 * through {@code normalizeFunc}.
 *
 * @param other The other vector.
 * @param transformFunc Applied to each per-index difference (or lone value) before summing.
 * @param normalizeFunc Applied to the accumulated sum to produce the final distance.
 * @return The distance.
 * @throws IllegalArgumentException If the dimensions differ.
 */
public double distance(SGDVector other, DoubleUnaryOperator transformFunc, DoubleUnaryOperator normalizeFunc) {
    if (other.size() != size) {
        throw new IllegalArgumentException("Can't measure the distance between two vectors of different lengths, this = " + size + ", other = " + other.size());
    }
    double score = 0.0;
    if ((other.numActiveElements() != 0) && (indices.length != 0)) {
        Iterator<VectorTuple> itr = iterator();
        Iterator<VectorTuple> otherItr = other.iterator();
        VectorTuple tuple = itr.next();
        VectorTuple otherTuple = otherItr.next();
        // Merge while both iterators have something beyond the current tuples.
        while (itr.hasNext() && otherItr.hasNext()) {
            if (tuple.index == otherTuple.index) {
                score += transformFunc.applyAsDouble(tuple.value - otherTuple.value);
                tuple = itr.next();
                otherTuple = otherItr.next();
            } else if (tuple.index < otherTuple.index) {
                score += transformFunc.applyAsDouble(tuple.value);
                tuple = itr.next();
            } else {
                score += transformFunc.applyAsDouble(otherTuple.value);
                otherTuple = otherItr.next();
            }
        }
        // Drain this vector; otherTuple is the last element of other and may still match once.
        while (itr.hasNext()) {
            if (tuple.index == otherTuple.index) {
                score += transformFunc.applyAsDouble(tuple.value - otherTuple.value);
                otherTuple = new VectorTuple(); // Consumed this value, replace with sentinel
            } else {
                score += transformFunc.applyAsDouble(tuple.value);
            }
            tuple = itr.next();
        }
        // Drain other; tuple is the last element of this vector and may still match once.
        while (otherItr.hasNext()) {
            if (tuple.index == otherTuple.index) {
                score += transformFunc.applyAsDouble(tuple.value - otherTuple.value);
                tuple = new VectorTuple(); // Consumed this value, replace with sentinel
            } else {
                score += transformFunc.applyAsDouble(otherTuple.value);
            }
            otherTuple = otherItr.next();
        }
        // Account for the final pending tuple on each side; an index of -1 marks a sentinel.
        if (tuple.index == otherTuple.index) {
            score += transformFunc.applyAsDouble(tuple.value - otherTuple.value);
        } else {
            if (tuple.index != -1) {
                score += transformFunc.applyAsDouble(tuple.value);
            }
            if (otherTuple.index != -1) {
                score += transformFunc.applyAsDouble(otherTuple.value);
            }
        }
    } else if (indices.length != 0) {
        // other has no active elements, so only this vector contributes.
        for (VectorTuple tuple : this) {
            score += transformFunc.applyAsDouble(tuple.value);
        }
    } else {
        // This vector has no active elements, so only other contributes.
        for (VectorTuple tuple : other) {
            score += transformFunc.applyAsDouble(tuple.value);
        }
    }
    return normalizeFunc.applyAsDouble(score);
}
/**
 * Formats this vector as {@code SparseVector(size=N,tuples=[i,v],[j,w])}.
 * <p>
 * A vector with no active elements is rendered as {@code SparseVector(size=N,tuples=)}.
 *
 * @return A string listing the dimension and every (index, value) pair.
 */
@Override
public String toString() {
    StringBuilder buffer = new StringBuilder();
    buffer.append("SparseVector(size=");
    buffer.append(size);
    buffer.append(",tuples=");
    for (int i = 0; i < indices.length; i++) {
        // Separate tuples with a comma, without leaving a trailing one to patch up.
        if (i > 0) {
            buffer.append(",");
        }
        buffer.append("[");
        buffer.append(indices[i]);
        buffer.append(",");
        buffer.append(values[i]);
        buffer.append("]");
    }
    buffer.append(")");
    return buffer.toString();
}
/**
 * Expands this vector into a dense array of length {@code size}, with each stored value
 * written at its index and every other position left at zero.
 *
 * @return A newly allocated dense array.
 */
public double[] toDenseArray() {
    double[] dense = new double[size];
    int position = 0;
    while (position < values.length) {
        dense[indices[position]] = values[position];
        position++;
    }
    return dense;
}
/**
 * Accumulates the squared deviations from {@code mean} over all {@code size} elements.
 * <p>
 * Stored values contribute {@code (value - mean)^2}; each of the remaining implicit zeros
 * contributes {@code mean^2}. The total is returned as is, without dividing by the
 * number of elements.
 *
 * @param mean The mean to measure deviations from.
 * @return The sum of squared deviations.
 */
@Override
public double variance(double mean) {
    double sumOfSquares = 0.0;
    for (double v : values) {
        double deviation = v - mean;
        sumOfSquares += deviation * deviation;
    }
    int implicitZeros = size - values.length;
    sumOfSquares += implicitZeros * mean * mean;
    return sumOfSquares;
}
/**
* Returns an iterator over the active elements, in the order they are stored.
* @return A new iterator positioned at the first active element.
*/
@Override
public VectorIterator iterator() {
return new SparseVectorIterator(this);
}
/**
 * Iterator over the active elements of a {@link SparseVector}.
 * <p>
 * A single {@link VectorTuple} instance is reused: every call to {@link #next()} overwrites
 * its fields and returns the same object.
 */
private static class SparseVectorIterator implements VectorIterator {
    private final SparseVector vector;
    private final VectorTuple tuple = new VectorTuple();
    private int index = 0;

    /**
     * Creates an iterator positioned before the first active element.
     * @param vector The vector to iterate.
     */
    public SparseVectorIterator(SparseVector vector) {
        this.vector = vector;
    }

    @Override
    public boolean hasNext() {
        return index < vector.indices.length;
    }

    @Override
    public VectorTuple next() {
        if (hasNext()) {
            // Refill the shared tuple from the current position, then advance.
            tuple.index = vector.indices[index];
            tuple.value = vector.values[index];
            index++;
            return tuple;
        }
        throw new NoSuchElementException("Off the end of the iterator.");
    }

    @Override
    public VectorTuple getReference() {
        return tuple;
    }
}
/**
 * Transposes an array of sparse vectors from row-major to column-major or
 * vice versa.
 * <p>
 * The number of output vectors is taken from the dimension of the first input vector.
 * An empty input produces an empty output.
 *
 * @param input Input sparse vectors.
 * @return A column-major array of SparseVectors.
 */
public static SparseVector[] transpose(SparseVector[] input) {
    if (input.length == 0) {
        // Nothing to read the second dimension from.
        return new SparseVector[0];
    }
    int firstDimension = input.length;
    int secondDimension = input[0].size;
    List<List<Integer>> indices = new ArrayList<>();
    List<List<Double>> values = new ArrayList<>();
    for (int i = 0; i < secondDimension; i++) {
        indices.add(new ArrayList<>());
        values.add(new ArrayList<>());
    }

    // Rows are visited in ascending order, so each column's index list is built sorted.
    for (int i = 0; i < firstDimension; i++) {
        for (VectorTuple f : input[i]) {
            indices.get(f.index).add(i);
            values.get(f.index).add(f.value);
        }
    }

    SparseVector[] output = new SparseVector[secondDimension];
    for (int i = 0; i < secondDimension; i++) {
        output[i] = new SparseVector(firstDimension, Util.toPrimitiveInt(indices.get(i)), Util.toPrimitiveDouble(values.get(i)));
    }
    return output;
}
/**
* Converts a dataset of row-major examples into an array of column-major
* sparse vectors.
* @param dataset Input dataset.
* @param The type of the dataset.
* @return A column-major array of SparseVectors.
*/
public static > SparseVector[] transpose(Dataset dataset) {
ImmutableFeatureMap fMap = dataset.getFeatureIDMap();
return transpose(dataset,fMap);
}
/**
* Converts a dataset of row-major examples into an array of column-major
* sparse vectors.
* @param dataset Input dataset.
* @param fMap The feature map to use. If it's different to the feature map used by the dataset then behaviour is undefined.
* @param The type of the dataset.
* @return A column-major array of SparseVectors.
*/
public static > SparseVector[] transpose(Dataset dataset, ImmutableFeatureMap fMap) {
if (dataset.getFeatureMap().size() != fMap.size()) {
throw new IllegalArgumentException(
"The dataset's internal feature map and the supplied feature map have different sizes. dataset = "
+ dataset.getFeatureMap().size() + ", fMap = " + fMap.size());
}
int numExamples = dataset.size();
int numFeatures = fMap.size();
ArrayList> indices = new ArrayList<>();
ArrayList> values = new ArrayList<>();
for (int i = 0; i < numFeatures; i++) {
indices.add(new ArrayList<>());
values.add(new ArrayList<>());
}
int j = 0;
for (Example e : dataset) {
for (Feature f : e) {
int index = fMap.getID(f.getName());
indices.get(index).add(j);
values.get(index).add(f.getValue());
}
j++;
}
SparseVector[] output = new SparseVector[numFeatures];
for (int i = 0; i < fMap.size(); i++) {
output[i] = new SparseVector(numExamples,Util.toPrimitiveInt(indices.get(i)),Util.toPrimitiveDouble(values.get(i)));
}
return output;
}
}