com.expleague.ml.data.tools.FeaturesTxtPool Maven / Gradle / Ivy
package com.expleague.ml.data.tools;
import com.expleague.commons.math.vectors.Mx;
import com.expleague.commons.math.vectors.Vec;
import com.expleague.commons.math.vectors.VecTools;
import com.expleague.commons.seq.Seq;
import com.expleague.ml.data.set.VecDataSet;
import com.expleague.ml.meta.FeatureMeta;
import com.expleague.ml.meta.PoolFeatureMeta;
import com.expleague.ml.meta.TargetMeta;
import com.expleague.ml.meta.items.QURLItem;
import com.expleague.commons.util.Pair;
import com.expleague.ml.Vectorization;
import com.expleague.ml.data.set.DataSet;
import com.expleague.ml.data.set.impl.VecDataSetImpl;
import com.expleague.ml.meta.DataSetMeta;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
* User: solar
* Date: 07.07.14
* Time: 20:55
*/
// TODO: Why FeaturesTxtPool duplicates FakePool?
public class FeaturesTxtPool extends Pool {
private final Mx data;
public FeaturesTxtPool(final String file, final Seq items, final Mx data, final Vec target) {
super(new FakeDataSetMeta(file), items, genFakeFeatures(data), new Pair[]{Pair.create(new FakeTargetMeta(), target)});
this.data = data;
for (int i = 0; i < features.length; i++) {
final Pair extends FeatureMeta, ? extends Seq>> feature = features[i];
((FakeFeatureMeta)feature.first).owner = this;
}
((FakeTargetMeta)this.targets.get(0).first).owner = this;
((FakeDataSetMeta)meta).owner = this;
}
private static Pair[] genFakeFeatures(final Mx data) {
final List> features = new ArrayList<>();
for (int i = 0; i < data.columns(); i++) {
final int finalI = i;
final PoolFeatureMeta.ValueType type = VecTools.isSparse(data.col(i), 0.1) ? PoolFeatureMeta.ValueType.SPARSE_VEC : PoolFeatureMeta.ValueType.VEC;
features.add(Pair.create(new FakeFeatureMeta(finalI, type), type == PoolFeatureMeta.ValueType.VEC ? data.col(i) : VecTools.copySparse(data.col(i))));
}
//noinspection unchecked
return features.toArray(new Pair[features.size()]);
}
//dataset is immutable
private VecDataSet lazyVecData = null;
@Override
public VecDataSet vecData() {
if (lazyVecData == null) {
final DataSet ds = data();
lazyVecData = new VecDataSetImpl(ds, data, new Vectorization() {
@Override
public Vec value(final QURLItem subject) {
return data.row(ds.index(subject));
}
@Override
public FeatureMeta meta(final int findex) {
return features[findex].first;
}
@Override
public int dim() {
return data.columns();
}
});
}
return lazyVecData;
}
@Override
public boolean equals(final Object obj) {
if (obj == this)
return true;
if (obj == null || obj.getClass() != getClass())
return false;
final FeaturesTxtPool other = (FeaturesTxtPool) obj;
return new EqualsBuilder().appendSuper(super.equals(obj)).append(data, other.data).isEquals();
}
@Override
public int hashCode() {
return new HashCodeBuilder().appendSuper(super.hashCode()).append(data).toHashCode();
}
private static class FakeFeatureMeta implements PoolFeatureMeta {
private final int finalI;
private final ValueType type;
private Pool owner;
public FakeFeatureMeta(final int finalI, final ValueType type) {
this.finalI = finalI;
this.type = type;
}
@Override
public String id() {
return "Fake-" + finalI;
}
@Override
public String description() {
return "Fake feature from features.txt format #" + finalI;
}
@Override
public ValueType type() {
return type;
}
@Override
public DataSet associated() {
return owner.data();
}
}
private static class FakeTargetMeta implements TargetMeta {
private Pool owner;
@Override
public String id() {
return "whoknowsthefakeid";
}
@Override
public String description() {
return "fake relevance marks";
}
@Override
public ValueType type() {
return ValueType.VEC;
}
@Override
public DataSet associated() {
return owner.data();
}
}
private static class FakeDataSetMeta implements DataSetMeta {
private final String file;
protected Date creationDate;
private Pool owner;
public FakeDataSetMeta(final String file) {
this.file = file;
creationDate = new Date(0);
}
@Override
public String id() {
return "qurls";
}
@Override
public String source() {
return "/dev/random";
}
@Override
public String author() {
return "/dev/null";
}
@Override
public Pool owner() {
return owner;
}
@Override
public Class type() { return QURLItem.class; }
@Override
public Date created() {
return creationDate;
}
}
}