
// org.campagnelab.dl.somatic.mappers.ReadIndexFeaturesFix Maven / Gradle / Ivy
package org.campagnelab.dl.somatic.mappers;
import org.campagnelab.dl.framework.mappers.FeatureMapper;
import org.campagnelab.dl.somatic.genotypes.GenotypeCountFactory;
import org.campagnelab.dl.somatic.utils.ProtoPredictor;
import org.campagnelab.dl.somatic.genotypes.BaseGenotypeCountFactory;
import org.campagnelab.dl.varanalysis.protobuf.BaseInformationRecords;
import org.nd4j.linalg.api.ndarray.INDArray;
/**
 * Estimates the number of distinct read indices per base position.
 * <p>
 * Produces {@code NUM_SAMPLES * NUM_GENOTYPES} features per record. The first
 * {@link #MAX_GENOTYPES} features come from the counts returned by
 * {@code getAllCounts(record, false, true)} (labelled "germline" in the feature names),
 * the next {@link #MAX_GENOTYPES} from {@code getAllCounts(record, true, true)}
 * (labelled "somatic"). Each feature is {@code 1 / (distinctReadIndices + 1)}.
 * <p>
 * The {@link #indices} scratch array is an instance field that {@link #mapFeatures}
 * overwrites on every call, so a single instance must not be used from several
 * threads at once.
 * <p>
 * Created by fac2003 on 6/3/16.
 *
 * @author Fabien Campagne
 */
// NOTE(review): the supertypes are written without type arguments here. If
// AbstractFeatureMapper / FeatureMapper are generic in this project, the arguments
// seem to have been lost and should be restored -- confirm against their declarations.
public class ReadIndexFeaturesFix extends AbstractFeatureMapper
        implements FeatureMapper {

    /** Number of genotypes encoded per sample. */
    public static final int NUM_GENOTYPES = 5;

    /** Number of samples encoded per record. */
    public static final int NUM_SAMPLES = 2;

    /**
     * Maximum number of genotypes per sample; kept equal to {@link #NUM_GENOTYPES} so that
     * {@link #numberOfFeatures()} and the index checks below always agree.
     */
    public static final int MAX_GENOTYPES = NUM_GENOTYPES;

    /**
     * Normalization factor handed to {@link #normalize(float, int)}. It is not assigned
     * anywhere in this class and the current normalization ignores it.
     */
    int sumReadIndex;

    /**
     * Scratch {row, column} coordinates reused by {@link #mapFeatures} to avoid allocating
     * an array for every scalar written.
     */
    int[] indices = new int[]{0, 0};

    /**
     * @return the number of features written per record: one per genotype and per sample.
     */
    @Override
    public int numberOfFeatures() {
        // only one feature per genotype: the number of distinct read indices in each sample. 5 genotypes max.
        return NUM_SAMPLES * NUM_GENOTYPES;
    }

    /**
     * Normalization is done by taking the inverse of the number of distinct read indices,
     * which needs no per-record preparation, so this method intentionally does nothing.
     * (An earlier variant summed all features of the record into {@link #sumReadIndex} here.)
     *
     * @param record        The record to convert to features &amp; labels.
     * @param indexOfRecord Index of the record in the destination dataset.
     */
    public void prepareToNormalize(BaseInformationRecords.BaseInformationOrBuilder record, int indexOfRecord) {
        // Intentionally empty: see the method documentation.
    }

    /**
     * Writes every feature of {@code record} into row {@code indexOfRecord} of {@code inputs}.
     *
     * @param record        The record to convert to features.
     * @param inputs        Destination array; written at {indexOfRecord, featureIndex}.
     * @param indexOfRecord Index of the record in the destination dataset.
     */
    @Override
    public void mapFeatures(BaseInformationRecords.BaseInformationOrBuilder record, INDArray inputs, int indexOfRecord) {
        indices[0] = indexOfRecord;
        prepareToNormalize(record, indexOfRecord);
        final int featureCount = numberOfFeatures();
        for (int featureIndex = 0; featureIndex < featureCount; featureIndex++) {
            indices[1] = featureIndex;
            inputs.putScalar(indices, produceFeature(record, featureIndex));
        }
    }

    /**
     * Produces the normalized value of one feature.
     *
     * @param record       The record to read counts from.
     * @param featureIndex Index of the feature, in {@code [0, MAX_GENOTYPES * 2)}.
     * @return {@code 1 / (n + 1)} where {@code n} is the raw value from
     * {@link #produceFeatureInternal}.
     */
    public float produceFeature(BaseInformationRecords.BaseInformationOrBuilder record, int featureIndex) {
        return normalize(produceFeatureInternal(record, featureIndex), sumReadIndex);
    }

    /**
     * Returns a readable name for a feature: germline features first, then somatic ones,
     * each suffixed with the genotype index within its sample.
     *
     * @param featureIndex Index of the feature, in {@code [0, MAX_GENOTYPES * 2)}.
     * @return the feature name.
     */
    @Override
    public String getFeatureName(int featureIndex) {
        assert featureIndex >= 0 && featureIndex < MAX_GENOTYPES * 2 : "Only MAX_GENOTYPES*2 features";
        if (featureIndex < MAX_GENOTYPES) {
            return "numReadIdxsGermlineCount" + featureIndex;
        }
        return "numReadIdxsSomaticCount" + (featureIndex - MAX_GENOTYPES);
    }

    /**
     * Maps a raw count to the inverse of (count + 1).
     *
     * @param value               Raw number of distinct read indices.
     * @param normalizationFactor Unused; kept from an earlier variant that divided
     *                            {@code value} by this factor.
     * @return {@code 1 / (value + 1)}.
     */
    private float normalize(float value, int normalizationFactor) {
        // produce a value between zero and 1, increasingly smaller as the number of distinct read indices increases.
        // plus one is to prevent division by zero for genotypes without any read.
        return 1f / (value + 1);
    }

    /**
     * Produces the raw (un-normalized) value of one feature: the number of distinct read
     * indices of the genotype the feature refers to.
     *
     * @param record       The record to read counts from.
     * @param featureIndex Index of the feature, in {@code [0, MAX_GENOTYPES * 2)}.
     * @return the number of distinct read indices for the selected sample and genotype.
     */
    public float produceFeatureInternal(BaseInformationRecords.BaseInformationOrBuilder record, int featureIndex) {
        assert featureIndex >= 0 && featureIndex < MAX_GENOTYPES * 2 : "Only MAX_GENOTYPES*2 features";
        // germline counts are written first, tumor counts next:
        final boolean isTumor = featureIndex >= MAX_GENOTYPES;
        final int genotypeIndex = isTumor ? featureIndex - MAX_GENOTYPES : featureIndex;
        final ReadIndexWithCounts genotypeCount =
                (ReadIndexWithCounts) getAllCounts(record, isTumor, true).get(genotypeIndex);
        return genotypeCount.getDistinctReadIndices();
    }

    /**
     * Tells whether at least one of the samples is flagged as tumor.
     *
     * @param samples Samples to inspect.
     * @return true if {@code getIsTumor()} is true for any sample, false otherwise
     * (including for an empty list).
     */
    public boolean oneSampleHasTumor(java.util.List<BaseInformationRecords.SampleInfo> samples) {
        for (BaseInformationRecords.SampleInfo sample : samples) {
            if (sample.getIsTumor()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Fills {@code count} with the forward and reverse strand read indices of
     * {@code sampleCounts}, each passed through {@code ProtoPredictor.expandFreq}.
     *
     * @param sampleCounts Source counts for one genotype of one sample.
     * @param count        Destination; must be a {@link ReadIndexWithCounts}, as created by
     *                     {@link #getGenotypeCountFactory()}.
     */
    @Override
    protected void initializeCount(BaseInformationRecords.CountInfo sampleCounts, GenotypeCount count) {
        ReadIndexWithCounts myCounts = (ReadIndexWithCounts) count;
        myCounts.set(ProtoPredictor.expandFreq(sampleCounts.getReadIndicesForwardStrandList()),
                ProtoPredictor.expandFreq(sampleCounts.getReadIndicesReverseStrandList()));
    }

    /**
     * @return a factory whose {@code create()} returns a new {@link ReadIndexWithCounts}.
     */
    @Override
    protected GenotypeCountFactory getGenotypeCountFactory() {
        return new BaseGenotypeCountFactory() {
            @Override
            public GenotypeCount create() {
                return new ReadIndexWithCounts();
            }
        };
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy