All downloads are free. Search and download functionality uses the official Maven repository.

org.campagnelab.dl.somatic.mappers.MagnitudeFeatures3 Maven / Gradle / Ivy

package org.campagnelab.dl.somatic.mappers;

import org.campagnelab.dl.framework.mappers.FeatureCalculator;
import org.campagnelab.dl.somatic.genotypes.BaseGenotypeCountFactory;
import org.campagnelab.dl.somatic.genotypes.GenotypeCountFactory;
import org.campagnelab.dl.varanalysis.protobuf.BaseInformationRecords;
import org.nd4j.linalg.api.ndarray.INDArray;

/**
 * This is a simple feature mapper. It is designed using information currently available in the Parquet file.
 * Each position has the following information:
 * 
 69033640	11	false
 * position=14521   referenceIndex=0       isMutated=false
 * sample 0 counts=[10, 0, 0, 63, 0, 4, 0, 1, 62, 0]
 * sample 1 counts=[2, 0, 0, 45, 0, 3, 0, 0, 68, 0]
 *
 * position=14521   referenceIndex=0       isMutated=true
 * sample 0 counts=[10, 0, 0, 63, 0, 4, 0, 1, 62, 0]
 * sample 1 counts=[2, 11, 0, 34, 0, 3, 12, 0, 56, 0]
 * 
*

* Using these data, we can map to features as follows: *

    *
  • Make the isMutated boolean the only label. This is not ideal: the net may tell us if the base is mutated, * but we will not know what the mutation is..
  • *
  • Concatenate the count integers and use these as features. The only way for the net to learn from these data is to count * the number of counts elements that has "enough" reads to call a genotype. If the genotype calls are more than two, then * the site is likely mutated, because most sites will be heterozygous at most. With these features, I expect the * net to have more trouble predicting mutations at homozygous sites, than at heterozygous sites. We'll see.
* Created by fac2003 on 5/21/16. * * @author Fabien Campagne, Remi Torracinta */ public class MagnitudeFeatures3 extends AbstractFeatureMapper implements FeatureCalculator { public MagnitudeFeatures3(){ } @Override public int numberOfFeatures() { // we need features for the normal sample and for the tumor sample: return AbstractFeatureMapper.MAX_GENOTYPES * 2 * 2 + 1; } public void prepareToNormalize(BaseInformationRecords.BaseInformationOrBuilder record, int indexOfRecord) { indices[0] = indexOfRecord; } @Override public int numberOfLabels() { return 2; } int[] indices = new int[]{0, 0}; @Override public void mapFeatures(BaseInformationRecords.BaseInformationOrBuilder record, INDArray inputs, int indexOfRecord) { indices[0] = indexOfRecord; prepareToNormalize(record, indexOfRecord); for (int featureIndex = 0; featureIndex < numberOfFeatures(); featureIndex++) { indices[1] = featureIndex; inputs.putScalar(indices, produceFeature(record, featureIndex)); } } public float produceFeature(BaseInformationRecords.BaseInformationOrBuilder record, int featureIndex) { return normalize(produceFeatureInternal(record, featureIndex),1); } @Override public void mapLabels(BaseInformationRecords.BaseInformationOrBuilder record, INDArray labels, int indexOfRecord) { indices[0] = indexOfRecord; for (int labelIndex = 0; labelIndex < numberOfLabels(); labelIndex++) { indices[1] = labelIndex; labels.putScalar(indices, produceLabel(record, labelIndex)); } } private float normalize(float value, int normalizationFactor) { float normalized = 1 / (float) (value + 1); assert normalized >= 0 && normalized <= 1 : "value must be normalized: " + normalized; return normalized; } public float produceFeatureInternal(BaseInformationRecords.BaseInformationOrBuilder record, int featureIndex) { assert (featureIndex >= 0 && featureIndex < AbstractFeatureMapper.MAX_GENOTYPES * 2 * 2 + 1) : "Only MAX_GENOTYPES*2*2 + 1 features"; if (featureIndex == AbstractFeatureMapper.MAX_GENOTYPES * 2 * 2){ int sum = 0; 
for (GenotypeCount baseCount : getAllCounts(record,false,true) ){ sum += baseCount.totalCount(); } for (GenotypeCount baseCount : getAllCounts(record,true,true) ){ sum += baseCount.totalCount(); } return sum; } if (featureIndex < AbstractFeatureMapper.MAX_GENOTYPES * 2) { // germline counts written first: if ((featureIndex % 2) == 1) { // odd featureIndices are forward strand: return getAllCounts(record, false, true).get(featureIndex / 2).forwardCount; } else { return getAllCounts(record, false, true).get(featureIndex / 2).reverseCount; } } else { // tumor counts written next: featureIndex -= AbstractFeatureMapper.MAX_GENOTYPES * 2; if ((featureIndex % 2) == 1) { // odd featureIndices are forward strand: return getAllCounts(record, true, true).get(featureIndex / 2).forwardCount; } else { return getAllCounts(record, true, true).get(featureIndex / 2).reverseCount; } } } @Override public float produceLabel(BaseInformationRecords.BaseInformationOrBuilder record, int labelIndex) { assert labelIndex == 0 || labelIndex == 1 : "only one label."; //return record.getMutated() ? 1.0f : 0.0f; if (labelIndex == 0) return record.getMutated() ? 1 : 0; else { return !record.getMutated() ? 1 : 0; } } @Override protected void initializeCount(BaseInformationRecords.CountInfo sampleCounts, GenotypeCount count) { // nothing to do, already done in the base class. } @Override protected GenotypeCountFactory getGenotypeCountFactory() { return new BaseGenotypeCountFactory(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy