All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.samples.SampleDBBuilder Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils.samples;

import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Class for creating a temporary in memory database of samples.
 */
public class SampleDBBuilder {
    private final PedigreeValidationType validationStrictness;
    private final SampleDB sampleDB = new SampleDB();

    private final Set samplesFromDataSources = new LinkedHashSet<>();
    private final Set samplesFromPedigrees = new LinkedHashSet<>();

    public SampleDBBuilder(PedigreeValidationType validationStrictness) {
        this.validationStrictness = validationStrictness;
    }

    public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) {
        for (final GATKPath pedFile : pedigreeFiles) {
            Collection samples = addSamplesFromPedigreeArgument(pedFile);
            samplesFromPedigrees.addAll(samples);
        }

        return this;
    }

    public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) {
        for (final String pedString : pedigreeStrings) {
            Collection samples = addSamplesFromPedigreeArgument(pedString);
            samplesFromPedigrees.addAll(samples);
        }

        return this;
    }

    /**
     * Parse one sample file and integrate it with samples that are already there
     * Fail quickly if we find any errors in the file
     */
    private Collection addSamplesFromPedigreeArgument(GATKPath sampleFile) {
        try (final InputStream is = sampleFile.getInputStream();
             final InputStreamReader isr = new InputStreamReader(is)) {
            return new PedReader().parse(isr, getMissingFields(sampleFile), sampleDB);
        } catch (IOException e) {
            throw new UserException.CouldNotReadInputFile(sampleFile, "Could not read sample file", e);
        }
    }

    /**
     * Integrates the collection of sample names with the samples already present
     */
    public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) {
        Utils.nonNull(sampleNames);
        for (final String sampleName : sampleNames) {
            if (sampleDB.getSample(sampleName) == null) {
                final Sample newSample = new Sample(sampleName, null, null, null, Sex.UNKNOWN);
                sampleDB.addSample(newSample);
                samplesFromDataSources.add(newSample); // keep track of data source samples
            }
        }
        return this;
    }

    private Collection addSamplesFromPedigreeArgument(final String string) {
        final PedReader reader = new PedReader();
        return reader.parse(string, getMissingFields(string), sampleDB);
    }

    public SampleDB getFinalSampleDB() {
        validate();
        return sampleDB;
    }

    private EnumSet getMissingFields(final Object engineArg) {
        return EnumSet.noneOf(PedReader.MissingPedField.class);
    }

    // --------------------------------------------------------------------------------
    //
    // Validation
    //
    // --------------------------------------------------------------------------------

    private void validate() {
        validatePedigreeIDUniqueness();
        if (validationStrictness != PedigreeValidationType.SILENT) {
            // check that samples in data sources are all annotated, if anything is annotated
            if (!samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty()) {
                final Set sampleNamesFromPedigrees = samplesFromPedigrees.stream().map(Sample::getID).collect(Collectors.toSet());

                for (final Sample dsSample : samplesFromDataSources)
                    if (!sampleNamesFromPedigrees.contains(dsSample.getID())) {
                        throw new UserException("Sample " + dsSample.getID()
                                + " found in data sources but not in pedigree files with STRICT pedigree validation");
                    }
            }
        }
    }

    private void validatePedigreeIDUniqueness() {
        final Set pedigreeIDs = samplesFromPedigrees.stream().map(Sample::getID).collect(Collectors.toSet());
        assert pedigreeIDs.size() == samplesFromPedigrees.size() :
                "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?";
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy