
org.broadinstitute.hellbender.utils.samples.SampleDBBuilder Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.utils.samples;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
/**
* Class for creating a temporary in memory database of samples.
*/
public class SampleDBBuilder {
private final PedigreeValidationType validationStrictness;
private final SampleDB sampleDB = new SampleDB();
private final Set samplesFromDataSources = new LinkedHashSet<>();
private final Set samplesFromPedigrees = new LinkedHashSet<>();
public SampleDBBuilder(PedigreeValidationType validationStrictness) {
this.validationStrictness = validationStrictness;
}
public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) {
for (final GATKPath pedFile : pedigreeFiles) {
Collection samples = addSamplesFromPedigreeArgument(pedFile);
samplesFromPedigrees.addAll(samples);
}
return this;
}
public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) {
for (final String pedString : pedigreeStrings) {
Collection samples = addSamplesFromPedigreeArgument(pedString);
samplesFromPedigrees.addAll(samples);
}
return this;
}
/**
* Parse one sample file and integrate it with samples that are already there
* Fail quickly if we find any errors in the file
*/
private Collection addSamplesFromPedigreeArgument(GATKPath sampleFile) {
try (final InputStream is = sampleFile.getInputStream();
final InputStreamReader isr = new InputStreamReader(is)) {
return new PedReader().parse(isr, getMissingFields(sampleFile), sampleDB);
} catch (IOException e) {
throw new UserException.CouldNotReadInputFile(sampleFile, "Could not read sample file", e);
}
}
/**
* Integrates the collection of sample names with the samples already present
*/
public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) {
Utils.nonNull(sampleNames);
for (final String sampleName : sampleNames) {
if (sampleDB.getSample(sampleName) == null) {
final Sample newSample = new Sample(sampleName, null, null, null, Sex.UNKNOWN);
sampleDB.addSample(newSample);
samplesFromDataSources.add(newSample); // keep track of data source samples
}
}
return this;
}
private Collection addSamplesFromPedigreeArgument(final String string) {
final PedReader reader = new PedReader();
return reader.parse(string, getMissingFields(string), sampleDB);
}
public SampleDB getFinalSampleDB() {
validate();
return sampleDB;
}
private EnumSet getMissingFields(final Object engineArg) {
return EnumSet.noneOf(PedReader.MissingPedField.class);
}
// --------------------------------------------------------------------------------
//
// Validation
//
// --------------------------------------------------------------------------------
private void validate() {
validatePedigreeIDUniqueness();
if (validationStrictness != PedigreeValidationType.SILENT) {
// check that samples in data sources are all annotated, if anything is annotated
if (!samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty()) {
final Set sampleNamesFromPedigrees = samplesFromPedigrees.stream().map(Sample::getID).collect(Collectors.toSet());
for (final Sample dsSample : samplesFromDataSources)
if (!sampleNamesFromPedigrees.contains(dsSample.getID())) {
throw new UserException("Sample " + dsSample.getID()
+ " found in data sources but not in pedigree files with STRICT pedigree validation");
}
}
}
}
private void validatePedigreeIDUniqueness() {
final Set pedigreeIDs = samplesFromPedigrees.stream().map(Sample::getID).collect(Collectors.toSet());
assert pedigreeIDs.size() == samplesFromPedigrees.size() :
"The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?";
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy