All downloads are free. The search and download functionality uses the official Maven repository.

org.broadinstitute.hellbender.tools.genomicsdb.SampleNameMap Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.genomicsdb;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

/**
 * A class to hold the mappings of sample names to VCF / VCF index paths. Used by GenomicsDBImport.
 *
 * This class can be constructed from a textual file containing lines in the format:
 *
 * Sample\tVCF
 * or:
 * Sample\tVCF\tIndex
 *
 * The sample names may have internal whitespace, but not leading/trailing whitespace.
 * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
 *
 * The third Index column is optional. It is permitted to specify the index for some samples
 * and not others. If an index is not specified for a sample, its location is inferred from
 * the VCF URI.
 *
 * It is also possible to construct an empty SampleNameMap using the no-arg constructor, and
 * add sample mappings one at a time using addSample().
 */
public final class SampleNameMap {
    // Sorted mapping between sample names and corresponding GVCF file name
    //
    // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database.
    // This happens because the callset json is generated independently from the import process:
    // each imported batch is then sorted, so if we have an unsorted list we'll end up with different
    // global vs batch sorting.
    // We preemptively sort here so we will have consistent sorting.
    private final SortedMap<String, URI> sampleNameToVcfPath;

    // Mapping between sample names and corresponding VCF index path
    //
    // This Map contains only indices specified explicitly (via the sample name map file or
    // addSample()). If an explicit index is not specified for a given sample, it will not have
    // an entry in this Map, and the index path will be automatically inferred based on
    // the location of the VCF.
    //
    // The ordering of the entries in this Map does not actually matter, since it's not
    // directly exposed, and is used only for individual lookups via getVCFIndexForSample()
    private final SortedMap<String, URI> sampleNameToVcfIndexPath;

    /**
     * Create an empty SampleNameMap. Samples can be added later using addSample()
     */
    public SampleNameMap() {
        sampleNameToVcfPath = new TreeMap<>();
        sampleNameToVcfIndexPath = new TreeMap<>();
    }

    /**
     * Create a SampleNameMap from a textual file containing the sample mappings, without
     * checking that each VCF is compressed and indexed. The lines in this file must be in
     * the format:
     *
     * Sample\tVCF
     * or:
     * Sample\tVCF\tIndex
     *
     * The sample names may have internal whitespace, but not leading/trailing whitespace.
     * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
     *
     * The third Index column is optional. It is permitted to specify the index for some samples
     * and not others. If an index is not specified for a sample, its location is inferred from
     * the VCF URI.
     *
     * @param sampleMapFilePath Path to the file containing the sample name mappings to load
     */
    public SampleNameMap(final Path sampleMapFilePath) {
        this(sampleMapFilePath, false);
    }

    /**
     * Create a SampleNameMap from a textual file containing the sample mappings. The
     * lines in this file must be in the format:
     *
     * SampleName1\tVCF
     * or:
     * SampleName1\tVCF\tIndex
     *
     * The sample names may have internal whitespace, but not leading/trailing whitespace.
     * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
     *
     * The third Index column is optional. It is permitted to specify the index for some samples
     * and not others. If an index is not specified for a sample, its location is inferred from
     * the VCF URI.
     *
     * @param sampleMapFilePath Path to the file containing the sample name mappings to load
     * @param checkVcfIsCompressedAndIndexed If true, check each VCF to make sure it's compressed and indexed
     */
    public SampleNameMap(final Path sampleMapFilePath, final boolean checkVcfIsCompressedAndIndexed) {
        sampleNameToVcfPath = new TreeMap<>();
        sampleNameToVcfIndexPath = new TreeMap<>();

        loadSampleNameMapFile(sampleMapFilePath, checkVcfIsCompressedAndIndexed);
    }

    /**
     * Reads the sample name map file and populates both internal maps from it.
     *
     * Every line must split on tab into exactly 2 or 3 fields. The first field is the sample
     * name (used verbatim), the second the VCF URI (trimmed), and the optional third the index
     * URI (trimmed). Throws a UserException.BadInput if the file has no lines, a line has the
     * wrong number of fields, a sample name is illegal, a VCF or index field is blank, or a
     * sample appears more than once.
     *
     * @param sampleToFileMapPath Path to the sample name map file
     * @param checkVcfIsCompressedAndIndexed If true, each VCF (and its explicit index, if any) is passed to
     *                                       GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed()
     */
    private void loadSampleNameMapFile(final Path sampleToFileMapPath, final boolean checkVcfIsCompressedAndIndexed) {
        try {
            final List<String> lines = Files.readAllLines(sampleToFileMapPath);
            if (lines.isEmpty()) {
                throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file");
            }

            for (final String line : lines) {
                // A limit of -1 keeps trailing empty fields, so a trailing tab is reported as a
                // blank field rather than being silently dropped.
                final String[] split = line.split("\\t",-1);
                if (split.length != 2 && split.length != 3) {
                    throw new UserException.BadInput("Sample name map file must have 2 or 3 fields per line in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\nbut found line: \""
                            + line +"\" with "+split.length+" fields");
                }
                if ( ! sampleNameIsLegal(split[0]) || split[1].trim().isEmpty()) {
                    throw new UserException.BadInput("Sample name map file must have lines in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace");
                }
                final String sample = split[0];
                final String vcfPath = split[1].trim();

                // Stays null when the line has no third column; a present-but-blank index is an error.
                String vcfIndexPath = null;
                if ( split.length == 3 ) {
                    vcfIndexPath = split[2].trim();

                    if ( vcfIndexPath.isEmpty() ) {
                        throw new UserException.BadInput("Found a line in the sample name map file with an empty or all-whitespace value for the index:\n" + "\"" + line + "\"");
                    }
                }

                try {
                    final URI existingVCFPath = sampleNameToVcfPath.put(sample, new URI(vcfPath));
                    if (existingVCFPath != null){
                        throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + vcfPath + "\n" + existingVCFPath);
                    }

                    if ( vcfIndexPath != null ) {
                        final URI existingVCFIndexPath = sampleNameToVcfIndexPath.put(sample, new URI(vcfIndexPath));
                        if (existingVCFIndexPath != null) {
                            throw new UserException.BadInput("Found two indices for the same sample: " + sample + "\n" + vcfIndexPath + "\n" + existingVCFIndexPath);
                        }
                    }

                    if (checkVcfIsCompressedAndIndexed) {
                        GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(vcfPath), vcfIndexPath == null ? null : IOUtils.getPath(vcfIndexPath));
                    }
                }
                catch(final URISyntaxException e) {
                    throw new UserException("Malformed URI: " + e.toString());
                }
            }
        } catch (final IOException e) {
            throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file",  e);
        }
    }

    /**
     * Tests whether the sample name is legal. Sample names must be non-empty, and
     * may have internal whitespace but not leading/trailing whitespace.
     *
     * @param sampleName sample name to test (may be null, in which case it is not legal)
     * @return true if sampleName is legal, otherwise false
     */
    private static boolean sampleNameIsLegal(final String sampleName) {
        if (sampleName == null) {
            return false;
        }
        final String trimmed = sampleName.trim();
        return ! trimmed.isEmpty() && trimmed.equals(sampleName);
    }

    /**
     * Add a new sample mapping with no explicit index
     *
     * @param sampleName name of the sample
     * @param vcfPath path to the VCF for the sample
     */
    public void addSample(final String sampleName, final URI vcfPath) {
        addSample(sampleName, vcfPath, null);
    }

    /**
     * Add a new sample mapping.
     *
     * All validation happens before either map is modified, so a rejected call leaves
     * this SampleNameMap unchanged.
     *
     * @param sampleName name of the sample
     * @param vcfPath path to the VCF for the sample (not null)
     * @param vcfIndexPath path to the index for the sample (may be null)
     * @throws UserException.BadInput if the sample name is not legal, vcfPath is null, or the
     *                                sample already has a VCF or index mapping
     */
    public void addSample(final String sampleName, final URI vcfPath, final URI vcfIndexPath) {
        if ( ! sampleNameIsLegal(sampleName) ) {
            throw new UserException.BadInput("Sample name " + sampleName + " is not legal. Sample names must be non-empty and not contain leading or trailing whitespace");
        }
        if ( vcfPath == null ) {
            throw new UserException.BadInput("VCF path for sample " + sampleName + " was null");
        }

        // Neither map ever stores a null value, so a null lookup result means "no existing entry".
        final URI previousPath = sampleNameToVcfPath.get(sampleName);
        if (previousPath != null) {
            throw new UserException.BadInput("Duplicate sample: " + sampleName + ". Sample was found in both "
                    + vcfPath + " and " + previousPath + ".");
        }

        if (vcfIndexPath != null) {
            final URI previousIndexPath = sampleNameToVcfIndexPath.get(sampleName);
            if (previousIndexPath != null) {
                throw new UserException.BadInput("For sample " + sampleName + ", attempted to specify multiple indices: " + vcfIndexPath + " and " + previousIndexPath);
            }
        }

        // Only mutate once every check has passed.
        sampleNameToVcfPath.put(sampleName, vcfPath);
        if (vcfIndexPath != null) {
            sampleNameToVcfIndexPath.put(sampleName, vcfIndexPath);
        }
    }

    /**
     * Note that this returns the internal map itself, not a copy.
     *
     * @return The full mapping of sample names -> VCF paths, with the sample names in sorted order
     */
    public SortedMap<String, URI> getSampleNameToVcfPath() {
        return sampleNameToVcfPath;
    }

    /**
     * @param sample sample name
     * @return the VCF associated with that sample name, as a URI, or null if the sample is not in this Map
     */
    public URI getVCFForSample(final String sample) {
        return sampleNameToVcfPath.get(sample);
    }

    /**
     * @param sample sample name
     * @return the VCF associated with that sample name, as a Path, or null if the sample is not in this Map
     */
    public Path getVCFForSampleAsPath(final String sample) {
        final URI vcfURI = sampleNameToVcfPath.get(sample);
        return vcfURI == null ? null : IOUtils.getPath(vcfURI.toString());
    }

    /**
     * @param sample sample name
     * @return the VCF index associated with that sample name, as a URI, or null if no index
     */
    public URI getVCFIndexForSample(final String sample) {
        return sampleNameToVcfIndexPath.get(sample);
    }

    /**
     * @param sample sample name
     * @return the VCF index associated with that sample name, as a Path, or null if no index
     */
    public Path getVCFIndexForSampleAsPath(final String sample) {
        final URI vcfIndexURI = sampleNameToVcfIndexPath.get(sample);
        return vcfIndexURI == null ? null : IOUtils.getPath(vcfIndexURI.toString());
    }

    /**
     * @return number of samples in this Map
     */
    public int getNumSamples() {
        return sampleNameToVcfPath.size();
    }

    /**
     * @return a new List of the sample names in this Map in sorted order
     */
    public List<String> getSampleNamesInSortedOrder() {
        return new ArrayList<>(sampleNameToVcfPath.keySet());
    }

    /**
     * @return true if an index was specified for at least one sample, otherwise false
     */
    public boolean indicesSpecified() {
        return ! sampleNameToVcfIndexPath.isEmpty();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy