package org.broadinstitute.hellbender.engine;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentDefinition;
import org.broadinstitute.barclay.argparser.ClassFinder;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;

import java.io.File;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;


/**
 * Handles discovery of available codecs and Feature arguments, file format detection and codec selection,
 * and creation/management/querying of FeatureDataSources for each source of Features.
 *
 * At startup, walks the packages specified in {@link GATKConfig#codec_packages} in the config file to discover what codecs are available
 * to decode Feature-containing files.
 *
 * Then, given a tool instance, it discovers what FeatureInput argument fields are declared in the
 * tool's class hierarchy (and associated ArgumentCollections), and for each argument actually specified
 * by the user on the command line, determines the type of the file and the codec required to decode it,
 * creates a FeatureDataSource for that file, and adds it to a query-able resource pool.
 *
 * Clients can then call {@link #getFeatures(FeatureInput, SimpleInterval)} to query the data source for
 * a particular FeatureInput over a specific interval.
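 *
 * A minimal usage sketch (hypothetical names: "myTool" stands for a parsed tool instance and
 * "myVariants" for one of its {@code FeatureInput<VariantContext>} arguments):
 *
 * <pre>{@code
 * try ( final FeatureManager featureManager = new FeatureManager(myTool) ) {
 *     if ( ! featureManager.isEmpty() ) {
 *         final List<VCFHeader> variantHeaders = featureManager.getAllVariantHeaders();
 *         // ... query individual FeatureInputs via getFeatures() ...
 *     }
 * }
 * }</pre>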
 */
public final class FeatureManager implements AutoCloseable {
    private static final Logger logger = LogManager.getLogger(FeatureManager.class);

    /**
     * All codecs descend from this class
     */
    private static final Class<FeatureCodec> CODEC_BASE_CLASS = FeatureCodec.class;

    /**
     * The codec classes we locate when searching codec packages
     */
    private static final Set<Class<?>> DISCOVERED_CODECS;

    /**
     * Feature arguments in tools are of this type
     */
    private static final Class<FeatureInput> FEATURE_ARGUMENT_CLASS = FeatureInput.class;

    /**
     * At startup, walk through the packages in codec packages, and save any (concrete) FeatureCodecs discovered
     * in DISCOVERED_CODECS
     */
    static {

        // Get our configuration:
        final GATKConfig config = ConfigFactory.getInstance().getGATKConfig();

        final ClassFinder finder = new ClassFinder();
        for ( final String codecPackage : config.codec_packages() ) {
            finder.find(codecPackage, CODEC_BASE_CLASS);
        }
        // Exclude abstract classes and interfaces from the list of discovered codec classes
        DISCOVERED_CODECS = Collections.unmodifiableSet(finder.getConcreteClasses());
    }
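
    // Note on discovery (illustrative only; the package list is configuration, not enforced here):
    // with GATK's default config, codec_packages includes packages such as htsjdk.variant and
    // htsjdk.tribble, so concrete codecs like htsjdk.variant.vcf.VCFCodec end up in
    // DISCOVERED_CODECS and become candidates for automatic format detection in getCodecForFile().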

    /**
     * The simple class name of the tool instance containing the FeatureInput argument values that will form the basis of our
     * pool of FeatureDataSources
     */
    private final String toolInstanceSimpleClassName;

    /**
     * Mapping from FeatureInput argument to query-able FeatureDataSource for that source of Features
     */
    private final Map<FeatureInput<? extends Feature>, FeatureDataSource<? extends Feature>> featureSources;

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Uses the default
     * caching behavior of {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     */
    public FeatureManager( final CommandLineProgram toolInstance ) {
        this(toolInstance, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES);
    }

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
     * how much caching is performed by each {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     */
    public FeatureManager( final CommandLineProgram toolInstance, final int featureQueryLookahead ) {
        this(toolInstance, featureQueryLookahead, 0, 0);
    }


    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
     * how much caching is performed by each {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     *
     */
    public FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
        this(toolInstance, featureQueryLookahead, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, null);
    }

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
     * how much caching is performed by each {@link FeatureDataSource}.
     *  @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param gdbOptions settings for GenomicsDB to use when reading from a GenomicsDB workspace
     *
     */
    public FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions) {
        this.toolInstanceSimpleClassName = toolInstance.getClass().getSimpleName();
        this.featureSources = new LinkedHashMap<>();

        initializeFeatureSources(featureQueryLookahead, toolInstance, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, gdbOptions);
    }

    /**
     * Same as {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}, except used when the
     *  FeatureInputs (and associated types) are known.
     *
     *  This constructor should only be used in test code.
     *
     * @param featureInputsToTypeMap {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}.  Never {@code null}
     * @param toolInstanceName See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param featureQueryLookahead See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param reference {@link Path} to the reference genome, used to construct the {@link GenomicsDBOptions} for the backing data sources
     */
    @VisibleForTesting
    FeatureManager(final Map<FeatureInput<? extends Feature>, Class<? extends Feature>> featureInputsToTypeMap, final String toolInstanceName, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference) {

        Utils.nonNull(featureInputsToTypeMap);

        this.toolInstanceSimpleClassName = toolInstanceName;
        this.featureSources = new LinkedHashMap<>();
        featureInputsToTypeMap.forEach((k,v) -> addToFeatureSources(featureQueryLookahead, k, v, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference));
    }

    /**
     * Given our tool instance, discover all arguments of type FeatureInput (or Collections thereof), determine
     * the type of each Feature-containing file, and add a FeatureDataSource for each file to our query pool.
     *
     * @param featureQueryLookahead Set up each FeatureDataSource to cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private void initializeFeatureSources( final int featureQueryLookahead, final CommandLineProgram toolInstance, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions) {

        // Discover all arguments of type FeatureInput (or Collections thereof) in our tool's class hierarchy
        // (and associated ArgumentCollections). Arguments not specified by the user on the command line will
        // come back to us with a null FeatureInput.
        final List<Pair<ArgumentDefinition, FeatureInput>> featureArgumentValues =
                toolInstance.getCommandLineParser().gatherArgumentValuesOfType(FEATURE_ARGUMENT_CLASS);

        for ( final Pair<ArgumentDefinition, FeatureInput> featureArgument : featureArgumentValues ) {
            final FeatureInput<? extends Feature> featureInput = featureArgument.getValue();

            // Only create a data source for Feature arguments that were actually specified
            if ( featureInput != null ) {
                final Class<? extends Feature> featureType = getFeatureTypeForFeatureInputArgument(featureArgument.getKey());
                addToFeatureSources(featureQueryLookahead, featureInput, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
                        gdbOptions);
            }
        }
    }

    @SuppressWarnings({"unchecked", "rawtypes"})
    public void dumpAllFeatureCacheStats() {
        for ( final FeatureDataSource f : featureSources.values() ) {
            f.printCacheStats();
        }
    }

    void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput,
                             final Class<? extends Feature> featureType, final int cloudPrefetchBuffer,
                             final int cloudIndexPrefetchBuffer, final Path reference) {
        // Create a new FeatureDataSource for this file, and add it to our query pool
        featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, new GenomicsDBOptions(reference)));
    }

    /**
     * Create a feature data source for the given feature input and add it to our query pool.
     *
     * @param featureQueryLookahead look ahead this many bases during queries that produce cache misses
     * @param featureInput source of features
     * @param featureType class of features
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param genomicsDBOptions options and info for reading from a GenomicsDB
     *
     * Note: package-visible to enable access from the core walker classes
     * (but not actual tools, so it's not protected).
     */
    void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput,
                             final Class<? extends Feature> featureType, final int cloudPrefetchBuffer,
                             final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions) {
        // Create a new FeatureDataSource for this file, and add it to our query pool
        featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, genomicsDBOptions));
    }

    /**
     * Given an ArgumentDefinition for an argument known to be of type FeatureInput (or a Collection thereof), retrieves the type
     * parameter for the FeatureInput (e.g., for {@code FeatureInput<VariantContext>} or
     * {@code List<FeatureInput<VariantContext>>} this would be VariantContext).
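     *
     * For instance (a hypothetical tool field; the name is illustrative only):
     * <pre>{@code
     * private List<FeatureInput<VariantContext>> myVariants;  // declared with an @Argument annotation
     * }</pre>
     * Given the ArgumentDefinition for this field, this method would return VariantContext.class.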
     *
     * @param argDef an {@code ArgumentDefinition} for an argument known to be of type FeatureInput whose type parameter to retrieve
     * @return type parameter of the FeatureInput declaration represented by the given ArgumentDefinition
     */
    @SuppressWarnings("unchecked")
    static Class<? extends Feature> getFeatureTypeForFeatureInputArgument( final ArgumentDefinition argDef ) {
        final Type featureInputType = argDef.isCollection() ?
                                getNextTypeParameter((ParameterizedType)(argDef.getUnderlyingField().getGenericType())) :
                                argDef.getUnderlyingField().getGenericType();

        if ( ! (featureInputType instanceof ParameterizedType) ) {
            throw new GATKException(String.format("FeatureInput declaration for argument --%s lacks an explicit type parameter for the Feature type",
                                argDef.getUnderlyingField().getAnnotation(Argument.class).fullName()));
        }

        return (Class<? extends Feature>)getNextTypeParameter((ParameterizedType)featureInputType);
    }

    /**
     * Helper method for {@link #getFeatureTypeForFeatureInputArgument(ArgumentDefinition)} that "unpacks" a
     * parameterized type by one level of parameterization. E.g., given {@code List<FeatureInput<VariantContext>>},
     * this would return {@code FeatureInput<VariantContext>}.
     *
     * @param parameterizedType parameterized type to unpack
     * @return the type parameter of the given parameterized type
     */
    private static Type getNextTypeParameter( final ParameterizedType parameterizedType ) {
        final Type[] typeParameters = parameterizedType.getActualTypeArguments();
        if ( typeParameters.length != 1 ) {
            throw new GATKException("Found a FeatureInput declaration with multiple type parameters, which is not supported");
        }
        return typeParameters[0];
    }

    /**
     * Does this manager have no sources of Features to query?
     *
     * @return true if there are no Feature sources available to query, otherwise false
     */
    public boolean isEmpty() {
        return featureSources.isEmpty();
    }


    /**
     * This method finds and returns all of the variant headers from the feature sources.
     *
     * @return A list of all variant headers for features.
     */
    public List<VCFHeader> getAllVariantHeaders() {
        return featureSources.values().stream()
                .map(feature -> feature.getHeader())
                .filter(header -> header instanceof VCFHeader)
                .map(header -> (VCFHeader)header).collect(Collectors.toList());
    }

    /**
     * Returns the list of sequence dictionaries retrieved from the VCF headers of variant Feature inputs.
     * Note: this method returns an empty list if the variant inputs
     * happen not to have sequence dictionaries (since they are optional in the VCF format).
     */
    public List<SAMSequenceDictionary> getVariantSequenceDictionaries() {
        return getAllVariantHeaders()
                .stream().map(h -> h.getSequenceDictionary())
                .filter(dict -> dict != null)
                .collect(Collectors.toList());
    }

    /**
     * Returns the sequence dictionaries associated with all feature sources.
     * This method will return an empty List if none of the feature sources have dictionaries.
     */
    public List<SAMSequenceDictionary> getAllSequenceDictionaries() {
        return featureSources.values().stream().map(fs -> fs.getSequenceDictionary())
                .filter(dict -> dict != null)
                .collect(Collectors.toList());
    }

    /**
     * Given a FeatureInput argument field from our tool, queries the data source for that FeatureInput
     * over the specified interval, and returns a List of the Features overlapping that interval from
     * that data source.
     *
     * Will throw an exception if the provided FeatureInput did not come from the tool that this
     * FeatureManager was initialized with, or was not an @Argument-annotated field in the tool
     * (or parent classes).
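     *
     * Example (hypothetical names; any Locatable, e.g. a SimpleInterval, can serve as the query interval):
     * <pre>{@code
     * final List<VariantContext> variants =
     *         featureManager.getFeatures(myVariants, new SimpleInterval("20", 1000000, 2000000));
     * }</pre>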
     *
     * @param featureDescriptor FeatureInput argument from our tool representing the Feature source to query
     * @param interval interval to query over (returned Features will overlap this interval)
     * @param <T> type of Feature in the source represented by featureDescriptor
     * @return A List of all Features in the backing data source for the provided FeatureInput that overlap
     *         the provided interval (may be empty if there are none, but never null)
     */
    public <T extends Feature> List<T> getFeatures( final FeatureInput<T> featureDescriptor, final Locatable interval ) {
        final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);

        // No danger of a ClassCastException here, since we verified that the FeatureDataSource for this
        // FeatureInput will return Features of the expected type T when we first created the data source
        // in initializeFeatureSources()
        return dataSource.queryAndPrefetch(interval);
    }

    /**
     * Given a FeatureInput argument field from our tool, returns an iterator to its features starting
     * from the first one.
     * <p>
     * Warning!: calling this method a second time on the same {@link FeatureInput}
     * on the same FeatureManager instance will invalidate (close) the iterator returned from
     * the first call.
     * </p>
     * <p>
     * An exception will be thrown if the {@link FeatureInput} provided did not come from the tool that this
     * manager was initialized with, or was not an @Argument-annotated field in the tool
     * (or parent classes).
     * </p>
     *
     * @param featureDescriptor FeatureInput argument from our tool representing the Feature source to query
     * @param <T> type of Feature in the source represented by featureDescriptor
     * @return never {@code null}, an iterator over all the features in the backing data source.
     * @throws GATKException if the feature descriptor is not found in the manager or is {@code null}.
     */
    public <T extends Feature> Iterator<T> getFeatureIterator(final FeatureInput<T> featureDescriptor) {
        final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
        return dataSource.iterator();
    }

    /**
     * Get the header associated with a particular FeatureInput
     *
     * @param featureDescriptor the FeatureInput whose header we want to retrieve
     * @param <T> type of Feature in our FeatureInput
     * @return header for the provided FeatureInput
     */
    public <T extends Feature> Object getHeader( final FeatureInput<T> featureDescriptor ) {
        final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
        return dataSource.getHeader();
    }

    /**
     * Retrieve the data source for a particular FeatureInput. Throws an exception if the provided
     * FeatureInput is not among our discovered sources of Features.
     *
     * @param featureDescriptor FeatureInput whose data source to retrieve
     * @param <T> type of Feature in our FeatureInput
     * @return query-able data source for the provided FeatureInput, if it was found
     */
    private <T extends Feature> FeatureDataSource<T> lookupDataSource( final FeatureInput<T> featureDescriptor ) {
        @SuppressWarnings("unchecked")
        final FeatureDataSource<T> dataSource = (FeatureDataSource<T>)featureSources.get(featureDescriptor);

        // Make sure the provided FeatureInput actually came from our tool as an @Argument-annotated field
        if ( dataSource == null ) {
            throw new GATKException(String.format("FeatureInput %s not found in feature manager's database for tool %s. " +
                                                  "In order to be detected, FeatureInputs must be declared in the tool class " +
                                                  "itself, a superclass of the tool class, or an @ArgumentCollection declared " +
                                                  "in the tool class or a superclass. They must also be annotated as an @Argument.",
                                                  featureDescriptor.getName(), toolInstanceSimpleClassName));
        }

        return dataSource;
    }

    /**
     * Utility method that determines the correct codec to use to read Features from the provided file.
     *
     * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
     * in order to be considered as candidates for decoding the file.
     *
     * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
     * an unsupported format), or if more than one codec claims to be able to decode the file (this is
     * a configuration error on the codec authors' part).
     *
     * @param featurePath path for which to find the right codec
     * @return the codec suitable for decoding the provided file
     */
    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath ) {
        return getCodecForFile(featurePath, null);
    }

    /**
     * Utility method that determines the correct codec to use to read Features from the provided file,
     * optionally considering only codecs that produce a particular type of Feature.
     *
     * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
     * in order to be considered as candidates for decoding the file, and must produce
     * Features of the specified type if featureType is non-null.
     *
     * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
     * an unsupported format), or if more than one codec claims to be able to decode the file (this is
     * a configuration error on the codec authors' part).
     *
     * @param featurePath Path for which to find the right codec
     * @param featureType If specified, consider only codecs that produce Features of this type. May be null,
     *                    in which case all codecs are considered.
     * @return the codec suitable for decoding the provided file
     */
    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath, final Class<? extends Feature> featureType ) {
        // Make sure Path exists/is readable
        if ( ! Files.isReadable(featurePath) ) {
            throw new UserException.CouldNotReadInputFile(featurePath.toUri().toString());
        }

        // Gather all discovered codecs that claim to be able to decode the given file according to their
        // canDecode() methods
        final List<FeatureCodec<? extends Feature, ?>> candidateCodecs = getCandidateCodecsForFile(featurePath);

        // If no codecs can handle the file, it's a user error (the user provided a file in an unsupported format)
        if ( candidateCodecs.isEmpty() ) {
            throw new UserException.NoSuitableCodecs(featurePath);
        }

        // If featureType was specified, subset to only codecs that produce the requested type of Feature,
        // and throw an error if there are no such codecs.
        if ( featureType != null ) {
            final List<String> discoveredCodecsFeatureTypes = candidateCodecs.stream().map(codec -> codec.getFeatureType().getSimpleName()).collect(Collectors.toList());
            candidateCodecs.removeIf(codec -> ! featureType.isAssignableFrom(codec.getFeatureType()));

            if ( candidateCodecs.isEmpty() ) {
                throw new UserException.WrongFeatureType(featurePath, featureType, discoveredCodecsFeatureTypes);
            }
        }

        // If we still have multiple candidate codecs, it's a configuration error on the part of the codec authors
        if ( candidateCodecs.size() > 1 ) {
            final StringBuilder multiCodecMatches = new StringBuilder();
            for ( FeatureCodec<? extends Feature, ?> candidateCodec : candidateCodecs ) {
                multiCodecMatches.append(candidateCodec.getClass().getCanonicalName());
                multiCodecMatches.append(' ');
            }
            throw new GATKException("Multiple codecs found able to decode file " + featurePath.toAbsolutePath().toUri() +
                                    ". This indicates a misconfiguration on the part of the codec authors. " +
                                    "Matching codecs are: " + multiCodecMatches.toString());
        }

        final FeatureCodec<? extends Feature, ?> selectedCodec = candidateCodecs.get(0);
        logger.info("Using codec " + selectedCodec.getClass().getSimpleName() + " to read file " + featurePath.toAbsolutePath().toUri());
        return selectedCodec;
    }

    /**
     * Returns a List of all codecs in DISCOVERED_CODECS that claim to be able to decode the specified file
     * according to their {@link FeatureCodec#canDecode(String)} methods.
     *
     * @param featureFile file for which to find potential codecs
     * @return A List of all codecs in DISCOVERED_CODECS for which {@link FeatureCodec#canDecode(String)} returns true on the specified file
     */
    private static List<FeatureCodec<? extends Feature, ?>> getCandidateCodecsForFile( final Path featureFile ) {
        final List<FeatureCodec<? extends Feature, ?>> candidateCodecs = new ArrayList<>();

        for ( final Class<?> codecClass : DISCOVERED_CODECS ) {
            try {
                final FeatureCodec<? extends Feature, ?> codec = (FeatureCodec<? extends Feature, ?>)codecClass.getDeclaredConstructor().newInstance();
                if ( codec.canDecode(featureFile.toAbsolutePath().toUri().toString()) ) {
                    candidateCodecs.add(codec);
                }
            }
            catch ( InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e ) {
                throw new GATKException("Unable to automatically instantiate codec " + codecClass.getName());
            }
        }

        return candidateCodecs;
    }

    /**
     * @param file file to check
     * @return True if the file exists and contains Features (ie., we have a FeatureCodec that can decode it), otherwise false
     */
    public static boolean isFeatureFile( final Path file ) {
        return Files.exists(file) && ! getCandidateCodecsForFile(file).isEmpty();
    }

    /**
     * Permanently closes this manager by closing all backing data sources
     */
    @Override
    public void close() {
        featureSources.values().forEach(ds -> ds.close());
    }
}



