// Retrieved from the Maven repository:
// de.charite.compbio.jannovar.cmd.annotate_vcf.AnnotateVCFCommand (Maven / Gradle / Ivy)

package de.charite.compbio.jannovar.cmd.annotate_vcf;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import de.charite.compbio.jannovar.Jannovar;
import de.charite.compbio.jannovar.JannovarException;
import de.charite.compbio.jannovar.cmd.CommandLineParsingException;
import de.charite.compbio.jannovar.cmd.JannovarAnnotationCommand;
import de.charite.compbio.jannovar.cmd.annotate_vcf.JannovarAnnotateVCFOptions.BedAnnotationOptions;
import de.charite.compbio.jannovar.filter.facade.*;
import de.charite.compbio.jannovar.filter.impl.var.VariantThresholdFilterAnnotator;
import de.charite.compbio.jannovar.hgvs.AminoAcidCode;
import de.charite.compbio.jannovar.htsjdk.VariantContextAnnotator;
import de.charite.compbio.jannovar.htsjdk.VariantContextWriterConstructionHelper;
import de.charite.compbio.jannovar.htsjdk.VariantEffectHeaderExtender;
import de.charite.compbio.jannovar.mendel.IncompatiblePedigreeException;
import de.charite.compbio.jannovar.mendel.bridge.MendelVCFHeaderExtender;
import de.charite.compbio.jannovar.mendel.filter.*;
import de.charite.compbio.jannovar.pedigree.*;
import de.charite.compbio.jannovar.progress.GenomeRegionListFactoryFromSAMSequenceDictionary;
import de.charite.compbio.jannovar.progress.ProgressReporter;
import de.charite.compbio.jannovar.vardbs.base.DBAnnotationOptions;
import de.charite.compbio.jannovar.vardbs.base.DBAnnotationOptions.MultipleMatchBehaviour;
import de.charite.compbio.jannovar.vardbs.facade.DBVariantContextAnnotator;
import de.charite.compbio.jannovar.vardbs.facade.DBVariantContextAnnotatorFactory;
import de.charite.compbio.jannovar.vardbs.generic_tsv.GenericTSVAnnotationDriver;
import de.charite.compbio.jannovar.vardbs.generic_tsv.GenericTSVAnnotationOptions;
import de.charite.compbio.jannovar.vardbs.generic_tsv.GenericTSVAnnotationTarget;
import de.charite.compbio.jannovar.vardbs.generic_tsv.GenericTSVValueColumnDescription;
import de.charite.compbio.jannovar.vardbs.generic_vcf.GenericVCFAnnotationDriver;
import de.charite.compbio.jannovar.vardbs.generic_vcf.GenericVCFAnnotationOptions;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.Interval;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import net.sourceforge.argparse4j.inf.Namespace;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Run annotation steps (read in VCF, write out VCF or Jannovar file format).
 *
 * @author Manuel Holtgrewe
 * @author Max Schubach
 */
public class AnnotateVCFCommand extends JannovarAnnotationCommand {

	/**
	 * Raw command line arguments; recorded in the output VCF header ("jannovarCommand").
	 */
	private String[] argv = null;

	/**
	 * Progress reporting; only constructed when verbosity &gt;= 1 and the input VCF has contig header lines.
	 */
	private ProgressReporter progressReporter = null;

	/**
	 * Configuration, parsed from the argparse4j {@link Namespace}.
	 */
	private JannovarAnnotateVCFOptions options;

	public AnnotateVCFCommand(String[] argv, Namespace args) throws CommandLineParsingException {
		this.argv = argv;
		this.options = new JannovarAnnotateVCFOptions();
		this.options.setFromArgs(args);
	}

	/**
	 * This function inputs a VCF file, and prints the annotated version thereof to a file (name of the original file
	 * with the suffix .de.charite.compbio.jannovar).
	 *
	 * <p>Internally this builds a lazy {@link Stream} pipeline over the input records: each configured
	 * database/filter annotator is appended as a {@code map} step, and the records are finally pushed
	 * into a {@link VariantContextProcessor} sink that performs the writing (and, optionally,
	 * Mendelian-inheritance annotation).</p>
	 *
	 * @throws JannovarException on problems with the annotation
	 */
	@Override
	public void run() throws JannovarException {
		System.err.println("Options");
		System.err.println(options.toString());

		System.err.println("Deserializing transcripts...");
		deserializeTranscriptDefinitionFile(options.getDatabaseFilePath());

		final String vcfPath = options.getPathInputVCF();

		// Whether or not to require availability of an index; querying an interval requires one.
		final boolean useInterval = (options.getInterval() != null
			&& !options.getInterval().equals(""));

		try (VCFFileReader vcfReader = new VCFFileReader(new File(vcfPath), useInterval)) {
			if (this.options.getVerbosity() >= 1) {
				// Progress reporting needs the sequence dictionary (contig lines) from the header.
				final SAMSequenceDictionary seqDict = VCFFileReader
					.getSequenceDictionary(new File(vcfPath));
				if (seqDict != null) {
					final GenomeRegionListFactoryFromSAMSequenceDictionary factory = new GenomeRegionListFactoryFromSAMSequenceDictionary();
					this.progressReporter = new ProgressReporter(factory.construct(seqDict), 60);
					this.progressReporter.printHeader();
					this.progressReporter.start();
				} else {
					System.err.println(
						"Progress reporting does not work because VCF file is missing the contig "
							+ "lines in the header.");
				}
			}

			VCFHeader vcfHeader = vcfReader.getFileHeader();

			System.err.println("Annotating VCF...");
			final long startTime = System.nanoTime();

			// Jump to interval if given, otherwise start at beginning
			CloseableIterator<VariantContext> iter;
			if (useInterval) {
				Interval itv = RegionParser.parse(options.getInterval());
				int end = 1000 * 1000 * 1000; // "some large number"
				// Use the contig's real length from the header when available.
				for (VCFContigHeaderLine line : vcfHeader.getContigLines()) {
					if (line.getID().equals(itv.getContig()))
						end = line.getSAMSequenceRecord().getSequenceLength();
				}
				// A bare contig name (start == end == 0) means "the whole contig".
				if (itv.getStart() == 0 && itv.getEnd() == 0)
					itv = new Interval(itv.getContig(), 1, end);
				iter = vcfReader.query(itv.getContig(), itv.getStart(), itv.getEnd());
				System.err.println("Will read interval " + itv.toString());
			} else {
				System.err.println("Will read full input file");
				iter = vcfReader.iterator();
			}

			// Obtain Java 8 stream from iterator; annotation steps below are appended lazily.
			Stream<VariantContext> stream = iter.stream();

			// If configured, annotate using dbSNP VCF file (extend header to
			// use for writing out)
			if (options.pathVCFDBSNP != null) {
				DBAnnotationOptions dbSNPOptions = DBAnnotationOptions.createDefaults();
				dbSNPOptions.setIdentifierPrefix(options.prefixDBSNP);
				DBVariantContextAnnotator dbSNPAnno = new DBVariantContextAnnotatorFactory()
					.constructDBSNP(options.pathVCFDBSNP, options.pathFASTARef, dbSNPOptions);
				dbSNPAnno.extendHeader(vcfHeader);
				stream = stream.map(dbSNPAnno::annotateVariantContext);
			}

			// If configured, annotate using ExAC VCF file (extend header to use
			// for writing out)
			if (options.pathVCFExac != null) {
				DBAnnotationOptions exacOptions = DBAnnotationOptions.createDefaults();
				exacOptions.setIdentifierPrefix(options.prefixExac);
				DBVariantContextAnnotator exacAnno = new DBVariantContextAnnotatorFactory()
					.constructExac(options.pathVCFExac, options.pathFASTARef, exacOptions);
				exacAnno.extendHeader(vcfHeader);
				stream = stream.map(exacAnno::annotateVariantContext);
			}

			// If configured, annotate using gnomAD exomes VCF file (extend
			// header to use for writing out)
			if (options.pathVCFGnomadExomes != null) {
				DBAnnotationOptions gnomadOptions = DBAnnotationOptions.createDefaults();
				gnomadOptions.setIdentifierPrefix(options.prefixGnomadExomes);
				DBVariantContextAnnotator gnomadExomesAnno = new DBVariantContextAnnotatorFactory()
					.constructGnomad(options.pathVCFGnomadExomes, options.pathFASTARef,
						gnomadOptions);
				gnomadExomesAnno.extendHeader(vcfHeader);
				stream = stream.map(gnomadExomesAnno::annotateVariantContext);
			}

			// If configured, annotate using gnomAD genomes VCF file (extend
			// header to use for writing out)
			if (options.pathVCFGnomadGenomes != null) {
				DBAnnotationOptions gnomadOptions = DBAnnotationOptions.createDefaults();
				gnomadOptions.setIdentifierPrefix(options.prefixGnomadGenomes);
				DBVariantContextAnnotator gnomadGenomesAnno = new DBVariantContextAnnotatorFactory()
					.constructGnomad(options.pathVCFGnomadGenomes, options.pathFASTARef,
						gnomadOptions);
				gnomadGenomesAnno.extendHeader(vcfHeader);
				stream = stream.map(gnomadGenomesAnno::annotateVariantContext);
			}

			// If configured, annotate using thousand genomes VCF file (extend
			// header to use for writing out)
			if (options.pathThousandGenomes != null) {
				DBAnnotationOptions thousandGenomesOptions = DBAnnotationOptions.createDefaults();
				thousandGenomesOptions.setIdentifierPrefix(options.prefixThousandGenomes);
				DBVariantContextAnnotator thousandGenomesAnno = new DBVariantContextAnnotatorFactory()
					.constructThousandGenomes(options.pathThousandGenomes, options.pathFASTARef,
						thousandGenomesOptions);
				thousandGenomesAnno.extendHeader(vcfHeader);
				stream = stream.map(thousandGenomesAnno::annotateVariantContext);
			}

			// If configured, annotate using UK10K VCF file (extend header to
			// use for writing out)
			if (options.pathVCFUK10K != null) {
				DBAnnotationOptions uk10kOptions = DBAnnotationOptions.createDefaults();
				uk10kOptions.setIdentifierPrefix(options.prefixUK10K);
				DBVariantContextAnnotator uk10kAnno = new DBVariantContextAnnotatorFactory()
					.constructUK10K(options.pathVCFUK10K, options.pathFASTARef, uk10kOptions);
				uk10kAnno.extendHeader(vcfHeader);
				stream = stream.map(uk10kAnno::annotateVariantContext);
			}

			// If configured, annotate using ClinVar VCF file (extend header to
			// use for writing out)
			if (options.pathClinVar != null) {
				DBAnnotationOptions clinVarOptions = DBAnnotationOptions.createDefaults();
				clinVarOptions.setIdentifierPrefix(options.prefixClinVar);
				DBVariantContextAnnotator clinvarAnno = new DBVariantContextAnnotatorFactory()
					.constructClinVar(options.pathClinVar, options.pathFASTARef,
						clinVarOptions);
				clinvarAnno.extendHeader(vcfHeader);
				stream = stream.map(clinvarAnno::annotateVariantContext);
			}

			// If configured, annotate using COSMIC VCF file (extend header to
			// use for writing out)
			if (options.pathCosmic != null) {
				DBAnnotationOptions cosmicOptions = DBAnnotationOptions.createDefaults();
				cosmicOptions.setIdentifierPrefix(options.prefixCosmic);
				DBVariantContextAnnotator cosmicAnno = new DBVariantContextAnnotatorFactory()
					.constructCosmic(options.pathCosmic, options.pathFASTARef, cosmicOptions);
				cosmicAnno.extendHeader(vcfHeader);
				stream = stream.map(cosmicAnno::annotateVariantContext);
			}

			// Add step for annotating with variant effect (the core Jannovar annotation)
			VariantEffectHeaderExtender extender = new VariantEffectHeaderExtender();
			extender.addHeaders(vcfHeader);
			VariantContextAnnotator variantEffectAnnotator = new VariantContextAnnotator(refDict,
				chromosomeMap,
				new VariantContextAnnotator.Options(!options.isShowAll(),
					(options.isUseThreeLetterAminoAcidCode() ? AminoAcidCode.THREE_LETTER
						: AminoAcidCode.ONE_LETTER),
					options.isEscapeAnnField(), options.isNt3PrimeShifting(),
					options.isOffTargetFilterEnabled(),
					options.isOffTargetFilterUtrIsOffTarget(),
					options.isOffTargetFilterIntronicSpliceIsOffTarget()));
			stream = stream.map(variantEffectAnnotator::annotateVariantContext);

			// If configured, use threshold-based annotation (extend header to
			// use for writing out)
			ArrayList<String> affecteds = new ArrayList<>();
			if (options.useThresholdFilters) {
				// Build options object for threshold filter
				ThresholdFilterOptions thresholdFilterOptions = new ThresholdFilterOptions(
					options.getThreshFiltMinGtCovHet(), options.getThreshFiltMinGtCovHomAlt(),
					options.getThreshFiltMaxCov(), options.getThreshFiltMinGtGq(),
					options.getThreshFiltMinGtAafHet(), options.getThreshFiltMaxGtAafHet(),
					options.getThreshFiltMinGtAafHomAlt(),
					options.getThreshFiltMaxGtAafHomRef(), options.getPrefixExac(),
					options.getPrefixDBSNP(), options.getPrefixGnomadGenomes(),
					options.getPrefixGnomadExomes(),
					options.getPrefixThousandGenomes(),
					options.getThreshFiltMaxAlleleFrequencyAd(),
					options.getThreshFiltMaxAlleleFrequencyAr(),
					options.getThreshFiltMaxExacHomAlt(),
					options.getThreshFiltMaxThousandGenomesHomAlt());
				// Add headers
				new ThresholdFilterHeaderExtender(thresholdFilterOptions).addHeaders(vcfHeader);
				// Build list of affecteds; take from pedigree file if given.
				// Otherwise, assume one single individual is always affected and otherwise warn
				// about missing pedigree.
				if (options.pathPedFile == null) {
					if (vcfHeader.getNGenotypeSamples() == 1) {
						System.err.println(
							"INFO: No pedigree file given and single individual. Assuming it is affected for the threshold filter");
					} else {
						System.err.println(
							"WARNING: no pedigree file given. Threshold filter will not annotate FILTER field, only genotype FT");
					}
				} else {
					Pedigree pedigree;
					try {
						pedigree = loadPedigree(vcfHeader);
					} catch (IOException e) {
						System.err.println("Problem loading pedigree from " + options.pathPedFile);
						System.err.println(e.getMessage());
						System.err.println("\n");
						e.printStackTrace(System.err);
						return;
					}
					for (Person person : pedigree.getMembers()) {
						if (person.isAffected())
							affecteds.add(person.getName());
					}
					if (affecteds.isEmpty()) {
						System.err.println(
							"WARNING: no affected individual in pedigree. Threshold filter will not modify FILTER field, "
								+ "only genotype FT");
					}
				}
				// Genotype-wise threshold filter (annotates the per-sample FT field)
				GenotypeThresholdFilterAnnotator gtThresholdFilterAnno = new GenotypeThresholdFilterAnnotator(
					thresholdFilterOptions);
				stream = stream.map(gtThresholdFilterAnno::annotateVariantContext);

				// When configured to use advanced pedigree filters (must come
				// after threshold-based filtration)
				if (options.useAdvancedPedigreeFilters) {
					// Build options object from configuration and extend headers
					PedigreeFilterOptions pedFilterOptions = new PedigreeFilterOptions(
						options.getThreshDeNovoParentAd2(), options.isUseParentGtIsFiltered(),
						options.isOneParentGtFilteredFiltersAffected());
					new PedigreeFilterHeaderExtender(pedFilterOptions).addHeaders(vcfHeader);

					// Load pedigree
					Pedigree pedigree;
					try {
						pedigree = loadPedigree(vcfHeader);
					} catch (IOException e) {
						System.err.println("Problem loading pedigree from " + options.pathPedFile);
						System.err.println(e.getMessage());
						System.err.println("\n");
						e.printStackTrace(System.err);
						return;
					}

					// Construct annotator and register with pipeline
					PedigreeFilterAnnotator pedFilterAnnotator = new PedigreeFilterAnnotator(
						pedFilterOptions, pedigree);
					stream = stream.map(pedFilterAnnotator::annotateVariantContext);
				}

				// Variant-wise threshold filter; uses the affecteds list to decide whether
				// the site-level FILTER column may be modified.  (Note: the original code
				// re-checked options.useThresholdFilters here, which is always true inside
				// this branch.)
				VariantThresholdFilterAnnotator varThresholdFilterAnno = new VariantThresholdFilterAnnotator(
					thresholdFilterOptions, affecteds);
				stream = stream.map(varThresholdFilterAnno::annotateVariantContext);
			}

			// Annotate from BED files
			List<BedFileAnnotator> bedFileAnnotators = new ArrayList<>();
			for (BedAnnotationOptions bedAnnotationOptions : options.getBedAnnotationOptions()) {
				BedFileAnnotator annotator = new BedFileAnnotator(bedAnnotationOptions);
				bedFileAnnotators.add(annotator);
				annotator.extendHeader(vcfHeader);
				stream = stream.map(annotator::annotateVariantContext);
			}

			// Annotate using dbNSFP
			if (options.getPathDbNsfp() != null) {
				// Look up the field description for each requested column name.
				Map<String, GenericTSVValueColumnDescription> descriptions = new HashMap<>();
				for (String colName : options.getColumnsDbNsfp()) {
					descriptions.put(colName, DbNsfpFields.DBNSFP_FIELDS.get(colName));
				}
				// dbNSFP is position-based, so the begin and end columns are the same.
				GenericTSVAnnotationOptions dbNsfpAnnotationOptions = new GenericTSVAnnotationOptions(
					true, false, options.getPrefixDbNsfp(), MultipleMatchBehaviour.BEST_ONLY,
					new File(options.getPathDbNsfp()), GenericTSVAnnotationTarget.VARIANT, true,
					options.getDbNsfpColContig(), options.getDbNsfpColPosition(),
					options.getDbNsfpColPosition(), 3, 4, false, options.getColumnsDbNsfp(),
					descriptions);
				GenericTSVAnnotationDriver dbNsfpAnnotator = new GenericTSVAnnotationDriver(
					options.getPathFASTARef(), dbNsfpAnnotationOptions);
				dbNsfpAnnotator.constructVCFHeaderExtender().addHeaders(vcfHeader);
				stream = stream.map(dbNsfpAnnotator::annotateVariantContext);
			}

			// Annotate from generic TSV files
			List<GenericTSVAnnotationDriver> tsvAnnotators = new ArrayList<>();
			for (GenericTSVAnnotationOptions tsvAnnotationOptions : options
				.getTsvAnnotationOptions()) {
				GenericTSVAnnotationDriver annotator = new GenericTSVAnnotationDriver(
					options.getPathFASTARef(), tsvAnnotationOptions);
				tsvAnnotators.add(annotator);
				annotator.constructVCFHeaderExtender().addHeaders(vcfHeader);
				stream = stream.map(annotator::annotateVariantContext);
			}

			// Annotate from generic VCF files
			List<GenericVCFAnnotationDriver> vcfAnnotators = new ArrayList<>();
			for (GenericVCFAnnotationOptions vcfAnnotationOptions : options
				.getVcfAnnotationOptions()) {
				GenericVCFAnnotationDriver annotator = new GenericVCFAnnotationDriver(
					vcfAnnotationOptions.getPathVcfFile(), options.getPathFASTARef(),
					vcfAnnotationOptions);
				vcfAnnotators.add(annotator);
				annotator.constructVCFHeaderExtender().addHeaders(vcfHeader);
				stream = stream.map(annotator::annotateVariantContext);
			}

			// Extend header with INHERITANCE filter
			if (options.pathPedFile != null || options.annotateAsSingletonPedigree) {
				System.err.println("Extending header with INHERITANCE...");
				new MendelVCFHeaderExtender().extendHeader(vcfHeader, "");
			}

			// Create VCF output writer; record version and command line for provenance.
			ImmutableList<VCFHeaderLine> jvHeaderLines = ImmutableList.of(
				new VCFHeaderLine("jannovarVersion", Jannovar.getVersion()),
				new VCFHeaderLine("jannovarCommand", Joiner.on(' ').join(argv)));

			// Construct VariantContextWriter and start annotationg pipeline
			try (VariantContextWriter vcfWriter = VariantContextWriterConstructionHelper
				.openVariantContextWriter(vcfHeader, options.getPathOutputVCF(), jvHeaderLines);
				 VariantContextProcessor sink = buildMendelianProcessors(vcfWriter, vcfHeader)) {
				// Make current VC available to progress printer
				if (this.progressReporter != null)
					stream = stream.peek(vc -> this.progressReporter.setCurrentVC(vc));

				// Terminal operation: this is what actually pulls records through the pipeline.
				stream.forEachOrdered(sink::put);
			} catch (IOException e) {
				throw new JannovarException("Problem opening file", e);
			}

			System.err.println("Wrote annotations to \"" + options.getPathOutputVCF() + "\"");
			final long endTime = System.nanoTime();
			System.err.println(String.format("Annotation and writing took %.2f sec.",
				(endTime - startTime) / 1000.0 / 1000.0 / 1000.0));
		} catch (IncompatiblePedigreeException e) {
			if (options.pathPedFile != null)
				System.err.println("VCF file " + vcfPath + " is not compatible to pedigree file "
					+ options.pathPedFile);
			else
				System.err.println("VCF file " + vcfPath
					+ " is not compatible with singleton pedigree annotation (do you have exactly one sample in VCF file?)");
			System.err.println(e.getMessage());
			System.err.println("\n");
			e.printStackTrace(System.err);
		} catch (VariantContextFilterException e) {
			System.err.println("There was a problem annotating the VCF file");
			System.err.println("The error message was as follows.  The stack trace below the error "
				+ "message can help the developers debug the problem.\n");
			System.err.println(e.getMessage());
			System.err.println("\n");
			e.printStackTrace(System.err);
			return;
		}

		if (progressReporter != null)
			progressReporter.done();
	}

	/**
	 * Load pedigree from file given in configuration or construct singleton pedigree
	 *
	 * <p>When no pedigree file is configured, the single VCF sample is turned into a one-person
	 * pedigree marked as affected with unknown sex.</p>
	 *
	 * @param vcfHeader {@link VCFHeader}, for checking compatibility and getting sample name in case of singleton
	 *                  pedigree construction
	 * @throws PedParseException             in the case of problems with parsing pedigrees
	 * @throws IOException                   on problems reading the pedigree file
	 * @throws IncompatiblePedigreeException if singleton construction is requested but the VCF does not
	 *                                       have exactly one sample
	 */
	private Pedigree loadPedigree(VCFHeader vcfHeader)
		throws PedParseException, IOException, IncompatiblePedigreeException {
		if (options.pathPedFile != null) {
			final PedFileReader pedReader = new PedFileReader(new File(options.pathPedFile));
			final PedFileContents pedContents = pedReader.read();
			// Use the pedigree (family) name of the first individual in the file.
			return new Pedigree(pedContents, pedContents.getIndividuals().get(0).getPedigree());
		} else {
			if (vcfHeader.getSampleNamesInOrder().size() != 1)
				throw new IncompatiblePedigreeException(
					"VCF file does not have exactly one sample but required for singleton pedigree construction");
			final String sampleName = vcfHeader.getSampleNamesInOrder().get(0);
			// Singleton: sample is its own family, founder ("0" parents), affected.
			final PedPerson pedPerson = new PedPerson(sampleName, sampleName, "0", "0", Sex.UNKNOWN,
				Disease.AFFECTED);
			final PedFileContents pedContents = new PedFileContents(ImmutableList.of(),
				ImmutableList.of(pedPerson));
			return new Pedigree(pedContents, pedContents.getIndividuals().get(0).getPedigree());
		}
	}

	/**
	 * Construct the mendelian inheritance annotation processors
	 *
	 * <p>With a pedigree (explicit or singleton) a gene-wise Mendelian annotation processor wrapped
	 * in a coordinate-sorting checker is returned; otherwise a plain pass-through writer sink.</p>
	 *
	 * @param writer    the place to put put the VariantContext to after filtration
	 * @param vcfHeader {@link VCFHeader}, for checking compatibility and getting sample name in case of singleton
	 *                  pedigree construction
	 * @throws IOException                   in case of problems with opening the pedigree file
	 * @throws PedParseException             in the case of problems with parsing pedigrees
	 * @throws IncompatiblePedigreeException If the pedigree is incompatible with the VCF file
	 */
	private VariantContextProcessor buildMendelianProcessors(VariantContextWriter writer,
															 VCFHeader vcfHeader)
		throws PedParseException, IOException, IncompatiblePedigreeException {
		if (options.pathPedFile != null || options.annotateAsSingletonPedigree) {
			final Pedigree pedigree = loadPedigree(vcfHeader);
			checkPedigreeCompatibility(pedigree, vcfHeader);
			final GeneWiseMendelianAnnotationProcessor mendelProcessor = new GeneWiseMendelianAnnotationProcessor(
				pedigree, jannovarData, writer::add,
				options.isInheritanceAnnoUseFilters());
			return new CoordinateSortingChecker(mendelProcessor);
		} else {
			return new ConsumerProcessor(writer::add);
		}
	}

	/**
	 * Check pedigree for compatibility
	 *
	 * <p>Compatible means that every sample in the VCF genotype columns occurs in the pedigree;
	 * the pedigree is allowed to contain additional individuals.</p>
	 *
	 * @param pedigree  {@link Pedigree} to check for compatibility
	 * @param vcfHeader {@link VCFHeader} to check for compatibility
	 * @throws IncompatiblePedigreeException if the VCF file is not compatible with the pedigree
	 */
	private void checkPedigreeCompatibility(Pedigree pedigree, VCFHeader vcfHeader)
		throws IncompatiblePedigreeException {
		List<String> missing = vcfHeader.getGenotypeSamples().stream()
			.filter(x -> !pedigree.getNames().contains(x)).collect(Collectors.toList());
		if (!missing.isEmpty())
			throw new IncompatiblePedigreeException(
				"The VCF file has the following sample names not present in Pedigree: "
					+ Joiner.on(", ").join(missing));
	}

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy