picard.vcf.LiftoverVcf

A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.
package picard.vcf;

/*
 * The MIT License
 *
 * Copyright (c) 2017 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.*;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.argumentcollections.ReferenceArgumentCollection;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import picard.util.LiftoverUtils;

import java.io.File;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.stream.Collectors;

/**
 * <h3>Summary</h3>
 * Tool for "lifting over" a VCF from one genome build to another, producing a properly headered,
 * sorted and indexed VCF in one go.
 *
 * <h3>Details</h3>
 * This tool adjusts the coordinates of variants within a VCF file to match a new reference. The
 * output file will be sorted and indexed using the target reference build. To be clear, {@link #REFERENCE_SEQUENCE} should be the
 * target reference build (that is, the "new" one). The tool is based on the UCSC LiftOver tool
 * and uses a UCSC chain file to guide its operation.
 * <p>
 * For each variant, the tool will look for the target coordinate, reverse-complement and left-align the variant if needed,
 * and, in the case that the reference and alternate alleles of a SNP have been swapped in the new genome build, it will
 * adjust the SNP, and correct AF-like INFO fields and the relevant genotypes.
 *
 * <h3>Example</h3>
 * <pre>
 * java -jar picard.jar LiftoverVcf \\
 *     I=input.vcf \\
 *     O=lifted_over.vcf \\
 *     CHAIN=b37tohg38.chain \\
 *     REJECT=rejected_variants.vcf \\
 *     R=reference_sequence.fasta
 * </pre>
 *
 * <h3>Caveats</h3>
 * <h4>Rejected Records</h4>
 * Records may be rejected because they cannot be lifted over or because of sequence incompatibilities between the
 * source and target reference genomes. Rejected records will be emitted to the {@link #REJECT} file using the source
 * genome build coordinates. The reason for the rejection will be stated in the FILTER field, and more detail may be placed
 * in the INFO field.
 *
 * <h4>Memory Use</h4>
 * LiftOverVcf sorts the output using a {@link htsjdk.samtools.util.SortingCollection} which relies on {@link #MAX_RECORDS_IN_RAM}
 * to specify how many (vcf) records to hold in memory before "spilling" to disk. The default value is reasonable when sorting SAM files,
 * but not for VCFs as there is no good default due to the dependence on the number of samples and amount of information in the INFO and FORMAT
 * fields. Consider lowering to 100,000 or even less if you have many genotypes.
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
        summary = LiftoverVcf.USAGE_SUMMARY + LiftoverVcf.USAGE_DETAILS,
        oneLineSummary = LiftoverVcf.USAGE_SUMMARY,
        programGroup = VariantManipulationProgramGroup.class)
@DocumentedFeature
public class LiftoverVcf extends CommandLineProgram {
    static final String USAGE_SUMMARY = "Lifts over a VCF file from one reference build to another. ";
    static final String USAGE_DETAILS =
            "<h3>Summary</h3>\n" +
            "Tool for \"lifting over\" a VCF from one genome build to another, producing a properly headered, " +
            "sorted and indexed VCF in one go.\n" +
            "\n" +
            "<h3>Details</h3>\n" +
            "This tool adjusts the coordinates of variants within a VCF file to match a new reference. The " +
            "output file will be sorted and indexed using the target reference build. To be clear, REFERENCE_SEQUENCE should be the " +
            "target reference build (that is, the \"new\" one). The tool is based on the UCSC LiftOver tool (see http://genome.ucsc.edu/cgi-bin/hgLiftOver) " +
            "and uses a UCSC chain file to guide its operation.\n" +
            "\n" +
            "For each variant, the tool will look for the target coordinate, reverse-complement and left-align the variant if needed, " +
            "and, in the case that the reference and alternate alleles of a SNP have been swapped in the new genome build, it will " +
            "adjust the SNP, and correct AF-like INFO fields and the relevant genotypes." +
            "\n" +
            "\n" +
            "<h3>Example</h3>\n" +
            "java -jar picard.jar LiftoverVcf \\\n" +
            "     I=input.vcf \\\n" +
            "     O=lifted_over.vcf \\\n" +
            "     CHAIN=b37tohg38.chain \\\n" +
            "     REJECT=rejected_variants.vcf \\\n" +
            "     R=reference_sequence.fasta\n" +
            "\n" +
            "<h3>Caveats</h3>\n" +
            "<h4>Rejected Records</h4>\n" +
            "Records may be rejected because they cannot be lifted over or because of sequence incompatibilities between the " +
            "source and target reference genomes. Rejected records will be emitted to the REJECT file using the source " +
            "genome build coordinates. The reason for the rejection will be stated in the FILTER field, and more detail may be placed " +
            "in the INFO field.\n" +
            "<h4>Memory Use</h4>\n" +
            "LiftOverVcf sorts the output using a \"SortingCollection\" which relies on MAX_RECORDS_IN_RAM " +
            "to specify how many (vcf) records to hold in memory before \"spilling\" to disk. The default value is reasonable when sorting SAM files, " +
            "but not for VCFs as there is no good default due to the dependence on the number of samples and amount of information in the INFO and FORMAT " +
            "fields. Consider lowering to 100,000 or even less if you have many genotypes.\n";

    @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input VCF/BCF file to be lifted over.")
    public File INPUT;

    @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output location for the lifted over VCF/BCF.")
    public File OUTPUT;

    @Argument(shortName = "C", doc = "The liftover chain file. See https://genome.ucsc.edu/goldenPath/help/chain.html for a description" +
            " of chain files. See http://hgdownload.soe.ucsc.edu/downloads.html#terms for where to download chain files.")
    public File CHAIN;

    @Argument(doc = "File to which to write rejected records.")
    public File REJECT;

    // Option on whether or not to provide a warning, or error message and exit if a missing contig is encountered
    @Argument(shortName = "WMC", doc = "Warn on missing contig.", optional = true)
    public boolean WARN_ON_MISSING_CONTIG = false;

    @Argument(shortName = "LFI", doc = "If true, intervals failing due to match below LIFTOVER_MIN_MATCH will be logged as a warning to the console.", optional = true)
    public boolean LOG_FAILED_INTERVALS = true;

    // Option on whether or not to write the original contig/position of the variant to the INFO field
    @Argument(doc = "Write the original contig/position for lifted variants to the INFO field.", optional = true)
    public boolean WRITE_ORIGINAL_POSITION = false;

    // Option on whether or not to write the original alleles of the variant to the INFO field
    @Argument(doc = "Write the original alleles for lifted variants to the INFO field. If the alleles are identical, this attribute will be omitted.", optional = true)
    public boolean WRITE_ORIGINAL_ALLELES = false;

    @Argument(doc = "The minimum percent match required for a variant to be lifted.", optional = true)
    public double LIFTOVER_MIN_MATCH = 1.0;

    @Argument(doc = "Allow INFO and FORMAT in the records that are not found in the header", optional = true)
    public boolean ALLOW_MISSING_FIELDS_IN_HEADER = false;

    @Argument(doc = "If the REF allele of the lifted site does not match the target genome, that variant is normally rejected. " +
            "For bi-allelic SNPs, if this is set to true and the ALT allele equals the new REF allele, the REF and ALT alleles will be swapped. This can rescue " +
            "some variants; however, do this carefully as some annotations may become invalid, such as any that are allele-specific. " +
            "See also TAGS_TO_REVERSE and TAGS_TO_DROP.", optional = true)
    public boolean RECOVER_SWAPPED_REF_ALT = false;

    @Argument(doc = "INFO field annotations that behave like an Allele Frequency and should be transformed with x->1-x " +
            "when swapping reference with variant alleles.", optional = true)
    public Collection<String> TAGS_TO_REVERSE = new ArrayList<>(LiftoverUtils.DEFAULT_TAGS_TO_REVERSE);

    @Argument(doc = "INFO field annotations that should be deleted when swapping reference with variant alleles.", optional = true)
    public Collection<String> TAGS_TO_DROP = new ArrayList<>(LiftoverUtils.DEFAULT_TAGS_TO_DROP);

    @Argument(doc = "Output VCF file will be written on the fly but it won't be sorted and indexed.", optional = true)
    public boolean DISABLE_SORT = false;

    // When a contig used in the chain is not in the reference, exit with this value instead of 0.
    public static int EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE = 1;

    /**
     * Filter name to use when a target cannot be lifted over.
     */
    public static final String FILTER_CANNOT_LIFTOVER_REV_COMP = "CannotLiftOver";

    /**
     * Filter name to use when a target cannot be lifted over.
     */
    public static final String FILTER_NO_TARGET = "NoTarget";

    /**
     * Filter name to use when a target is lifted over, but the reference allele doesn't match the new reference.
     */
    public static final String FILTER_MISMATCHING_REF_ALLELE = "MismatchedRefAllele";

    /**
     * Filter name to use when an indel cannot be lifted over since it straddles two intervals in a chain, which means
     * it is unclear which alleles should be used.
     */
    public static final String FILTER_INDEL_STRADDLES_TWO_INTERVALS = "IndelStraddlesMultipleIntevals";

    /**
     * Filters to be added to the REJECT file.
     */
    private static final List<VCFFilterHeaderLine> FILTERS = CollectionUtil.makeList(
            new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER_REV_COMP, "Liftover of a variant that needed reverse-complementing failed for unknown reasons."),
            new VCFFilterHeaderLine(FILTER_NO_TARGET, "Variant could not be lifted between genome builds."),
            new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover."),
            new VCFFilterHeaderLine(FILTER_INDEL_STRADDLES_TWO_INTERVALS, "Reference allele in Indel is straddling multiple intervals in the chain, and so the results are not well defined.")
    );

    /**
     * Attribute used to store the name of the source contig/chromosome prior to liftover.
     */
    public static final String ORIGINAL_CONTIG = "OriginalContig";

    /**
     * Attribute used to store the position of the variant on the source contig prior to liftover.
     */
    public static final String ORIGINAL_START = "OriginalStart";

    /**
     * Attribute used to store the list of original alleles (including REF), in the original order prior to liftover.
     */
    public static final String ORIGINAL_ALLELES = "OriginalAlleles";

    /**
     * Attribute used to store the position of the failed variant on the target contig prior to finding out that alleles do not match.
     */
    public static final String ATTEMPTED_LOCUS = "AttemptedLocus";

    /**
     * Attribute used to store the alleles of the failed variant on the target contig prior to finding out that alleles do not match.
     */
    public static final String ATTEMPTED_ALLELES = "AttemptedAlleles";
    /**
     * Metadata to be added to the Passing file.
     */
    private static final List<VCFInfoHeaderLine> ATTRS = CollectionUtil.makeList(
            new VCFInfoHeaderLine(ORIGINAL_CONTIG, 1, VCFHeaderLineType.String, "The name of the source contig/chromosome prior to liftover."),
            new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover."),
            new VCFInfoHeaderLine(ORIGINAL_ALLELES, VCFHeaderLineCount.R, VCFHeaderLineType.String, "A list of the original alleles (including REF) of the variant prior to liftover. If the alleles were not changed during liftover, this attribute will be omitted.")
    );

    private VariantContextWriter rejectedRecords;
    /** the output VariantContextWriter */
    private VariantContextWriter acceptedRecords;
    private final Log log = Log.getInstance(LiftoverVcf.class);
    /** the Variant sorter, may be null if DISABLE_SORT = true */
    private SortingCollection<VariantContext> sorter;
    private long failedLiftover = 0, failedAlleleCheck = 0, totalTrackedAsSwapRefAlt = 0;
    private final Map<String, Long> rejectsByContig = new TreeMap<>();
    private final Map<String, Long> liftedByDestContig = new TreeMap<>();
    private final Map<String, Long> liftedBySourceContig = new TreeMap<>();

    @Override
    protected ReferenceArgumentCollection makeReferenceArgumentCollection() {
        return new ReferenceArgumentCollection() {
            @Argument(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, common = false,
                    doc = "The reference sequence (fasta) for the TARGET genome build (i.e., the new one). The fasta file must have an " +
                            "accompanying sequence dictionary (.dict file).")
            public File REFERENCE_SEQUENCE = Defaults.REFERENCE_FASTA;

            @Override
            public File getReferenceFile() {
                return REFERENCE_SEQUENCE;
            }
        };
    }

    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
        IOUtil.assertFileIsReadable(CHAIN);
        IOUtil.assertFileIsWritable(OUTPUT);
        IOUtil.assertFileIsWritable(REJECT);

        if (CREATE_INDEX && DISABLE_SORT) {
            log.error("CREATE_INDEX=true and DISABLE_SORT=true are mutually exclusive.");
            return 1;
        }

        ////////////////////////////////////////////////////////////////////////
        // Setup the inputs
        ////////////////////////////////////////////////////////////////////////
        final LiftOver liftOver = new LiftOver(CHAIN);
        liftOver.setShouldLogFailedIntervalsBelowThreshold(LOG_FAILED_INTERVALS);

        final VCFFileReader in = new VCFFileReader(INPUT, false);

        log.info("Loading up the target reference genome.");
        final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
        final Map<String, ReferenceSequence> refSeqs = new HashMap<>();

        // check if sequence dictionary exists
        if (walker.getSequenceDictionary() == null) {
            log.error("Reference " + REFERENCE_SEQUENCE.getAbsolutePath() + " must have an associated Dictionary .dict file in the same directory.");
            return 1;
        }

        for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
            refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()));
        }
        CloserUtil.close(walker);

        ////////////////////////////////////////////////////////////////////////
        // Setup the outputs
        ////////////////////////////////////////////////////////////////////////
        final VCFHeader inHeader = in.getFileHeader();
        // build VCF header, remove the old '##reference=file://...'
        final VCFHeader outHeader = new VCFHeader(
                inHeader.getMetaDataInInputOrder()
                        .stream()
                        .filter(M -> !M.getKey().equals("reference"))
                        .collect(Collectors.toCollection(LinkedHashSet::new)),
                inHeader.getSampleNamesInOrder()
        );
        outHeader.setSequenceDictionary(walker.getSequenceDictionary());
        if (WRITE_ORIGINAL_POSITION) {
            for (final VCFInfoHeaderLine line : ATTRS) outHeader.addMetaDataLine(line);
        }
        outHeader.addMetaDataLine(new VCFInfoHeaderLine(LiftoverUtils.SWAPPED_ALLELES, 0, VCFHeaderLineType.Flag,
                "The REF and the ALT alleles have been swapped in liftover due to changes in the reference. " +
                        "It is possible that not all INFO annotations reflect this swap, and in the genotypes, " +
                        "only the GT, PL, and AD fields have been modified. You should check the TAGS_TO_REVERSE parameter that was used " +
                        "during the LiftOver to be sure."));
        outHeader.addMetaDataLine(new VCFInfoHeaderLine(LiftoverUtils.REV_COMPED_ALLELES, 0, VCFHeaderLineType.Flag,
                "The REF and the ALT alleles have been reverse complemented in liftover since the mapping from the " +
                        "previous reference to the current one was on the negative strand."));
        outHeader.addMetaDataLine(new VCFHeaderLine("reference", REFERENCE_SEQUENCE.toURI().toString()));

        this.acceptedRecords = new VariantContextWriterBuilder()
                .modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER)
                .modifyOption(Options.INDEX_ON_THE_FLY, !DISABLE_SORT)
                .setOutputFile(OUTPUT)
                .setReferenceDictionary(walker.getSequenceDictionary())
                .build();
        this.acceptedRecords.writeHeader(outHeader);

        rejectedRecords = new VariantContextWriterBuilder().setOutputFile(REJECT)
                .unsetOption(Options.INDEX_ON_THE_FLY)
                .modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER)
                .build();
        final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
        for (final VCFFilterHeaderLine line : FILTERS) {
            rejectHeader.addMetaDataLine(line);
        }
        rejectHeader.addMetaDataLine(new VCFInfoHeaderLine(ATTEMPTED_LOCUS, 1, VCFHeaderLineType.String,
                "The locus of the variant in the TARGET prior to failing due to reference allele mismatching to the target reference."));
        rejectHeader.addMetaDataLine(new VCFInfoHeaderLine(ATTEMPTED_ALLELES, 1, VCFHeaderLineType.String,
                "The alleles of the variant in the TARGET prior to failing due to reference allele mismatching to the target reference."));
        rejectedRecords.writeHeader(rejectHeader);

        ////////////////////////////////////////////////////////////////////////
        // Read the input VCF, lift the records over and write to the sorting
        // collection.
        ////////////////////////////////////////////////////////////////////////
        long total = 0;

        if (DISABLE_SORT) {
            log.info("Lifting variants over and writing the output file. Variants will not be sorted.");
            sorter = null;
        } else {
            log.info("Lifting variants over and sorting (not yet writing the output file.)");
            sorter = SortingCollection.newInstance(VariantContext.class,
                    new VCFRecordCodec(outHeader, ALLOW_MISSING_FIELDS_IN_HEADER || VALIDATION_STRINGENCY != ValidationStringency.STRICT),
                    outHeader.getVCFRecordComparator(),
                    MAX_RECORDS_IN_RAM,
                    TMP_DIR);
        }
        ProgressLogger progress = new ProgressLogger(log, 1000000, "read");

        for (final VariantContext ctx : in) {
            ++total;

            final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
            final Interval target = liftOver.liftOver(source, LIFTOVER_MIN_MATCH);

            // target is null when there is no good liftover for the context.
            // This happens either when it falls in a gap where there isn't a chain, or if a large enough
            // proportion of it is diminished by the "deletion" at the end of each interval in a chain.
            if (target == null) {
                rejectVariant(ctx, FILTER_NO_TARGET);
                continue;
            }

            // the target is the lifted-over interval comprised of the start/stop of the variant context,
            // if the sizes of target and ctx do not match, it means that the interval grew or shrank during
            // liftover which must be due to straddling multiple intervals in the liftover chain.
            // This would invalidate the indel as it isn't clear what the resulting alleles should be.
            if (ctx.getReference().length() != target.length()) {
                rejectVariant(ctx, FILTER_INDEL_STRADDLES_TWO_INTERVALS);
                continue;
            }

            final ReferenceSequence refSeq;
            if (!refSeqs.containsKey(target.getContig())) {
                rejectVariant(ctx, FILTER_NO_TARGET);

                final String missingContigMessage = "Encountered a contig, " + target.getContig() + " that is not part of the target reference.";
                if (WARN_ON_MISSING_CONTIG) {
                    log.warn(missingContigMessage);
                } else {
                    log.error(missingContigMessage);
                    return EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE;
                }
            } else {
                refSeq = refSeqs.get(target.getContig());
                final VariantContext liftedVC = LiftoverUtils.liftVariant(ctx, target, refSeq, WRITE_ORIGINAL_POSITION, WRITE_ORIGINAL_ALLELES);
                // the liftedVC can be null if the liftover fails because of a problem with reverse complementing
                if (liftedVC == null) {
                    rejectVariant(ctx, FILTER_CANNOT_LIFTOVER_REV_COMP);
                } else {
                    tryToAddVariant(liftedVC, refSeq, ctx);
                }
            }
            progress.record(ctx.getContig(), ctx.getStart());
        }

        final NumberFormat pfmt = new DecimalFormat("0.0000%");
        final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
        log.info("Processed ", total, " variants.");
        log.info(failedLiftover, " variants failed to liftover.");
        log.info(failedAlleleCheck, " variants lifted over but had mismatching reference alleles after lift over.");
        log.info(pct, " of variants were not successfully lifted over and written to the output.");

        final Set<String> contigUnion = new TreeSet<>();
        contigUnion.addAll(liftedBySourceContig.keySet());
        contigUnion.addAll(rejectsByContig.keySet());

        log.info("liftover success by source contig:");
        for (final String contig : contigUnion) {
            final long success = liftedBySourceContig.getOrDefault(contig, 0L);
            final long fail = rejectsByContig.getOrDefault(contig, 0L);
            final String liftPct = pfmt.format((double) success / (double) (success + fail));
            log.info(contig, ": ", success, " / ", (success + fail), " (", liftPct, ")");
        }

        log.info("lifted variants by target contig:");
        for (final String contig : liftedByDestContig.keySet()) {
            log.info(contig, ": ", liftedByDestContig.get(contig));
        }
        if (liftedByDestContig.isEmpty()) {
            log.info("no successfully lifted variants");
        }

        if (RECOVER_SWAPPED_REF_ALT) {
            log.info(totalTrackedAsSwapRefAlt, " variants were lifted by swapping REF/ALT alleles.");
        } else {
            log.warn(totalTrackedAsSwapRefAlt, " variants with a swapped REF/ALT were identified, but were not recovered. " +
                    "See RECOVER_SWAPPED_REF_ALT and associated caveats.");
        }

        rejectedRecords.close();
        in.close();

        if (!DISABLE_SORT) {
            ////////////////////////////////////////////////////////////////////////
            // Write the sorted outputs to the final output file
            ////////////////////////////////////////////////////////////////////////
            sorter.doneAdding();
            progress = new ProgressLogger(log, 1000000, "written");
            log.info("Writing out sorted records to final VCF.");

            for (final VariantContext ctx : sorter) {
                this.acceptedRecords.add(ctx);
                progress.record(ctx.getContig(), ctx.getStart());
            }
            sorter.cleanup();
        }
        this.acceptedRecords.close();

        return 0;
    }

    private void rejectVariant(final VariantContext ctx, final String reason) {
        rejectedRecords.add(new VariantContextBuilder(ctx).filter(reason).make());
        failedLiftover++;
        trackLiftedVariantContig(rejectsByContig, ctx.getContig());
    }

    private void trackLiftedVariantContig(final Map<String, Long> map, final String contig) {
        Long val = map.get(contig);
        if (val == null) {
            val = 0L;
        }
        map.put(contig, ++val);
    }

    private void addAndTrack(final VariantContext toAdd, final VariantContext source) {
        trackLiftedVariantContig(liftedBySourceContig, source.getContig());
        trackLiftedVariantContig(liftedByDestContig, toAdd.getContig());
        if (!DISABLE_SORT) {
            // we're sorting the variants
            sorter.add(toAdd);
        } else {
            this.acceptedRecords.add(toAdd);
        }
    }

    /**
     * Utility function to attempt to add a variant. Checks that the reference allele still matches the reference (which may have changed).
     *
     * @param vc     new {@link VariantContext}
     * @param refSeq {@link ReferenceSequence} of new reference
     * @param source the original {@link VariantContext} to use for putting the original location information into vc
     */
    private void tryToAddVariant(final VariantContext vc, final ReferenceSequence refSeq, final VariantContext source) {
        if (!refSeq.getName().equals(vc.getContig())) {
            throw new IllegalStateException("The contig of the VariantContext, " + vc.getContig() + ", doesn't match the ReferenceSequence: " + refSeq.getName());
        }

        // Check that the reference allele still agrees with the reference sequence
        boolean mismatchesReference = false;
        for (final Allele allele : vc.getAlleles()) {
            if (allele.isReference()) {
                final byte[] ref = refSeq.getBases();
                final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd() - vc.getStart() + 1);

                if (!refString.equalsIgnoreCase(allele.getBaseString())) {
                    // consider that the ref and the alt may have been swapped in a simple biallelic SNP
                    if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
                        if (RECOVER_SWAPPED_REF_ALT) {
                            totalTrackedAsSwapRefAlt++;
                            addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source);
                            return;
                        } else {
                            totalTrackedAsSwapRefAlt++;
                        }
                    }
                    mismatchesReference = true;
                }
                break;
            }
        }

        if (mismatchesReference) {
            rejectedRecords.add(new VariantContextBuilder(source)
                    .filter(FILTER_MISMATCHING_REF_ALLELE)
                    .attribute(ATTEMPTED_LOCUS, String.format("%s:%d-%d", vc.getContig(), vc.getStart(), vc.getEnd()))
                    .attribute(ATTEMPTED_ALLELES, vc.getReference().toString() + "->" + String.join(",", vc.getAlternateAlleles().stream().map(Allele::toString).collect(Collectors.toList())))
                    .make());
            failedAlleleCheck++;
            trackLiftedVariantContig(rejectsByContig, source.getContig());
        } else {
            addAndTrack(vc, source);
        }
    }
}
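Beyond the java -jar invocation in the Javadoc above, a Picard CommandLineProgram can also be driven programmatically. The following is a minimal sketch, assuming the picard jar is on the classpath; the wrapper class name and file paths are placeholders, while instanceMain is the entry point inherited from picard.cmdline.CommandLineProgram:

import picard.vcf.LiftoverVcf;

public class RunLiftoverVcf {
    public static void main(final String[] args) {
        // Placeholder paths -- substitute real inputs; argument names mirror
        // the @Argument fields declared in LiftoverVcf above.
        final String[] picardArgs = {
                "I=input.vcf",
                "O=lifted_over.vcf",
                "CHAIN=b37tohg38.chain",
                "REJECT=rejected_variants.vcf",
                "R=reference_sequence.fasta"
        };
        // instanceMain parses the arguments and invokes doWork(); a non-zero return
        // value indicates failure (for example EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE
        // when a chain contig is absent from the target reference).
        final int exitCode = new LiftoverVcf().instanceMain(picardArgs);
        System.exit(exitCode);
    }
}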




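The coordinate mapping inside doWork() is delegated to htsjdk's LiftOver class via the liftOver(source, minMatch) call seen in the loop above. Here is a minimal standalone sketch of just that step; the chain file name and coordinates are placeholders, and 1.0 mirrors the LIFTOVER_MIN_MATCH default:

import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.util.Interval;

import java.io.File;

public class LiftOverSketch {
    public static void main(final String[] args) {
        // Placeholder chain file and source interval.
        final LiftOver liftOver = new LiftOver(new File("b37tohg38.chain"));
        final Interval source = new Interval("1", 1000000, 1000000);

        // liftOver returns null when no chain maps the interval at the required
        // match fraction -- the case LiftoverVcf rejects with the NoTarget filter.
        final Interval target = liftOver.liftOver(source, 1.0);
        if (target == null) {
            System.out.println("could not lift over " + source);
        } else {
            System.out.println("lifted to " + target.getContig() + ":" + target.getStart() + "-" + target.getEnd());
        }
    }
}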
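On the Memory Use caveat: MAX_RECORDS_IN_RAM is a standard Picard option inherited from CommandLineProgram, so it can be passed alongside the tool's own arguments. An illustrative invocation, following the Javadoc example above (the heap size and the 100,000 figure are examples, not tuned recommendations):

java -Xmx8g -jar picard.jar LiftoverVcf \
    I=input.vcf \
    O=lifted_over.vcf \
    CHAIN=b37tohg38.chain \
    REJECT=rejected_variants.vcf \
    R=reference_sequence.fasta \
    MAX_RECORDS_IN_RAM=100000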