All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.validation.validate-reads-spark-pipeline.sh Maven / Gradle / Ivy

The newest version!
#!/usr/bin/env bash

# This script takes a bam (plus context), rolls back the effects of MarkDuplicates and BQSR, and runs the BAM
# through Picard/GATK 3 and GATK 4 and compares the legacy results with GATK 4.
#
# This requires that PICARD_JAR and GATK3_JAR be set.
#
# Example usage
# ./validate-reads-spark-pipeline.sh \
#   src/test/resources/org/broadinstitute/hellbender/tools/validation/CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.bam \
#   src/test/resources/large/human_g1k_v37.20.21.fasta \
#   src/test/resources/large/human_g1k_v37.20.21.2bit \
#   src/test/resources/large/dbsnp_138.b37.20.21.vcf \
#   [optionally, throwOnDiff]
#   [optionally, saveIntermediateFiles]

set -ex

# Find the path to the binary
if [ -z "$GATK_HOME" ]; then
  GATK_HOME=`pwd`
fi

if [ -z "$PICARD_JAR" ]; then
  echo "ERROR: PICARD_JAR must be set to the Picard jar, you can get it here: http://broadinstitute.github.io/picard/"  1>&2
  exit 1
fi

if [ -z "$GATK3_JAR" ]; then
  echo "ERROR: GATK3_JAR must be set to the GATK 3 jar, you can get it here: https://www.broadinstitute.org/gatk/download/"  1>&2
  exit 1
fi

if [ "$#" -lt 4 ]; then
    echo "Wrong number of parameters, needs:     [throwOnDiff] [saveIntermediateBams]"
fi

CRASH_ON_DIFF="false"
if [ "$#" -ge 5 ] && [ "$5" == "throwOnDiff" ]; then
    CRASH_ON_DIFF="true"
fi
if [ "$#" -ge 6 ] && [ "$6" == "throwOnDiff" ]; then
    CRASH_ON_DIFF="true"
fi

SAVE_INTERMEDIATE_FILES="false"
if [ "$#" -ge 5 ] && [ "$5" == "saveIntermediateFiles" ]; then
    SAVE_INTERMEDIATE_FILES="true"
fi
if [ "$#" -ge 6 ] && [ "$6" == "saveIntermediateFiles" ]; then
    SAVE_INTERMEDIATE_FILES="true"
fi

GATK4_JAR="${GATK_HOME}/build/install/gatk/bin/gatk"

TMP_FILE=${1%.bam}
FILE_WO_EXTENSION=${TMP_FILE##*/}


# Copy the bam
TEMP_BAM=tmp.${FILE_WO_EXTENSION}.bam
UNMARKED_BAM=tmp.unmarked.${FILE_WO_EXTENSION}.bam
CLEAN_BAM=tmp.clean.${FILE_WO_EXTENSION}.bam
MARKED_BAM=tmp.marked.${FILE_WO_EXTENSION}.bam
GATK4_BAM=tmp.gatk4.${FILE_WO_EXTENSION}.bam
RECAL_TABLE=tmp.${FILE_WO_EXTENSION}.recal.table

PICARD_MARKED_BAM=tmp.picard.marked.${FILE_WO_EXTENSION}.bam
LEGACY_RECAL_TABLE=tmp.legacy.${FILE_WO_EXTENSION}.recal.table
LEGACY_BAM=tmp.legacy.${FILE_WO_EXTENSION}.bam

cp $1 ${TEMP_BAM}
echo ${TEMP_BAM}



echo "################################################"
echo "###### Undoing MarkDuplicates and BQSR #########"
echo "################################################"
# Undo MarkDuplicates
${GATK4_JAR} UnmarkDuplicates -I ${TEMP_BAM} -O ${UNMARKED_BAM}

# Undo BQSR
${GATK4_JAR} RevertBaseQualityScores -I ${UNMARKED_BAM} -O ${CLEAN_BAM}


echo "################################################"
echo "############## Run GATK 4 tools ################"
echo "################################################"
# Run MarkDuplicates
${GATK4_JAR} MarkDuplicatesSpark -I ${CLEAN_BAM} -O ${MARKED_BAM} --metrics-file=tmp.metrics -DS SUM_OF_BASE_QUALITIES

# Run BQSR
${GATK4_JAR} BaseRecalibratorSpark -I ${MARKED_BAM} -R $3 -knownSites $4 -O ${RECAL_TABLE}
${GATK4_JAR} ApplyBQSRSpark -bqsr_recal_file ${RECAL_TABLE} -I ${MARKED_BAM} -O ${GATK4_BAM}



echo "################################################"
echo "########### Running legacy tools ###############"
echo "################################################"
# Run Picard MarkDuplicates
java -jar ${PICARD_JAR} MarkDuplicates I=${CLEAN_BAM} O=${PICARD_MARKED_BAM} METRICS_FILE=tmp.picard.metrics VALIDATION_STRINGENCY=LENIENT CREATE_INDEX=true PROGRAM_RECORD_ID=null DS=SUM_OF_BASE_QUALITIES

# Run GATK 3 BQSR
java -jar ${GATK3_JAR} -T BaseRecalibrator -I ${PICARD_MARKED_BAM} -R $2 -knownSites $4 -o ${LEGACY_RECAL_TABLE}
java -jar ${GATK3_JAR} -T PrintReads -R $2 -BQSR ${LEGACY_RECAL_TABLE} -I ${PICARD_MARKED_BAM} -o ${LEGACY_BAM} -DIQ

echo "################################################"
echo "############### Run diff tools #################"
echo "################################################"
${GATK4_JAR} CompareDuplicatesSpark -I ${LEGACY_BAM} -I2 ${GATK4_BAM} -cd ${CRASH_ON_DIFF}

${GATK4_JAR} CompareBaseQualities -cd ${CRASH_ON_DIFF} --VALIDATION_STRINGENCY SILENT ${LEGACY_BAM} ${GATK4_BAM} 

if [ "$SAVE_INTERMEDIATE_FILES" == "false" ]; then
    echo "deleting temp results"
    rm tmp.*.bam tmp.*.bai tmp.metrics tmp.picard.metrics .tmp*
fi




© 2015 - 2025 Weber Informatics LLC | Privacy Policy