All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.biodata.tools.alignment.BamUtils Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/*
 * 
 *
 */

package org.opencb.biodata.tools.alignment;

import htsjdk.samtools.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.broad.igv.bbfile.*;
import org.opencb.biodata.models.alignment.RegionCoverage;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.tools.alignment.exceptions.AlignmentCoverageException;
import org.opencb.biodata.tools.feature.BigWigManager;
import org.opencb.commons.utils.CollectionUtils;
import org.opencb.commons.utils.FileUtils;

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Created by pfurio on 25/10/16.
 */
public class BamUtils {

    /**
     * Adjusts the quality value for optimized 8-level mapping quality scores.
     * Quality range -> Mapped quality
     * 1     ->  1
     * 2-9   ->  6
     * 10-19 ->  15
     * 20-24 ->  22
     * 25-29 ->  27
     * 30-34 ->  33
     * 35-39 ->  27
     * >=40  ->  40
     * Read more: http://www.illumina.com/documents/products/technotes/technote_understanding_quality_scores.pd
     *
     * @param quality original quality
     * @return Adjusted quality
     */
    public static int adjustQuality(int quality) {
        final int adjustedQuality;

        if (quality <= 1) {
            adjustedQuality = quality;
        } else {
            int qualRange = quality / 5;
            switch (qualRange) {
                case 0:
                case 1:
                    adjustedQuality = 6;
                    break;
                case 2:
                case 3:
                    adjustedQuality = 15;
                    break;
                case 4:
                    adjustedQuality = 22;
                    break;
                case 5:
                    adjustedQuality = 27;
                    break;
                case 6:
                    adjustedQuality = 33;
                    break;
                case 7:
                    adjustedQuality = 37;
                    break;
                case 8:
                default:
                    adjustedQuality = 40;
                    break;
            }
        }
        return adjustedQuality;
    }

    public static List adjustQuality(List qualities) {
        List adjustedQualities = new ArrayList<>(qualities.size());
        for (Integer quality : qualities) {
            adjustedQualities.add(adjustQuality(quality));
        }
        return adjustedQualities;
    }

    public static SAMFileHeader getFileHeader(Path input) throws IOException {
        FileUtils.checkFile(input);

        SamReaderFactory srf = SamReaderFactory.make();
        srf.validationStringency(ValidationStringency.LENIENT);
        SamReader reader = srf.open(SamInputResource.of(input.toFile()));
        SAMFileHeader fileHeader = reader.getFileHeader();
        reader.close();

        return fileHeader;
    }

    /**
     * Check if the file is a sorted binary bam file.
     *
     * @param is          Bam InputStream
     * @param bamFileName Bam FileName
     * @throws IOException
     */
    public static void checkBamOrCramFile(InputStream is, String bamFileName) throws IOException {
        checkBamOrCramFile(is, bamFileName, true);
    }

    /**
     * Check if the file is a sorted binary bam file.
     *
     * @param is          Bam InputStream
     * @param bamFileName Bam FileName
     * @param checkSort
     * @throws IOException
     */
    public static void checkBamOrCramFile(InputStream is, String bamFileName, boolean checkSort) throws IOException {
        SamReaderFactory srf = SamReaderFactory.make();
        srf.validationStringency(ValidationStringency.LENIENT);

        SamReader reader = srf.open(SamInputResource.of(is));
        SAMFileHeader fileHeader = reader.getFileHeader();
        SAMFileHeader.SortOrder sortOrder = fileHeader.getSortOrder();
        reader.close();

        if (reader.type().equals(SamReader.Type.SAM_TYPE)) {
            throw new IOException("Expected binary SAM file. File " + bamFileName + " is not binary.");
        }

        if (checkSort) {
            switch (sortOrder) {
                case coordinate:
                    break;
                case queryname:
                case unsorted:
                default:
                    throw new IOException("Expected sorted file. File '" + bamFileName + "' is not sorted by coordinates("
                            + sortOrder.name() + ")");
            }
        }
    }

    /**
     * Return the coverage average given a window size from the BigWig file passed.
     * @param region Region from which return the coverage
     * @param windowSize Window size to average
     * @param bigwigPath BigWig path with coverage
     * @return One average score per window size spanning the region
     * @throws IOException If any error happens reading BigWig file
     */
    public static RegionCoverage getCoverageFromBigWig(Region region, int windowSize, Path bigwigPath) throws IOException {
        FileUtils.checkFile(bigwigPath);

        BigWigManager bigWigManager = new BigWigManager(bigwigPath);
        double[] avgCoverage = bigWigManager.groupBy(region, windowSize);
        return new RegionCoverage(region, windowSize, avgCoverage);
    }

    /**
     * Write in wig file format the coverage for the region given. It uses fixedStep with step equals to 1.
     *
     * @param regionCoverage    Region containing the coverage values
     * @param span              Span (to group coverage contiguous values in a mean coverage)
     * @param header            Flag, to write a header line (assuming fixedStep, and start=1 and step=1)
     * @param writer            File writer
     */
    public static void printWigFormatCoverage(RegionCoverage regionCoverage, int span, boolean header, PrintWriter writer) {
        // sanity check
        if (span < 1) {
            span = 1;
        }
        if (header) {
            writer.println("fixedStep chrom=" + regionCoverage.getChromosome() + " start=1 step=1 span=" + span);
        }
        double[] values = regionCoverage.getValues();
        if (span == 1) {
            for (int i = 0; i < values.length; i++) {
                writer.println(values[i]);
            }
        } else {
            int counter = 0;
            int sum = 0;
            for (int i = 0; i < values.length; i++) {
                counter++;
                sum += values[i];
                if (counter == span) {
                    writer.println(sum / counter);
                    counter = 0;
                    sum = 0;
                }
            }
            if (counter > 0) {
                writer.println(sum / counter);
            }
        }
    }

    /**
     * Utility to compute the coverage from a BAM file, and create a wig format file with the coverage values according
     * to a window size (i.e., span in wig format specification).
     *
     * @param bamPath           BAM file
     * @param coveragePath      Wig file name where to save coverage values
     * @param span              Span (to group coverage contiguous values in a mean coverage)
     * @throws IOException
     */
    public static void createCoverageWigFile(Path bamPath, Path coveragePath, int span) throws IOException, AlignmentCoverageException {
        SAMFileHeader fileHeader = BamUtils.getFileHeader(bamPath);

        AlignmentOptions options = new AlignmentOptions();
        options.setContained(false);

        BamManager alignmentManager = new BamManager(bamPath);
        Iterator iterator = fileHeader.getSequenceDictionary().getSequences().iterator();
        PrintWriter writer = new PrintWriter(coveragePath.toFile());
        // chunkSize = 100000 (too small, it takes loooooong...)
        int chunkSize = Math.max(span, 200000 / span * span);
        while (iterator.hasNext()) {
            SAMSequenceRecord next = iterator.next();
            for (int i = 0; i < next.getSequenceLength(); i += chunkSize) {
                Region region = new Region(next.getSequenceName(), i + 1, Math.min(i + chunkSize, next.getSequenceLength()));
                RegionCoverage regionCoverage = alignmentManager.coverage(region, null, options);
                printWigFormatCoverage(regionCoverage, span, (i == 0), writer);
            }
        }
        writer.close();
    }

    public static void validateRegion(Region region, SamReader samReader) {
        String chrom = region.getChromosome();
        if (StringUtils.isEmpty(chrom)) {
            throw new IllegalArgumentException("Missing chromosome for region: " + region.toString());
        }

        SAMSequenceDictionary sequenceDictionary = samReader.getFileHeader().getSequenceDictionary();
        if (sequenceDictionary.getSequenceIndex(chrom) == -1) {
            if (chrom.startsWith("chr")) {
                chrom = chrom.replace("chr", "");
                if (sequenceDictionary.getSequenceIndex(chrom) == -1) {
                    throw new IllegalArgumentException("Unknown chromosome: " + region.getChromosome());
                } else {
                    region.setChromosome(chrom);
                }
            } else {
                if (sequenceDictionary.getSequenceIndex("chr" + chrom) == -1) {
                    throw new IllegalArgumentException("Unknown chromosome: " + region.getChromosome());
                } else {
                    region.setChromosome("chr" + chrom);
                }
            }
        }
    }

    /**
     * Return a list of RegionCoverage with a coverage less than or equal to the input maximum coverage.
     * @param coverageRegion
     * @param maxCoverage
     * @return
     * @throws IOException
     */
    public static List filterByCoverage(RegionCoverage coverageRegion, int minCoverage, int maxCoverage) {
        List selectedRegions = new ArrayList<>();

        double[] coverages = new double[coverageRegion.size()];
        int i = 0;
        int pos = coverageRegion.getStart();
        boolean isProcessing = false;
        RegionCoverage uncoveredRegion = null;
        for (double coverage: coverageRegion.getValues()) {
            if (coverage >= minCoverage && coverage <= maxCoverage) {
                if (!isProcessing) {
                    uncoveredRegion = new RegionCoverage(coverageRegion.getChromosome(), pos, 0);
                    isProcessing = true;
                    i = 0;
                }
                coverages[i] = coverage;
                i++;
            } else {
                if (isProcessing) {
                    uncoveredRegion.setEnd(pos - 1);
                    uncoveredRegion.setValues(Arrays.copyOf(coverages, i));
                    uncoveredRegion.updateStats();
                    selectedRegions.add(uncoveredRegion);
                    isProcessing = false;
                }
            }
            pos++;
        }

        // Check if a uncovered region is still processing
        if (isProcessing) {
            uncoveredRegion.setEnd(pos - 1);
            uncoveredRegion.setValues(Arrays.copyOf(coverages, i));
            uncoveredRegion.updateStats();
            selectedRegions.add(uncoveredRegion);
        }

        return selectedRegions;
    }

    /*

import math
import csv
CANONIC_CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5',
                       'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
                       'chr11', 'chr12', 'chr13', 'chr14',
                       'chr15', 'chr16', 'chr17', 'chr18',
                       'chr19', 'chr20', 'chr21', 'chr22',
                       'chrX', 'chrY']
def process_coverage(cov_bw_tumour_fpath, cov_bw_normal_fpath, output_dir):
    """
    Calculate coverage to be used posteriorly for some tracks in circos plot

    :type cov_bw_tumour_fpath: str
    :param cov_bw_tumour_fpath: bigWig file path for tumour sample

    :type cov_bw_normal_fpath: str
    :param cov_bw_normal_fpath: bigWig file path for normal sample

    :type output_dir: str
    :param output_dir: output folder path
    """
    # Calculating ratio of normalised coverage
    produce_normalised_coverage(cov_bw_tumour_fpath, cov_bw_normal_fpath,
                                output_dir)
def get_region_norm_cov(region_counts_tumour, region_counts_normal,
                        total_counts_tumour, total_counts_normal):
    """
    Calculate ratio of normalised depth of coverage in log2 scale for a region

    :type region_counts_tumour: int
    :param region_counts_tumour: counts for region in tumour

    :type region_counts_normal: int
    :param region_counts_normal: counts for region in normal

    :type total_counts_tumour: int
    :param total_counts_tumour: total counts for tumour

    :type total_counts_normal: int
    :param total_counts_normal: total counts for normal
    """
    # Rescaling both coverages
    rescaled_tumour_cov = (region_counts_tumour/float(total_counts_tumour))*1000
    rescaled_normal_cov = (region_counts_normal/float(total_counts_normal))*1000
    # Returning None if tumour or normal coverage are low
    if (float(region_counts_normal)/100000) < 15:  # Raw mean cov is < 15
        return None
    if rescaled_normal_cov == 0:
        return None
    # Getting coverage ratio
    cov_ratio = float(rescaled_tumour_cov)/float(rescaled_normal_cov)
    if cov_ratio == 0:
        return None
    # Getting normalised coverage (log2)
    # log2(x) = log10(x) / log10(2)
    coverage_log2 = math.log10(cov_ratio)/math.log10(2)
    if coverage_log2 > 4:  # cov_ratio > 16
        return 4
    return coverage_log2

def get_total_counts(coverage_bw_fpath):
    """
    Get bigWig total coverage

    :type: coverage_bw_fpath: file
    :param coverage_bw_fpath: bigWig file
    """
    cov_fhand = open(coverage_bw_fpath, 'r')
    total_coverage = 0
    for line in cov_fhand:
        if line.startswith('#'):  # Skip header
            continue
        line = line.strip().split("\t")
        total_coverage += (float(line[7]) * float(line[3]))  # (size * mean)
    cov_fhand.close()
    return total_coverage

def sort_canonic_bigwig(bw_fpath, header=True):
    """
    Sort a bigWig file by canonic chromosome and start
    :type bw_fpath: str
    :param bw_fpath: bigwig file path
    :type header: bool
    :param header: indicates if file has header
    """
    bw_fhand = open(bw_fpath, 'r')
    csv_file = csv.reader(bw_fhand, delimiter='\t')
    # Returning header
    if header:
        yield '\t'.join(next(csv_file))
    # Sorting file by chromosome and start
    sort = sorted(csv_file, key=lambda x: (x[0], int(x[1])))
    for line in sort:
        # Removing non-canonic chromosomes
        if line[0] not in CANONIC_CHROMOSOMES:
            continue
        yield '\t'.join(line)
    bw_fhand.close()

def produce_normalised_coverage(cov_bw_tumour_fpath, cov_bw_normal_fpath,
                                output_dir):
    """
    Given tumour and normal coverage bigWig files, calculate ratio of
    normalised depth of coverage in log2 scale

    :type cov_bw_tumour_fpath: str
    :param cov_bw_tumour_fpath: bigWig file path for tumour sample

    :type cov_bw_normal_fpath: str
    :param cov_bw_normal_fpath: bigWig file path for normal sample

    :type output_dir: str
    :param output_dir: output folder path
    """
    # Output file
    cov_norm = open(output_dir + '/coverage.txt', 'w')
    # Getting total coverage for both coverage files
    total_counts_tumour = get_total_counts(cov_bw_tumour_fpath)
    total_counts_normal = get_total_counts(cov_bw_normal_fpath)
    # Opening coverage input files
    file_a = sort_canonic_bigwig(cov_bw_tumour_fpath)
    file_b = sort_canonic_bigwig(cov_bw_normal_fpath)
    # Skipping headers
    next(file_a)
    next(file_b)
    # Iterating over the two files at the same time
    for line_a in file_a:
        a_data = bw_line_to_dict(line_a)  # Current line in a
        b_data = bw_line_to_dict(next(file_b))  # Current line in b
        # Checking that we are in the same region in both files
        assert a_data['chr'] == b_data['chr']
        assert a_data['start'] == b_data['start']
        # Get ratio of normalised coverage
        normalised = get_region_norm_cov(a_data["counts"], b_data["counts"],
                                         total_counts_tumour,
                                         total_counts_normal)
        if normalised is not None:
            cov_norm.write(str(a_data['chr']).replace("chr", "hs") + " " +
                           str(a_data['start']) + " " +
                           str(a_data['end']) + " " +
                           str(normalised) + "\n")
    cov_norm.close()
def bw_line_to_dict(bw_line):
    """
    Given a bigWig file, return a dictionary with some of its data
    :type bw_line: str
    :param bw_line: line from bigWig file
    """
    bw_line = bw_line.strip().split('\t')
    bw_line_data = {
        'chr': bw_line[0],
        'start': int(bw_line[1]),
        'end': int(bw_line[2]),
        'size': int(bw_line[3]),
        'mean': float(bw_line[7])
    }
    bw_line_data['counts'] = (float(bw_line_data['mean']) *
                              float(bw_line_data['size']))
    return bw_line_data
     */

    public static RegionCoverage coverageRatio(RegionCoverage coverage1, long totalCounts1, RegionCoverage coverage2, long totalCounts2,
                                               boolean applyLog) throws AlignmentCoverageException {
        if (coverage1 != null && coverage1.getValues() != null && coverage2 != null && coverage2.getValues() != null
                && coverage1.getWindowSize() == coverage2.getWindowSize() && coverage1.getValues().length == coverage2.getValues().length) {

            double values[] = new double[coverage1.getValues().length];

            double log10_2 = Math.log10(2);

            for (int i = 0; i < values.length; i++) {
                // Checking if tumour or normal coverage are low, raw mean cov is >= 15
                // Rescaling both coverages
                double rescaledCoverage1 = coverage1.getValues()[i] / totalCounts1;
                double rescaledCoverage2 = coverage2.getValues()[i] / totalCounts2;

                if (rescaledCoverage1 > 0 && rescaledCoverage2 > 0) {
                    // Getting coverage ratio
                    double covRatio = rescaledCoverage1 / rescaledCoverage2;
                    if (applyLog) {
                        // Getting normalised coverage (log2)
                        // log2(x) = log10(x) / log10(2)

                        values[i] = Math.log10(covRatio) / log10_2;
                    } else {
                        values[i] = covRatio;
                    }
                } else {
                    values[i] = Double.NaN;
                }
            }

            Region region = new Region(coverage1.getChromosome(), coverage1.getStart(), coverage1.getEnd());
            return new RegionCoverage(region, coverage1.getWindowSize(), values);
        } else {
            throw new AlignmentCoverageException("Something wrong computing coverage ratio");
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy