All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.biodata.tools.alignment.BamManager Maven / Gradle / Ivy

/*
 * 
 *
 */

package org.opencb.biodata.tools.alignment;

import ga4gh.Reads;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.FastaSequenceIndexCreator;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.Log;
import org.apache.commons.collections.CollectionUtils;
import org.ga4gh.models.ReadAlignment;
import org.opencb.biodata.formats.alignment.AlignmentConverter;
import org.opencb.biodata.models.alignment.AlignmentHeader;
import org.opencb.biodata.models.alignment.RegionCoverage;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.tools.alignment.coverage.SamRecordRegionCoverageCalculator;
import org.opencb.biodata.tools.alignment.exceptions.AlignmentCoverageException;
import org.opencb.biodata.tools.alignment.filters.AlignmentFilters;
import org.opencb.biodata.tools.alignment.iterators.BamIterator;
import org.opencb.biodata.tools.alignment.iterators.SAMRecordToAvroReadAlignmentBamIterator;
import org.opencb.biodata.tools.alignment.iterators.SAMRecordToProtoReadAlignmentBamIterator;
import org.opencb.biodata.tools.alignment.iterators.SamRecordBamIterator;
import org.opencb.biodata.tools.alignment.stats.AlignmentGlobalStats;
import org.opencb.biodata.tools.alignment.stats.SamRecordAlignmentGlobalStatsCalculator;
import org.opencb.commons.utils.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Created by imedina on 14/09/15.
 */
public class BamManager implements AutoCloseable {

    private Path bamFile;
    private Path refFile;
    private SamReader samReader;

    public static final int DEFAULT_WINDOW_SIZE = 1;
    public static final int MAX_NUM_RECORDS = 50000;
    public static final int MAX_REGION_COVERAGE = 100000;
    public static final String COVERAGE_BIGWIG_EXTENSION = ".bw";

    private Logger logger;

    public BamManager(Path bamFilePath) throws IOException {
        this(bamFilePath, null);
    }

    public BamManager(Path bamFilePath, Path refFilePath) throws IOException {
        this.bamFile = bamFilePath;
        this.refFile = refFilePath;

        this.init();
    }

    private void init() throws IOException {
        FileUtils.checkFile(bamFile);

        if (this.samReader == null) {
            SamReaderFactory srf = SamReaderFactory.make();
            srf.validationStringency(ValidationStringency.LENIENT);
            if (bamFile.toString().endsWith("cram")) {
                if (refFile == null) {
                    throw new IOException("Missing reference file for CRAM file " + bamFile);
                } else {
                    FileUtils.checkFile(refFile);
                    srf.referenceSequence(refFile);
                }
            }
            this.samReader = srf.open(SamInputResource.of(bamFile.toFile()));

        }

        logger = LoggerFactory.getLogger(BamManager.class);
    }

    /**
     * Creates a index file for the BAM or CRAM input file.
     * @return The path of the index file.
     * @throws IOException
     */
    public Path createIndex() throws IOException {
        String ext = ".bai";
        if (bamFile.toString().endsWith(".cram")) {
            ext = ".crai";
        }
        Path indexPath = bamFile.getParent().resolve(bamFile.getFileName().toString() + ext);
        return createIndex(indexPath);
    }

    /**
     * Creates a BAM/CRAM index file.
     * @param outputIndex The bai index file to be created.
     * @return
     * @throws IOException
     */
    public Path createIndex(Path outputIndex) throws IOException {
        FileUtils.checkDirectory(outputIndex.toAbsolutePath().getParent(), true);

        SamReaderFactory srf = SamReaderFactory.make().enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS);
        srf.validationStringency(ValidationStringency.LENIENT);

        // If a reference file is provided, use it (this is mandatory for CRAM files)
        // If the input file is CRAM, at this point, we are sure that refFile is not null (this condition is checked
        // in the init function)
        if (refFile != null) {
            srf.referenceSequence(refFile);
        }

        try (SamReader reader = srf.open(SamInputResource.of(bamFile.toFile()))) {
            // Files need to be sorted by coordinates to create the index
            SAMFileHeader.SortOrder sortOrder = reader.getFileHeader().getSortOrder();
            if (!sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
                throw new IOException("Sorted file expected. File '" + bamFile.toString()
                        + "' is not sorted by coordinates (" + sortOrder.name() + ")");
            }

            if (reader.type().equals(SamReader.Type.BAM_TYPE)) {
                BAMIndexer.createIndex(reader, outputIndex.toFile(), Log.getInstance(BamManager.class));
            } else {
                if (reader.type().equals(SamReader.Type.CRAM_TYPE)) {
                    // First, index the CRAM file, it will produce a CRAI file
                    SeekableStream streamFor = SeekableStreamFactory.getInstance().getStreamFor(bamFile.toString());
                    CRAMCRAIIndexer.writeIndex(streamFor, new FileOutputStream(outputIndex.toFile()));

                    // Second, index the reference FASTA file, it will produce a FAI file
                    FastaSequenceIndexCreator.create(refFile, true);
                } else {
                    throw new IOException("This is not a BAM or CRAM file. SAM files cannot be indexed");
                }
            }
        }
        return outputIndex;
    }

    public AlignmentHeader getHeader(String studyId) throws IOException {
        init();
        return AlignmentConverter.buildAlignmentHeader(samReader.getFileHeader(), studyId);
    }

    public Path calculateBigWigCoverage() throws IOException {
        return calculateBigWigCoverage(Paths.get(this.bamFile.toFile().getAbsolutePath() + COVERAGE_BIGWIG_EXTENSION), DEFAULT_WINDOW_SIZE);
    }

    public Path calculateBigWigCoverage(Path bigWigPath, int windowSize) throws IOException {
        checkBaiFileExists();
        FileUtils.checkDirectory(bigWigPath.toAbsolutePath().getParent(), true);

        // Check windowsSize is valid, it must be greater or equal than 1
        if (windowSize < 1) {
            windowSize = DEFAULT_WINDOW_SIZE;
        }
        String windowSizeStr = String.valueOf(windowSize);

        // Execute the bamCoverage utility from deepTools package, assuming it is installed in the system
        // deepTools installation: pip install deepTools
        ProcessBuilder processBuilder = new ProcessBuilder(
                Arrays.asList("bamCoverage", "-b", bamFile.toString(), "-o", bigWigPath.toString(), "-of", "bigwig", "-bs", windowSizeStr));
        Process p = processBuilder.start();
        BufferedReader processBufferedReader = new BufferedReader(new InputStreamReader(p.getInputStream()));
        String line;
        while ((line = processBufferedReader.readLine()) != null) {
            logger.info(line);
        }

        return bigWigPath;
    }


    public String header() {
        return samReader.getFileHeader().getTextHeader();
    }

    public byte[] compressedHeader() {
        OutputStream outputStream = new ByteArrayOutputStream();
        BAMFileWriter.writeHeader(outputStream, samReader.getFileHeader());
        return ((ByteArrayOutputStream) outputStream).toByteArray();
    }

    /*
     * These methods aim to provide a very simple, safe and quick way of accessing to a small fragment of the BAM/CRAM file.
     * This must not be used in production for reading big data files. It returns a maximum of 50,000 SAM records,
     * you can use iterator methods for reading more reads.
     *
     */
    public List query(AlignmentFilters filters) throws IOException {
        return query(null, filters, null, SAMRecord.class);
    }

    public List query(AlignmentFilters filters, AlignmentOptions options) throws IOException {
        return query(null, filters, options, SAMRecord.class);
    }

    public  List query(AlignmentFilters filters, AlignmentOptions options, Class clazz) throws IOException {
        return query(null, filters, options, clazz);
    }

    public List query(Region region) throws IOException {
        return query(region, null, new AlignmentOptions(), SAMRecord.class);
    }

    public List query(Region region, AlignmentOptions options) throws IOException {
        return query(region, null, options, SAMRecord.class);
    }

    public List query(Region region, AlignmentFilters filters, AlignmentOptions options) throws IOException {
        return query(region, filters, options, SAMRecord.class);
    }

    public  List query(Region region, AlignmentFilters filters, AlignmentOptions options, Class clazz) throws IOException {
        if (options == null) {
            options = new AlignmentOptions();
        }

        // Set number of returned records up to MAX_NUM_RECORDS, if not set then MAX_NUM_RECORDS is returned
        int maxNumberRecords = MAX_NUM_RECORDS;
        if (options.getLimit() > 0) {
            maxNumberRecords = Math.min(options.getLimit(), MAX_NUM_RECORDS);
        }

        List results = new ArrayList<>(maxNumberRecords);
        BamIterator bamIterator = (region != null)
                ? iterator(region, filters, options, clazz)
                : iterator(filters, options, clazz);

        while (bamIterator.hasNext() && results.size() < maxNumberRecords) {
            results.add(bamIterator.next());
        }
        bamIterator.close();
        return results;
    }


    /*
     * These methods aim to provide a very simple, safe and quick way of iterating BAM/CRAM files.
     */
    public BamIterator iterator() throws IOException {
        return iterator(null, new AlignmentOptions(), SAMRecord.class);
    }

    public BamIterator iterator(AlignmentOptions options) throws IOException {
        return iterator(null, options, SAMRecord.class);
    }

    public BamIterator iterator(AlignmentFilters filters, AlignmentOptions options) throws IOException {
        return iterator(filters, options, SAMRecord.class);
    }

    public  BamIterator iterator(AlignmentFilters filters, AlignmentOptions options, Class clazz) throws IOException {
        checkBaiFileExists();

        SAMRecordIterator samRecordIterator = samReader.iterator();
        return getAlignmentIterator(filters, options, clazz, samRecordIterator);
    }

    public BamIterator iterator(Region region) throws IOException {
        return iterator(region, null, new AlignmentOptions(), SAMRecord.class);
    }

    public BamIterator iterator(Region region, AlignmentOptions options) throws IOException {
        return iterator(region, null, options, SAMRecord.class);
    }

    public BamIterator iterator(Region region, AlignmentFilters filters, AlignmentOptions options) throws IOException {
        return iterator(region, filters, options, SAMRecord.class);
    }

    public  BamIterator iterator(Region region, AlignmentFilters filters, AlignmentOptions options, Class clazz)
            throws IOException {
        checkBaiFileExists();

        if (options == null) {
            options = new AlignmentOptions();
        }
        // Sanity check
        BamUtils.validateRegion(region, samReader);

        SAMRecordIterator samRecordIterator =
                samReader.query(region.getChromosome(), region.getStart(), region.getEnd(), options.isContained());
        return getAlignmentIterator(filters, options, clazz, samRecordIterator);
    }

    private  BamIterator getAlignmentIterator(AlignmentFilters filters, AlignmentOptions alignmentOptions, Class clazz,
                                                    SAMRecordIterator samRecordIterator) {
        if (alignmentOptions == null) {
            alignmentOptions = new AlignmentOptions();
        }

        int limit = -1;
        if (alignmentOptions.getLimit() > 0) {
            limit = alignmentOptions.getLimit();
        }

        if (ReadAlignment.class == clazz) {
            // AVRO
            return (BamIterator) new SAMRecordToAvroReadAlignmentBamIterator(samRecordIterator, filters, alignmentOptions.isBinQualities(), limit);
        } else if (Reads.ReadAlignment.class == clazz) {
            // PROTOCOL BUFFER
            return (BamIterator) new SAMRecordToProtoReadAlignmentBamIterator(samRecordIterator, filters, alignmentOptions.isBinQualities(), limit);
        } else if (SAMRecord.class == clazz) {
            return (BamIterator) new SamRecordBamIterator(samRecordIterator, filters, limit);
        } else {
            throw new IllegalArgumentException("Unknown alignment model class: " + clazz);
        }
    }

    public List getChunks(Region region) {
        if (samReader.hasIndex()) {
            int sequenceIndex = samReader.getFileHeader().getSequenceIndex(region.getChromosome());
            int start = region.getStart();
            int end = region.getEnd();

            BAMIndex index = samReader.indexing().getIndex();
            return index.getSpanOverlapping(sequenceIndex, start, end).getChunks();
        }
        return null;
    }

    public List getBreakpoints(Region region) throws IOException {
        try (BlockCompressedInputStream blockCompressedInputStream = new BlockCompressedInputStream(bamFile.toFile())) {
            if (samReader.hasIndex()) {
                List originalChunks = getChunks(region);
                long lastEndPosition = -1;

                if (CollectionUtils.isNotEmpty(originalChunks)) {
                    List byteRanges = new ArrayList<>(originalChunks.size());
                    for (Chunk originalChunk : originalChunks) {

                        long seekInitialPos = originalChunk.getChunkEnd();
                        if (seekInitialPos < lastEndPosition) {
                            // Skip, this chunk has already been included
                            continue;
                        }
                        // We put the file pointer at the beginning of the end chunk
                        blockCompressedInputStream.seek(seekInitialPos);

                        // And start reading bytes until we reach the end of the block
                        long virtualEndPos = seekInitialPos;
                        while (!blockCompressedInputStream.endOfBlock()) {
                            blockCompressedInputStream.read();
                            virtualEndPos = blockCompressedInputStream.getPosition();
                        }

                        // Update the lastEndPosition retrieved to avoid duplication breakpoints
                        lastEndPosition = virtualEndPos;

                        // Write the start and the retrieved end addresses
                        byteRanges.add(BlockCompressedFilePointerUtil.getBlockAddress(originalChunk.getChunkStart())
                                + "-" + (BlockCompressedFilePointerUtil.getBlockAddress(virtualEndPos) - 1));
                    }

                    return byteRanges;
                }
            }
        }
        return null;
    }

    /**
     * Return the coverage average given a window size from a BigWig file. This is expected to have the same name
     * that the BAM file with .bw suffix.
     * If no BigWig file is found and windowSize is 1 then we calculate te coverage from the BAM file.
     * @param region Region from which return the coverage
     * @param windowSize Window size to average
     * @return One average score per window size spanning the region
     * @throws IOException If any error happens reading BigWig file
     */
    public RegionCoverage coverage(Region region, int windowSize) throws IOException, AlignmentCoverageException {
        if (Paths.get(bamFile.toString() + COVERAGE_BIGWIG_EXTENSION).toFile().exists()) {
            return BamUtils.getCoverageFromBigWig(region, windowSize, Paths.get(this.bamFile.toString() + COVERAGE_BIGWIG_EXTENSION));
        } else {
            // If BigWig file is not found and windowSize is 1 then we calculate it from the BAM file
            if (windowSize == 1) {
                return coverage(region, null, new AlignmentOptions());
            } else {
                throw new AlignmentCoverageException("No bigwig file has been found and windowSize is > 1");
            }
        }
    }

    /**
     * This method get some filters and calculate the coverage from the BAM file with the reads filtered.
     * @param region Region to calculate coverage from
     * @param filters Filters to be applied to reads
     * @param options Other possible options
     * @return The coverage for each position of the region (windowSize == 1)
     */
    public RegionCoverage coverage(Region region, AlignmentFilters filters, AlignmentOptions options)
            throws AlignmentCoverageException {
        // Check region size is smaller than MAX_REGION_COVERAGE
        if (region.size() > MAX_REGION_COVERAGE) {
            throw new AlignmentCoverageException("Region size is bigger than MAX_REGION_COVERAGE [" + MAX_REGION_COVERAGE +"]");
        }

        if (options == null) {
            options = new AlignmentOptions();
        }

        SamRecordRegionCoverageCalculator calculator = new SamRecordRegionCoverageCalculator(options.getMinBaseQuality());
        try (BamIterator iterator = iterator(region, filters, options)) {
            RegionCoverage regionCoverage = new RegionCoverage(region);
            while (iterator.hasNext()) {
                SAMRecord next = iterator.next();
                if (!next.getReadUnmappedFlag()) {
                    calculator.update(next, regionCoverage);
                }
            }
            regionCoverage.updateStats();
            return regionCoverage;
        } catch (Exception e) {
            throw new AlignmentCoverageException(e.getMessage(), e);
        }
    }

    public AlignmentGlobalStats stats() throws IOException {
        return calculateGlobalStats(iterator());
    }

    public AlignmentGlobalStats stats(Region region, AlignmentFilters filters, AlignmentOptions options) throws IOException {
        return calculateGlobalStats(iterator(region, filters, options));
    }

    private AlignmentGlobalStats calculateGlobalStats(BamIterator iterator) throws IOException {
        AlignmentGlobalStats alignmentGlobalStats = new AlignmentGlobalStats();
        SamRecordAlignmentGlobalStatsCalculator calculator = new SamRecordAlignmentGlobalStatsCalculator();
        while (iterator.hasNext()) {
            AlignmentGlobalStats computed = calculator.compute(iterator.next());
            calculator.update(computed, alignmentGlobalStats);
        }
        iterator.close();
        return alignmentGlobalStats;
    }

    @Override
    public void close() throws IOException {
        if (samReader != null) {
            samReader.close();
        }
    }

    private void checkBaiFileExists() throws IOException {
        if (bamFile.toString().endsWith(".bam") && !new File(bamFile.toString() + ".bai").exists()) {
            throw new IOException("Missing BAM index (.bai file) for " + bamFile.toString());
        } else if (bamFile.toString().endsWith(".cram") && !new File(bamFile.toString() + ".crai").exists()) {
            throw new IOException("Missing CRAM index (.crai file) for " + bamFile.toString());
        }
    }

    public Path getBamFile() {
        return bamFile;
    }

    public BamManager setBamFile(Path bamFilePath) {
        this.bamFile = bamFilePath;
        return this;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy