All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.genomics.dataflow.readers.bam.BAMShard Maven / Gradle / Ivy

There is a newer version: v1-0.8
Show newest version
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.readers.bam;

import com.google.cloud.genomics.utils.Contig;
import com.google.common.collect.Lists;

import htsjdk.samtools.BAMFileIndexImpl;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.SAMFileSpanImpl;

import java.io.Serializable;
import java.util.List;

/**
 * A shard of BAM data that is created during sharding and then used to drive the reading.
 *
 * <p>During shard generation a shard is built iteratively: it is extended bin by bin
 * (see {@link #addBin}) and, at the end of the process, finalized
 * (see {@link #finalize(BAMFileIndexImpl, long)}), which produces the
 * {@link SAMFileSpanImpl} holding all the chunks we want to read.
 */
public class BAMShard implements Serializable {
  /** Identifier of the BAM file this shard belongs to, exactly as given to the constructor. */
  public String file;
  /** File span covering this shard's chunks; {@code null} until one is supplied or computed. */
  public SAMFileSpanImpl span;
  /** Reference name and locus range of this shard; an end of -1 means "not known yet". */
  public Contig contig;
  /** Chunks collected so far; {@code null} for shards created with a fixed file span. */
  public List<Chunk> chunks;
  /** Cached result of {@link #approximateSizeInBytes()}; negative while not computed. */
  public long cachedSizeInBytes = -1;

  /**
   * Begins a new shard with an empty chunk list and a starting locus.
   * The end locus is initialized to -1 and the span stays {@code null} until the first
   * call to {@link #addBin} or {@link #finalize(BAMFileIndexImpl, long)}.
   *
   * @param file identifier of the BAM file
   * @param referenceName name of the reference sequence the shard lies on
   * @param firstLocus first locus covered by the shard
   */
  public BAMShard(String file, String referenceName, long firstLocus) {
    this.file = file;
    this.contig = new Contig(referenceName, firstLocus, -1);
    this.chunks = Lists.newLinkedList();
    this.span = null;
  }

  /**
   * Creates a shard with a known file span.
   * Such a shard is not expected to be extended: its chunk list is {@code null} and calling
   * {@link #addBin} on it throws {@link IllegalStateException}. It is not meant to be
   * finalized either; note that {@link #finalize(BAMFileIndexImpl, long)} does not reject
   * such a shard but overwrites its chunk list and span.
   * This constructor is used for "degenerate" shards like unmapped reads or
   * all reads in cases where we don't have an index.
   *
   * @param file identifier of the BAM file
   * @param span precomputed file span to read
   * @param contig reference name and locus range to associate with the shard
   */
  public BAMShard(String file, SAMFileSpanImpl span, Contig contig) {
    this.file = file;
    this.span = span;
    this.contig = contig;
    this.chunks = null;
  }

  /**
   * Appends chunks from another bin to the list and moves the end position.
   *
   * @param chunksToAdd chunks of the bin being added
   * @param lastLocus new end locus of the shard
   * @throws IllegalStateException if this shard was created with a fixed file span and
   *     therefore has no chunk list to extend
   */
  public void addBin(List<Chunk> chunksToAdd, long lastLocus) {
    // Check up front, independently of -ea, so that a misuse cannot leave the shard with an
    // updated contig but unchanged chunks.
    if (chunks == null) {
      throw new IllegalStateException(
          "Cannot add a bin to a shard created with a fixed file span: " + file);
    }
    contig = new Contig(contig.referenceName, contig.start, lastLocus);
    chunks.addAll(chunksToAdd);
    updateSpan();
  }

  /**
   * Generates the final list of chunks, now that we know the exact bounding loci
   * for this shard. The end locus is set to {@code lastLocus}, the index is asked for
   * the chunks overlapping the resulting range, and those chunks replace any chunks
   * accumulated so far.
   *
   * <p>This is an overload unrelated to {@link Object#finalize()}.
   *
   * @param index index queried for the chunks overlapping this shard's range
   * @param lastLocus final end locus of the shard
   * @return this shard, to allow chaining
   */
  public BAMShard finalize(BAMFileIndexImpl index, long lastLocus) {
    contig = new Contig(contig.referenceName, contig.start, lastLocus);
    // The index is queried with int coordinates, so the long loci are narrowed here.
    this.chunks = index.getChunksOverlapping(contig.referenceName,
        (int)contig.start, (int)contig.end);
    updateSpan();
    return this;
  }

  /**
   * Updates the underlying file span by optimizing and coalescing the current chunk list,
   * and invalidates the cached byte size.
   */
  private void updateSpan() {
    // NOTE(review): contig.start (a locus) is passed as the second argument of
    // optimizeChunkList -- confirm that this is the value the callee expects.
    span = new SAMFileSpanImpl(Chunk.optimizeChunkList(chunks, this.contig.start));
    cachedSizeInBytes = -1;
  }

  /**
   * Returns the number of loci between the start and the end of this shard.
   *
   * @return {@code end - start}, or 0 while the end locus is not positive (e.g. still -1)
   */
  public long sizeInLoci() {
    return contig.end > 0 ? contig.end - contig.start : 0;
  }

  /**
   * Returns the approximate size of this shard's span in bytes, computing it on first use
   * and caching it until the span changes.
   *
   * @return the cached or freshly computed approximate size in bytes
   * @throws NullPointerException if no span has been supplied or computed yet
   */
  public long approximateSizeInBytes() {
    if (cachedSizeInBytes < 0) {
      cachedSizeInBytes = span.approximateSizeInBytes();
    }
    return cachedSizeInBytes;
  }

  /**
   * Describes the shard as file, contig and locus size, plus the span size when it has
   * already been computed.
   */
  @Override
  public String toString() {
    String str = file + ": " + contig.toString() + ", locus size = " + sizeInLoci();
    if (cachedSizeInBytes >= 0) {
      // Only return this as part of the string if it's already cached and calculated.
      // Otherwise calling this function causes the object to be mutated.
      str += ", span size = " + cachedSizeInBytes;
    }
    return str;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy