All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.genomics.dataflow.readers.bam.SeekableGCSStream Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.readers.bam;

import com.google.api.client.http.HttpResponse;
import com.google.api.services.storage.Storage;
import com.google.api.services.storage.Storage.Objects.Get;
import com.google.api.services.storage.model.StorageObject;

import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
 * Adapter of GCS to look like HTSJDK SeekableStream so BAM readers
 * can stream data from GCS.
 * TODO: implement more sophisticated retries with backoff.
 */
public class SeekableGCSStream extends SeekableStream {
  private static final Logger LOG = Logger.getLogger(SeekableGCSStream.class.getName());

  private static final Pattern SLASH = Pattern.compile("/");
  private static final String GCS_PREFIX = "gs://";
  /** Maximum number of consecutive failed reads that are retried before giving up. */
  private static final int MAX_RETRIES = 3;
  /** Fixed pause between a failed read and the re-open of the stream. */
  private static final long RETRY_SLEEP_TIME_MSEC = 1000;

  private final Storage.Objects client;
  private final StorageObject object;
  /** The gs:// URI this stream was created for; used for getSource() and diagnostics. */
  private final String name;
  /** Reusable GET request; only its Range header changes between (re)opens. */
  private final Get get;
  /** Currently open media stream, or null after close() or a failed open. */
  private InputStream stream = null;
  /** Object size as derived from the last response; -1 until the first successful open. */
  private long size = -1;
  /** Offset of the next byte to be read; -1 until the first seek. */
  private long position = -1;
  /** Scratch buffer backing the single-byte read(). */
  private final byte[] oneByte = new byte[1];
  private boolean atEof = false;

  /**
   * Creates a stream over the given GCS object and opens it at offset 0.
   *
   * @param client the GCS objects client used to issue the media requests
   * @param name the object location in {@code gs://bucket/name} form
   * @throws IOException if {@code name} is not a valid GCS path or the object cannot be opened
   */
  public SeekableGCSStream(Storage.Objects client, String name) throws IOException {
    LOG.info("Creating SeekableGCSStream: " + name);
    this.client = client;
    // Must be set before seek(0): error and log messages below include the name,
    // and getSource() reports it to callers.
    this.name = name;
    object = uriToStorageObject(name);
    get = this.client.get(object.getBucket(), object.getName());
    seek(0);
  }

  /** Closes the underlying media stream, if one is open. Safe to call repeatedly. */
  @Override
  public void close() throws IOException {
    if (stream != null) {
      stream.close();
      stream = null;
    }
  }

  /** Returns true once a read hit end of stream, the object is empty, or an open failed. */
  @Override
  public boolean eof() throws IOException {
    return atEof;
  }

  /** Returns the gs:// URI this stream was created with. */
  @Override
  public String getSource() {
    return name;
  }

  /** Returns the object size derived from the most recent response, or -1 if unknown. */
  @Override
  public long length() {
    return size;
  }

  /** Returns the offset of the next byte that will be read. */
  @Override
  public long position() throws IOException {
    return position;
  }

  /**
   * Reads a single byte.
   *
   * @return the byte as an unsigned value in the range 0-255, or -1 at end of stream
   */
  @Override
  public int read() throws IOException {
    final int bytesRead = read(oneByte, 0, 1);
    // Mask to an unsigned value: returning the raw (signed) byte would make 0xFF
    // indistinguishable from the -1 end-of-stream marker.
    return bytesRead == 1 ? (oneByte[0] & 0xff) : -1;
  }

  /**
   * Repositions the stream by re-issuing the request with a new Range header.
   * Seeking to the current position is a no-op.
   *
   * @param arg0 the new absolute offset
   * @throws IllegalArgumentException if the offset is negative or not below the known size
   * @throws IOException if the ranged request fails
   */
  @Override
  public void seek(long arg0) throws IOException {
    if (arg0 == position) {
      return;
    }
    validatePosition(arg0);
    position = arg0;
    openStream();
  }

  /**
   * Closes any open stream and opens a new one starting at the current position,
   * updating the known size from the response headers.
   *
   * @throws IOException if the request fails or the size cannot be determined
   */
  protected void openStream()
      throws IOException {
    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("openStream: " + position);
    }
    try {
      close();
    } catch (Exception ignored) {
      // Ignore problems in closing the old stream; it is being replaced anyway.
    }
    validatePosition(position);
    get.getRequestHeaders()
        .setRange(String.format("bytes=%d-", position));
    HttpResponse response;
    try {
      response = get.executeMedia();
    } catch (IOException e) {
      String msg = String.format("Error reading %s at position %d",
          name, position);
      atEof = true;
      throw new IOException(msg, e);
    }
    String contentRange = response.getHeaders().getContentRange();
    Long contentLength = response.getHeaders().getContentLength();
    if (contentLength != null) {
      // The request asked for everything from 'position' onward, so the
      // total size is the returned length plus the starting offset.
      size = contentLength + position;
    } else if (contentRange != null) {
      // Expected form is "<unit> <first>-<last>/<total>"; the total follows the slash.
      String[] parts = SLASH.split(contentRange);
      if (parts.length < 2) {
        throw new IOException(
            "Could not determine size from response from Content-Range: " + contentRange);
      }
      try {
        size = Long.parseLong(parts[1].trim());
      } catch (NumberFormatException e) {
        throw new IOException(
            "Could not determine size from response from Content-Range: " + contentRange, e);
      }
    } else {
      throw new IOException("Could not determine size of response");
    }
    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("stream size=" + size);
    }
    atEof = size == 0;
    stream = response.getContent();
  }

  /**
   * Checks that an offset is non-negative and, once the size is known, below it.
   *
   * @param newPosition the offset to validate
   * @throws IllegalArgumentException if the offset is out of range
   */
  protected void validatePosition(long newPosition) {
    if (newPosition < 0) {
      throw new IllegalArgumentException(
          String.format("Invalid seek offset: position value (%d) must be >= 0", newPosition));
    }

    if ((size >= 0) && (newPosition >= size)) {
      throw new IllegalArgumentException(
          String.format(
              "Invalid seek offset: position value (%d) must be between 0 and %d",
              newPosition, size));
    }
  }

  /**
   * Reads up to {@code len} bytes into {@code buf}, looping until the request is
   * satisfied or end of stream is reached. A failed read is retried by sleeping
   * and re-opening the stream at the current position; the retry counter resets
   * after every successful read.
   *
   * @param buf destination buffer
   * @param offset index in {@code buf} at which to start storing bytes
   * @param len maximum number of bytes to read
   * @return the number of bytes read, 0 if {@code len} is 0 or {@code offset} is
   *     at or past the end of {@code buf}, or -1 if no bytes could be read
   *     because end of stream was reached
   * @throws IOException if the stream is closed, if reads keep failing after
   *     {@code MAX_RETRIES} retries, or if the thread is interrupted while
   *     waiting to retry
   */
  @Override
  public int read(byte[] buf, int offset, int len)
      throws IOException {
    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("Read at offset " + offset + " length " + len);
    }
    if (len == 0 || offset >= buf.length) {
      return 0;
    }
    if (stream == null) {
      // Happens after close() or after a failed (re)open; fail with a checked
      // exception rather than a NullPointerException.
      throw new IOException("Stream is closed: " + name);
    }

    int totalBytesRead = 0;
    int retriesAttempted = 0;

    do {
      try {
        final int numBytesRead = stream.read(buf,
            offset + totalBytesRead, len - totalBytesRead);
        if (LOG.isLoggable(Level.FINEST)) {
          LOG.finest("Read returned: " + numBytesRead);
        }
        if (numBytesRead < 0) {
          atEof = true;
          break;
        }
        totalBytesRead += numBytesRead;
        position += numBytesRead;

        // Progress was made, so later failures get a fresh retry budget.
        retriesAttempted = 0;
      } catch (IOException ioe) {
        if (retriesAttempted == MAX_RETRIES) {
          LOG.warning(
              String.format("Already attempted max of %d retries while reading '%s'; throwing exception.",
              retriesAttempted, name));
          throw ioe;
        } else {
          ++retriesAttempted;
          LOG.warning(String.format("Got exception: %s while reading '%s'; retry # %d. Sleeping...",
              ioe.getMessage(), name, retriesAttempted));

          try {
            // TODO: implement backoff logic instead of simplistic static sleep interval
            Thread.sleep(RETRY_SLEEP_TIME_MSEC);
          } catch (InterruptedException ie) {
            LOG.warning(String.format("Interrupted while sleeping before retry. Giving up "
                + "after %d retries for '%s'", retriesAttempted, name));
            // Restore the interrupt flag so callers up the stack can observe it.
            Thread.currentThread().interrupt();
            ioe.addSuppressed(ie);
            throw ioe;
          }
          LOG.info(String.format("Done sleeping before retry for '%s'; retry # %d.",
              name, retriesAttempted));

          // Resume from the last successfully read offset.
          openStream();
        }
      }
    } while (totalBytesRead < len);

    final int retVal = (totalBytesRead == 0) ? -1 : totalBytesRead;
    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("Read result " + retVal);
    }
    return retVal;
  }

  /**
   * Splits a {@code gs://bucket/name} URI into a StorageObject carrying the
   * bucket and object name.
   *
   * @param uri the GCS URI to parse
   * @return a StorageObject with bucket and name set
   * @throws IOException if the URI lacks the gs:// prefix or has no non-empty
   *     bucket followed by a slash
   */
  private static StorageObject uriToStorageObject(String uri) throws IOException {
    StorageObject object = new StorageObject();
    if (uri.startsWith(GCS_PREFIX)) {
      uri = uri.substring(GCS_PREFIX.length());
    } else {
      throw new IOException("Invalid GCS path (does not start with gs://): " + uri);
    }
    int slashPos = uri.indexOf("/");
    if (slashPos > 0) {
      object.setBucket(uri.substring(0, slashPos));
      object.setName(uri.substring(slashPos + 1));
      LOG.info("uriToStorageObject " + uri + "=" + object.getBucket() + ":" + object.getName());
    } else {
      throw new IOException("Invalid GCS path (does not have bucket/name form): " + uri);
    }
    return object;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy