All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.genomics.dataflow.utils.ReadFunctions Maven / Gradle / Ivy

/*
 * Copyright 2015 Google.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.genomics.dataflow.utils;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.genomics.dataflow.model.ReadBaseQuality;
import com.google.cloud.genomics.dataflow.model.ReadBaseWithReference;
import com.google.common.collect.ImmutableList;
import com.google.genomics.v1.CigarUnit;
import com.google.genomics.v1.Position;
import com.google.genomics.v1.Read;

/**
 * Utility methods for working with genetic read data.
 */
public class ReadFunctions {

  private static final String UNINITIALIZED_REFERENCE_SEQUENCE = "UNINITIALIZED";

  /**
   * Use the given read to build a list of aligned read and reference base
   * information.
   *
   * @param read The read with the alignment information
   * @return read and reference information.  For a read without an alignment
   *            or cigar units, null is returned.
   */
  public static List extractReadBases(Read read) {

    // Make sure this read has a valid alignment with Cigar Units
    if (!read.hasAlignment() || (read.getAlignment().getCigarCount() == 0)) {
      return null;
    }

    ImmutableList.Builder bases = ImmutableList.builder();

    String readSeq = read.getAlignedSequence();
    List readQual = read.getAlignedQualityList();
    String refSeq = UNINITIALIZED_REFERENCE_SEQUENCE;

    int refPosAbsoluteOffset = 0;
    int readOffset = 0;
    for (CigarUnit unit : read.getAlignment().getCigarList()) {
      switch(unit.getOperation()) {
        case ALIGNMENT_MATCH:
        case SEQUENCE_MISMATCH:
        case SEQUENCE_MATCH:
          for (int i = 0; i < unit.getOperationLength(); i++) {
            String refBase = "";
            if (unit.getOperation().equals(CigarUnit.Operation.SEQUENCE_MATCH)) {
              refBase = readSeq.substring(readOffset, readOffset + 1);
            } else if (!unit.getReferenceSequence().isEmpty()) {
              // try to get the ref sequence from the Cigar unit
              refBase = unit.getReferenceSequence().substring(i, i + 1);
            } else {
              // try to get the ref sequence by fully parsing the MD tag if not already cached
              if (refSeq != null && refSeq.equals(UNINITIALIZED_REFERENCE_SEQUENCE)) {
                refSeq = com.google.cloud.genomics.utils.grpc.ReadUtils
                    .inferReferenceSequenceByParsingMdFlag(read);
              }
              if (refSeq != null) {
                refBase = refSeq.substring(readOffset, readOffset + 1);
              }
            }
            String name = read.getAlignment().getPosition().getReferenceName();
            Matcher m = Pattern.compile("^(chr)?(X|Y|([12]?\\d))$").matcher(name);
            if (m.matches()) {
              name = m.group(m.groupCount() - 1);
            }
            Position refPosition = Position.newBuilder()
                .setReferenceName(name)
                .setPosition(read.getAlignment().getPosition().getPosition()
                  + refPosAbsoluteOffset)
                  .build();
            bases.add(new ReadBaseWithReference(new ReadBaseQuality(
                readSeq.substring(readOffset, readOffset + 1),
                readQual.get(readOffset)),
                refBase,
                refPosition));
            refPosAbsoluteOffset++;
            readOffset++;
          }
          break;

        case PAD:        // padding (silent deletion from padded reference)
        case CLIP_HARD:  // hard clipping (clipped sequences NOT present in seq)
          break;

        case CLIP_SOFT:  // soft clipping (clipped sequences present in SEQ)
        case INSERT:     // insertion to the reference
          readOffset += unit.getOperationLength();
          break;

        case DELETE:  // deletion from the reference
        case SKIP:    // intron (mRNA-to-genome alignment only)
          refPosAbsoluteOffset += unit.getOperationLength();
          break;

        default:
          throw new IllegalArgumentException("Illegal cigar code: " + unit.getOperation());
      }
    }

    return bases.build();
  }

  /**
   * Make sure a read is from a chromosome.
   */
  public static final SerializableFunction IS_ON_CHROMOSOME
      = new SerializableFunction() {
        @Override
        public Boolean apply(Read r) {
          return Pattern.compile("^(chr)?(X|Y|([12]?\\d))$")
              .matcher(r.getAlignment().getPosition().getReferenceName())
              .matches();
        }
      };

  /**
   * Make sure a read passed QC.
   */
  public static final SerializableFunction IS_NOT_QC_FAILURE
      = new SerializableFunction() {
        @Override
        public Boolean apply(Read r) {
          return !r.getFailedVendorQualityChecks();
        }
      };

  /**
   * Make sure a read is not a duplicate.
   */
  public static final SerializableFunction IS_NOT_DUPLICATE
      = new SerializableFunction() {
        @Override
        public Boolean apply(Read r) {
          return !r.getDuplicateFragment();
        }
      };

  /**
   * Make sure the read alignment is proper (paired end reads facing each other with acceptable
   * fragment size).
   */
  public static final SerializableFunction IS_PROPER_PLACEMENT
      = new SerializableFunction() {
        @Override
        public Boolean apply(Read r) {
          return r.getProperPlacement();
        }
      };
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy