All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.Nucleotide Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.utils;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Represents the nucleotide alphabet with support for IUPAC ambiguity codes.
 *
 * 

* This enumeration not only contains standard (non-ambiguous) nucleotides, but also * contains ambiguous nucleotides, as well as a code {@link #X} (a.k.a. {@link #INVALID}) * for invalid nucleotide calls. *

* *

* You can query whether a value refers to a non-ambiguous nucleotide with {@link #isStandard()} or * {@link #isAmbiguous()} whichever is most convenient. Notice that the special value {@link #X} * is neither of those. *

* *

* Querying the {@link #X} value for its {@link #complement}, {@link #transition} or * {@link #transversion} or using it in other operations * such as {@link #intersect} will return {@link #X}; similar to {@link Double#NaN} in * {@code double} arithmetic. *

* *

* For naming consistency it is recommended to use {@link #decode} and {@link #encodeAsByte} * or {@link #encodeAsString} methods to translate byte/char and string encodings from and * into values of this enum over the inherited {@link #toString}, {@link #name} or {@link #valueOf}. *

* *

* Although the canonical names for values use the single letter IUPAC * encodings, this class provides convenient longer form names constant aliases * (e.g. {@link #ADENINE} for {@link #A}, {@link #PURINE} for {@link #R}, etc.). *

*

* Uracil and Thymine are considered equivalent in this enum with {@link #T} as the canonical name. *

*

* Finally, notice that there is no code of the "gap nucleotide" that may appear in aligned sequences as in fact * that is not a nucleotide. A base encoding using the typical gap representation such as '.' or '-' would * be interpreted as an {@link #INVALID} (i.e. {@link #X}) call which is probably not what you want. * So code to support those will need to do so outside this {@code enum}. *

* @author Valentin Ruano-Rubio <[email protected]> */ public enum Nucleotide { // Standard nucleotide codes, // and their one-bit-encoding masks CODE(0bTGCA): A(0b0001), C(0b0010), G(0b0100), T(0b1000), // Extended codes: // CODE(included nucs) R(A, G), // Purines. Y(C, T), // Pyrimidines. S(C, G), // Strong nucleotides. W(A, T), // Weak nucleotides. K(G, T), // Keto nucleotides. M(A, C), // Amino nucleotides. // The following 4 tri-nucleotide codes don't have a proper long name, they are simply "all-except-one" // codes: B(C, G, T), // Not-A (B follows A) D(A, G, T), // Not-C (D follows C) H(A, C, T), // Not-G (H follows G) V(A, C, G), // Not-V (V follows T) // Any N(A, C, G, T), // Any/Unknown // And X/invalid-call: X(); // As far as this enum is concern, // references to Uracil (U) are considered equivalent to Thymine (T) as they are transcription equivalent. // nucleotides. public static final Nucleotide U = T; // Convenient constants with long form alternative names for some of the enumeration values: // Long form standard nucleotide names. public static final Nucleotide ADENINE = A; public static final Nucleotide CYTOSINE = C; public static final Nucleotide GUANINE = G; public static final Nucleotide THYMINE = T; public static final Nucleotide URACIL = U; // Ambiguous nucleotide groups with proper long form names: public static final Nucleotide STRONG = S; public static final Nucleotide WEAK = W; public static final Nucleotide PURINE = R; public static final Nucleotide PYRIMIDINE = Y; public static final Nucleotide AMINO = M; public static final Nucleotide KETO = K; public static final Nucleotide ANY = N; public static final Nucleotide INVALID = X; /** * List of the standard (non-redundant) nucleotide values in their preferred alphabetical order. */ public static final List STANDARD_BASES = Collections.unmodifiableList(Arrays.asList(A, C, G, T)); // Since calling values() is costly (creates a new array every time) and often we do it just to find out the // total number of constants is best to cache it in a constant. private static final int NUMBER_OF_CONSTANTS; /** * Values indexed by their unsigned byte encodings. Non-valid encodings point to {@link #INVALID}. */ private static final Nucleotide[] baseToValue; /** * Values indexed by their mask. */ private static final Nucleotide[] maskToValue; /** * Value ordinal indexed by their unsigned byte ecodings. Non-valid encodings point to {@link #INVALID} * (thru its ordinal). */ private static final int[] baseToOrdinal; static { final Nucleotide[] values = values(); NUMBER_OF_CONSTANTS = values.length; baseToValue = new Nucleotide[1 << Byte.SIZE]; maskToValue = new Nucleotide[1 << STANDARD_BASES.size()]; baseToOrdinal = new int[1 << Byte.SIZE]; Arrays.fill(baseToValue, INVALID); Arrays.fill(baseToOrdinal, INVALID.ordinal()); for (final Nucleotide nucleotide : values) { // Notice that {@code "x & 0xFF"} is needed instead of {@code "(int)x" as // we want the unsigned value (e.g. 255 rather than -1). // This is repeated through this class code. final int lowerCaseIndex = nucleotide.lowerCaseByteEncoding & 0xFF; final int upperCaseIndex = nucleotide.upperCaseByteEncoding & 0xFF; maskToValue[nucleotide.mask] = nucleotide; baseToValue[lowerCaseIndex] = baseToValue[upperCaseIndex] = nucleotide; baseToOrdinal[lowerCaseIndex] = baseToOrdinal[upperCaseIndex] = nucleotide.ordinal(); } // need to do u and U here as they are just aliases to T. baseToValue['u' & 0xFF] = baseToValue['U' & 0xFF] = U; baseToOrdinal['u' & 0xFF] = baseToOrdinal['U' & 0xFF] = U.ordinal(); } private final int mask; private final boolean isStandard; // Some properties initialized after construction as these depend on some static arrays // defined above. private Nucleotide complement; private Nucleotide transition; private Nucleotide transversion; static { for (final Nucleotide value : values()) { value.finalizeInitialization(); } } /** * Holds the lower-case byte encoding for this nucleotide. * This is typically the lower-case version of the enum constant name. */ private final byte lowerCaseByteEncoding; /** * Holds the lower-case {@link String} representation for this nucleotide. * This is typically the lower-case version of the enum constant name. */ private final String lowerCaseStringEncoding; /** * Holds the lower-case {@code char} representation for this nucleotide. * This is typically the lower-case version of the enum constant name only character. */ private final char lowerCaseCharEncoding; /** * Holds the upper-case byte encoding for this nucleotide. * This is typically the upper-case version of the enum constant name. */ private final byte upperCaseByteEncoding; /** * Holds the upper-case {@code char} representation for this nucleotide. * This is typically the upper-case version of the enum constant name only character. */ private final char upperCaseCharEncoding; /** * Construct a nucleotide given its mask. * @param mask the mask. */ Nucleotide(final int mask) { this.mask = mask; isStandard = Integer.bitCount(mask & 0b1111) == 1; lowerCaseByteEncoding = (byte) Character.toLowerCase(name().charAt(0)); lowerCaseCharEncoding = Character.toLowerCase(name().charAt(0)); upperCaseByteEncoding = (byte) Character.toUpperCase(name().charAt(0)); upperCaseCharEncoding = Character.toUpperCase(name().charAt(0)); lowerCaseStringEncoding = name().toLowerCase(); } /** * Construct a nucleotide given the other codes that it would include. * @param nucs the nucleotides to include. */ Nucleotide(final Nucleotide ... nucs) { this(Arrays.stream(nucs).mapToInt(nuc -> nuc.mask).reduce((a, b) -> a | b).orElse(0)); } /** * Returns the {@code byte} typed encoding that corresponds to this nucleotide. * @param upperCase whether to return the upper- or lower-case {@code byte} representation. * @return a valid and exclusive {@code byte} representation for a nucleotide. */ public byte encodeAsByte(final boolean upperCase) { return upperCase ? upperCaseByteEncoding : lowerCaseByteEncoding; } /** * Returns the {@code char} typed encoding that corresponds to this nucleotide. * @param upperCase whether to return the upper- or lower-case {@code char} representation. * @return a valid and exclusive {@code char} representation for a nucleotide. */ public char encodeAsChar(final boolean upperCase) { return upperCase ? upperCaseCharEncoding : lowerCaseCharEncoding; } /** * Returns this nucleotide's exclusive upper-case {@code byte} encoding. * @return ditto. */ public byte encodeAsByte() { return upperCaseByteEncoding; } /** * Returns the nucleotide's exclusive upper-case {@code char} encoding. * @return ditto. */ public char encodeAsChar() { return upperCaseCharEncoding; } /** * Returns the nucleotide's exclusive upper-case {@code String} encoding. * @return ditto. */ public String encodeAsString() { return toString(); } /** * Returns the nucleotide's exclusive {@link String} typed encoding. * @param upperCase whether the upper or lower-case representation should be returned. * @return a valid and exclusive {@link String} representation for this nucleotide. */ public String encodeAsString(final boolean upperCase) { return upperCase ? toString() : lowerCaseStringEncoding; } /** * Returns the nucleotide that corresponds to a particular {@code byte} typed base code. * @param base the query base code. * @return never {@code null}, but {@link #INVALID} if the base code does not * correspond to a valid nucleotide specification. */ public static Nucleotide decode(final byte base) { return baseToValue[base & 0xFF]; } /** * Returns the nucleotide that corresponds to a particular {@code char} typed base code. * @param ch the query base code. * @return never {@code null}, but {@link #INVALID} if the base code does not correspond * to a valid nucleotide specification. */ public static Nucleotide decode(final char ch) { if ((ch & 0xFF00) != 0) { return INVALID; } else { return baseToValue[ch & 0xFF]; } } /** * Transform a single-letter character string into the corresponding nucleotide. *

* {@code Null}, empty or multi-letter input will result in an {@link IllegalArgumentException}. * These are not simply invalid encodings as the fact that are not a single character is * an indication of a probable bug. *

* * @param seq the input character sequence to transform into. * @return never {@code null}, perhaps {@link #INVALID} to indicate that the input is not a valid * single letter encoding encoding. */ public static Nucleotide decode(final CharSequence seq) { Utils.nonNull(seq, "the input character sequence must not be null"); if (seq.length() != 1) { throw new IllegalArgumentException("the input character sequence must be exactly one character long"); } else { return decode(seq.charAt(0)); } } /** * Checks whether the nucleotide refers to a concrete (rather than ambiguous) base. * @return {@code true} iff this is a concrete nucleotide. */ public boolean isStandard() { return isStandard; } /** * Checks whether the nucleotide refer to an ambiguous base. * @return {@code true} iff this is an ambiguous nucleotide. */ public boolean isAmbiguous() { return !isStandard && this != INVALID; } /** * Whether this nucleotide code is valid or not. * @return {@code true} iff valid. */ public boolean isValid() { return this != INVALID; } /** * Checks whether this nucleotide code encloses all possible nucleotides for another code. * @param other the other nucleotide to compare to. * @return {@code true} iff every nucleotide in {@code other} is enclosed in this code. */ public boolean includes(final Nucleotide other) { Utils.nonNull(other); return other != INVALID && (mask & other.mask) == other.mask; } /** * Checks whether this nucleotide code encloses all possible nucleotides for another code. * @param b the other nucleotide to compare to encoded as a byte. * @return {@code true} iff every nucleotide in {@code other} is enclosed in this code. */ public boolean includes(final byte b) { return includes(decode(b)); } /** * Returns the nucleotide code that include all and only the nucleotides that are * included by this another code. * @param other the other nucleotide code. * @throws IllegalArgumentException if {@code other} is {@code null}. * @return never {@code null}. Returns {@link #INVALID} if the intersection does not contain * any nucleotide. */ public Nucleotide intersect(final Nucleotide other) { Utils.nonNull(other, "the other nucleotide cannot be null"); return maskToValue[mask & other.mask]; } /** * Checks whether two nucleotides intersect given their byte encodings. * @param a first nucleotide. * @param b second nucleotide. * @return {@code true} iff the input nucleotides intersect. */ public static boolean intersect(final byte a, final byte b) { return (baseToValue[0xFF & a].mask & baseToValue[0xFF & b].mask) != 0; } /** * Checks whether two base encodings make reference to the same {@link #Nucleotide} * instance regardless of their case. *

* This method is a shorthand for: *

{@link #decode}(a){@link #same(Nucleotide) same}({@link #decode}(b)) 
. *

* *

* The order of the inputs is not relevant, therefore {@code same(a, b) == same(b, a)} for any * given {@code a} and {@code b}. *

*

* Notice that if either or both input bases make reference to an invalid nucleotide (i.e.

 {@link #decode}(x) == {@link #INVALID}},
     *      this method will return {@code false} even if {@code a == b}.
     *  

* @param a the first base to compare (however order is not relevant). * @param b the second base to compare (however order is not relevant). * @return {@code true} iff {@code {@link #decode}}.same({@link #decode}(b))}} */ public static boolean same(final byte a, final byte b) { return baseToValue[a & 0xFF] == baseToValue[b & 0xFF] && baseToValue[a & 0xFF] != INVALID; } /** * Checks whether this and another {@link #Nucleotide} make reference to the same nucleotide(s). *

* In contrast with {@link #equals}, this method will return {@code false} if any of the two, this * or the input nucleotide is the {@link #INVALID} enum value. So even

{@link #INVALID}.same({@link #INVALID})
* will return {@code false}. *

* * @param other the other nucleotide. * @return {@code true} iff this and the input nucleotide make reference to the same nucleotides. */ public boolean same(final Nucleotide other) { return this == other && this != INVALID; } /** * Returns the complement nucleotide code for this one. *

* For ambiguous nucleotide codes, this will return the ambiguous code that encloses the complement of * each possible nucleotide in this code. *

*

* The complement of the {@link #INVALID} nucleotide is itself. *

* @return never {@code null}. */ public Nucleotide complement() { return complement; } /** * Returns the complement for a base code. *

* When an invalid base is provided this method will return the input byte (lower- or upper-cased depending on that * flag value). *

* @param b the input base * @param upperCase whether to return the uppercase ({@code true}) or the lower case ({@code false}) byte encoding. * @return the complement of the input. */ public static byte complement(final byte b, final boolean upperCase) { final Nucleotide compl = baseToValue[b & 0xFF].complement; return compl != INVALID ? (upperCase ? compl.upperCaseByteEncoding : compl.lowerCaseByteEncoding) : (byte) ( upperCase ? Character.toUpperCase(b) : Character.toLowerCase(b)); } /** * Returns the complement for a base code. *

* The case of the output will match the case of the input. *

*

* When an invalid base is provided this method will return the input base byte. *

* @param b the input base * @return the complement of the input. */ public static byte complement(final byte b) { final Nucleotide compl = baseToValue[b & 0xFF].complement; return compl != INVALID ? (Character.isUpperCase(b) ? compl.upperCaseByteEncoding : compl.lowerCaseByteEncoding) : b; } /** * Returns the instance that would include all possible transition mutations from this one. * @return never {@code null}. */ public Nucleotide transition() { return transition; } /** * Returns the instance that would include all possible tranversion mutations from nucleotides included * in this one. * @return never {@code null}. */ public Nucleotide transversion() { return transversion; } /** * Calculate and set the complement, transition and transversion using the #maskToValue array. */ private void finalizeInitialization() { // set the complement. final int complementMask = ((mask & A.mask) != 0 ? T.mask : 0) | ((mask & T.mask) != 0 ? A.mask : 0) | ((mask & C.mask) != 0 ? G.mask : 0) | ((mask & G.mask) != 0 ? C.mask : 0); complement = maskToValue[complementMask]; // set the transversion. final int transversionMask = ((mask & PURINE.mask) != 0 ? PYRIMIDINE.mask : 0) | ((mask & PYRIMIDINE.mask) != 0 ? PURINE.mask : 0); transversion = maskToValue[transversionMask]; // set the transition. final int transitionMask = ((mask & A.mask) != 0 ? G.mask : 0) | ((mask & G.mask) != 0 ? A.mask : 0) | ((mask & C.mask) != 0 ? T.mask : 0) | ((mask & T.mask) != 0 ? C.mask : 0); transition = maskToValue[transitionMask]; } /** * Transversion mutation toward a strong or a weak base. *

* This method provides a non-ambiguous alternative to {@link #transversion()} for * concrete nucleotides. *

* * @param strong whether the result should be a strong ({@code S: G, C}) or weak ({@code W: A, T}) nucleotide(s). * @return nucleotides that may emerged from such a transversion. */ public Nucleotide transversion(final boolean strong) { return transversion.intersect(strong ? STRONG : WEAK); } /** * Helper class to count the number of occurrences of each nucleotide code in * a sequence. */ public static final class Counter { private final long[] counts; /** * Creates a new counter with all counts set to 0. */ public Counter() { counts = new long[NUMBER_OF_CONSTANTS]; } /** * Increases by 1 the count for a nucleotide. * @param nucleotide the target nucleotide. * @throws IllegalArgumentException if nucleotide is {@code null}. */ public void add(final Nucleotide nucleotide) { counts[Utils.nonNull(nucleotide).ordinal()]++; } /** * Increases the nucleotide that corresponds to the input base own count by 1. * @param base the base code. * @throws IllegalArgumentException if {@code base} is {@code negative}. */ public void add(final byte base) { counts[baseToOrdinal[base & 0xFF]]++; } public void add(final char base) { if ((base & 0xFF00) != 0) { counts[INVALID.ordinal()]++; } else { counts[baseToOrdinal[base & 0xFF]]++; } } /** * Returns the current count for a given nucleotide. * @param nucleotide the query nucleotide. * @throws IllegalArgumentException if {@code nucleotide} is {@code null}. * @return 0 or greater. */ public long get(final Nucleotide nucleotide) { return counts[Utils.nonNull(nucleotide).ordinal()]; } /** * Increase by one the count for a nucleotide for each * occurrence of such in the input byte array base codes. * @param bases the input base codes. * @throws IllegalArgumentException if {@code bases} is null. */ public final void addAll(final byte ... bases) { Utils.nonNull(bases); for (final byte base : bases) { counts[baseToOrdinal[base & 0xFF]]++; } } /** * Increase by one the count for a nucleotide for each * occurrence of such in the input char array base codes. * @param bases the input base codes. * @throws IllegalArgumentException if {@code bases} is null. */ public final void addAll(final char ... bases) { Utils.nonNull(bases); for (final char base : bases) { if ((base & 0xFF00) != 0) { counts[INVALID.ordinal()]++; } else { counts[baseToOrdinal[base & 0xFF]]++; } } } /** * Increase by one the count for a nucleotide for each * occurrence of such in the input {@link CharSequence}. * @param bases the input bases sequence. * @throws IllegalArgumentException if the input is {@code null}. */ public final void addAll(final CharSequence bases) { Utils.nonNull(bases); for (int i = 0; i < bases.length(); i++) { final char base = bases.charAt(i); if ((base & 0xFF00) != 0) { counts[INVALID.ordinal()]++; } else { counts[baseToOrdinal[base & 0xFF]]++; } } } /** * Reset all the counts to 0. */ public void clear() { Arrays.fill(counts, 0); } /** * Return the total count of all nucleotide constants. * @return 0 or greater. */ public long sum() { return MathUtils.sum(counts); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy