org.broadinstitute.hellbender.utils.Nucleotide Maven / Gradle / Ivy
package org.broadinstitute.hellbender.utils;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Represents the nucleotide alphabet with support for IUPAC ambiguity codes.
*
*
* This enumeration not only contains standard (non-ambiguous) nucleotides, but also
* contains ambiguous nucleotides, as well as a code {@link #X} (a.k.a. {@link #INVALID})
* for invalid nucleotide calls.
*
*
*
* You can query whether a value refers to a non-ambiguous nucleotide with {@link #isStandard()} or
* {@link #isAmbiguous()} whichever is most convenient. Notice that the special value {@link #X}
* is neither of those.
*
*
*
* Querying the {@link #X} value for its {@link #complement}, {@link #transition} or
* {@link #transversion} or using it in other operations
* such as {@link #intersect} will return {@link #X}; similar to {@link Double#NaN} in
* {@code double} arithmetic.
*
*
*
* For naming consistency it is recommended to use {@link #decode} and {@link #encodeAsByte}
* or {@link #encodeAsString} methods to translate byte/char and string encodings from and
* into values of this enum over the inherited {@link #toString}, {@link #name} or {@link #valueOf}.
*
*
*
* Although the canonical names for values use the single letter IUPAC
* encodings, this class provides convenient longer form names constant aliases
* (e.g. {@link #ADENINE} for {@link #A}, {@link #PURINE} for {@link #R}, etc.).
*
*
* Uracil and Thymine are considered equivalent in this enum with {@link #T} as the canonical name.
*
*
* Finally, notice that there is no code of the "gap nucleotide" that may appear in aligned sequences as in fact
* that is not a nucleotide. A base encoding using the typical gap representation such as '.' or '-' would
* be interpreted as an {@link #INVALID} (i.e. {@link #X}) call which is probably not what you want.
* So code to support those will need to do so outside this {@code enum}.
*
* @author Valentin Ruano-Rubio <[email protected]>
*/
public enum Nucleotide {
// Standard nucleotide codes,
// and their one-bit-encoding masks CODE(0bTGCA):
A(0b0001),
C(0b0010),
G(0b0100),
T(0b1000),
// Extended codes:
// CODE(included nucs)
R(A, G), // Purines.
Y(C, T), // Pyrimidines.
S(C, G), // Strong nucleotides.
W(A, T), // Weak nucleotides.
K(G, T), // Keto nucleotides.
M(A, C), // Amino nucleotides.
// The following 4 tri-nucleotide codes don't have a proper long name, they are simply "all-except-one"
// codes:
B(C, G, T), // Not-A (B follows A)
D(A, G, T), // Not-C (D follows C)
H(A, C, T), // Not-G (H follows G)
V(A, C, G), // Not-V (V follows T)
// Any
N(A, C, G, T), // Any/Unknown
// And X/invalid-call:
X();
// As far as this enum is concern,
// references to Uracil (U) are considered equivalent to Thymine (T) as they are transcription equivalent.
// nucleotides.
public static final Nucleotide U = T;
// Convenient constants with long form alternative names for some of the enumeration values:
// Long form standard nucleotide names.
public static final Nucleotide ADENINE = A;
public static final Nucleotide CYTOSINE = C;
public static final Nucleotide GUANINE = G;
public static final Nucleotide THYMINE = T;
public static final Nucleotide URACIL = U;
// Ambiguous nucleotide groups with proper long form names:
public static final Nucleotide STRONG = S;
public static final Nucleotide WEAK = W;
public static final Nucleotide PURINE = R;
public static final Nucleotide PYRIMIDINE = Y;
public static final Nucleotide AMINO = M;
public static final Nucleotide KETO = K;
public static final Nucleotide ANY = N;
public static final Nucleotide INVALID = X;
/**
* List of the standard (non-redundant) nucleotide values in their preferred alphabetical order.
*/
public static final List STANDARD_BASES = Collections.unmodifiableList(Arrays.asList(A, C, G, T));
// Since calling values() is costly (creates a new array every time) and often we do it just to find out the
// total number of constants is best to cache it in a constant.
private static final int NUMBER_OF_CONSTANTS;
/**
* Values indexed by their unsigned byte encodings. Non-valid encodings point to {@link #INVALID}.
*/
private static final Nucleotide[] baseToValue;
/**
* Values indexed by their mask.
*/
private static final Nucleotide[] maskToValue;
/**
* Value ordinal indexed by their unsigned byte ecodings. Non-valid encodings point to {@link #INVALID}
* (thru its ordinal).
*/
private static final int[] baseToOrdinal;
static {
final Nucleotide[] values = values();
NUMBER_OF_CONSTANTS = values.length;
baseToValue = new Nucleotide[1 << Byte.SIZE];
maskToValue = new Nucleotide[1 << STANDARD_BASES.size()];
baseToOrdinal = new int[1 << Byte.SIZE];
Arrays.fill(baseToValue, INVALID);
Arrays.fill(baseToOrdinal, INVALID.ordinal());
for (final Nucleotide nucleotide : values) {
// Notice that {@code "x & 0xFF"} is needed instead of {@code "(int)x" as
// we want the unsigned value (e.g. 255 rather than -1).
// This is repeated through this class code.
final int lowerCaseIndex = nucleotide.lowerCaseByteEncoding & 0xFF;
final int upperCaseIndex = nucleotide.upperCaseByteEncoding & 0xFF;
maskToValue[nucleotide.mask] = nucleotide;
baseToValue[lowerCaseIndex] = baseToValue[upperCaseIndex] = nucleotide;
baseToOrdinal[lowerCaseIndex] = baseToOrdinal[upperCaseIndex] = nucleotide.ordinal();
}
// need to do u and U here as they are just aliases to T.
baseToValue['u' & 0xFF] = baseToValue['U' & 0xFF] = U;
baseToOrdinal['u' & 0xFF] = baseToOrdinal['U' & 0xFF] = U.ordinal();
}
private final int mask;
private final boolean isStandard;
// Some properties initialized after construction as these depend on some static arrays
// defined above.
private Nucleotide complement;
private Nucleotide transition;
private Nucleotide transversion;
static {
for (final Nucleotide value : values()) {
value.finalizeInitialization();
}
}
/**
* Holds the lower-case byte encoding for this nucleotide.
* This is typically the lower-case version of the enum constant name.
*/
private final byte lowerCaseByteEncoding;
/**
* Holds the lower-case {@link String} representation for this nucleotide.
* This is typically the lower-case version of the enum constant name.
*/
private final String lowerCaseStringEncoding;
/**
* Holds the lower-case {@code char} representation for this nucleotide.
* This is typically the lower-case version of the enum constant name only character.
*/
private final char lowerCaseCharEncoding;
/**
* Holds the upper-case byte encoding for this nucleotide.
* This is typically the upper-case version of the enum constant name.
*/
private final byte upperCaseByteEncoding;
/**
* Holds the upper-case {@code char} representation for this nucleotide.
* This is typically the upper-case version of the enum constant name only character.
*/
private final char upperCaseCharEncoding;
/**
* Construct a nucleotide given its mask.
* @param mask the mask.
*/
Nucleotide(final int mask) {
this.mask = mask;
isStandard = Integer.bitCount(mask & 0b1111) == 1;
lowerCaseByteEncoding = (byte) Character.toLowerCase(name().charAt(0));
lowerCaseCharEncoding = Character.toLowerCase(name().charAt(0));
upperCaseByteEncoding = (byte) Character.toUpperCase(name().charAt(0));
upperCaseCharEncoding = Character.toUpperCase(name().charAt(0));
lowerCaseStringEncoding = name().toLowerCase();
}
/**
* Construct a nucleotide given the other codes that it would include.
* @param nucs the nucleotides to include.
*/
Nucleotide(final Nucleotide ... nucs) {
this(Arrays.stream(nucs).mapToInt(nuc -> nuc.mask).reduce((a, b) -> a | b).orElse(0));
}
/**
* Returns the {@code byte} typed encoding that corresponds to this nucleotide.
* @param upperCase whether to return the upper- or lower-case {@code byte} representation.
* @return a valid and exclusive {@code byte} representation for a nucleotide.
*/
public byte encodeAsByte(final boolean upperCase) {
return upperCase ? upperCaseByteEncoding : lowerCaseByteEncoding;
}
/**
* Returns the {@code char} typed encoding that corresponds to this nucleotide.
* @param upperCase whether to return the upper- or lower-case {@code char} representation.
* @return a valid and exclusive {@code char} representation for a nucleotide.
*/
public char encodeAsChar(final boolean upperCase) {
return upperCase ? upperCaseCharEncoding : lowerCaseCharEncoding;
}
/**
* Returns this nucleotide's exclusive upper-case {@code byte} encoding.
* @return ditto.
*/
public byte encodeAsByte() {
return upperCaseByteEncoding;
}
/**
* Returns the nucleotide's exclusive upper-case {@code char} encoding.
* @return ditto.
*/
public char encodeAsChar() {
return upperCaseCharEncoding;
}
/**
* Returns the nucleotide's exclusive upper-case {@code String} encoding.
* @return ditto.
*/
public String encodeAsString() {
return toString();
}
/**
* Returns the nucleotide's exclusive {@link String} typed encoding.
* @param upperCase whether the upper or lower-case representation should be returned.
* @return a valid and exclusive {@link String} representation for this nucleotide.
*/
public String encodeAsString(final boolean upperCase) {
return upperCase ? toString() : lowerCaseStringEncoding;
}
/**
* Returns the nucleotide that corresponds to a particular {@code byte} typed base code.
* @param base the query base code.
* @return never {@code null}, but {@link #INVALID} if the base code does not
* correspond to a valid nucleotide specification.
*/
public static Nucleotide decode(final byte base) {
return baseToValue[base & 0xFF];
}
/**
* Returns the nucleotide that corresponds to a particular {@code char} typed base code.
* @param ch the query base code.
* @return never {@code null}, but {@link #INVALID} if the base code does not correspond
* to a valid nucleotide specification.
*/
public static Nucleotide decode(final char ch) {
if ((ch & 0xFF00) != 0) {
return INVALID;
} else {
return baseToValue[ch & 0xFF];
}
}
/**
* Transform a single-letter character string into the corresponding nucleotide.
*
* {@code Null}, empty or multi-letter input will result in an {@link IllegalArgumentException}.
* These are not simply invalid encodings as the fact that are not a single character is
* an indication of a probable bug.
*
*
* @param seq the input character sequence to transform into.
* @return never {@code null}, perhaps {@link #INVALID} to indicate that the input is not a valid
* single letter encoding encoding.
*/
public static Nucleotide decode(final CharSequence seq) {
Utils.nonNull(seq, "the input character sequence must not be null");
if (seq.length() != 1) {
throw new IllegalArgumentException("the input character sequence must be exactly one character long");
} else {
return decode(seq.charAt(0));
}
}
/**
* Checks whether the nucleotide refers to a concrete (rather than ambiguous) base.
* @return {@code true} iff this is a concrete nucleotide.
*/
public boolean isStandard() {
return isStandard;
}
/**
* Checks whether the nucleotide refer to an ambiguous base.
* @return {@code true} iff this is an ambiguous nucleotide.
*/
public boolean isAmbiguous() {
return !isStandard && this != INVALID;
}
/**
* Whether this nucleotide code is valid or not.
* @return {@code true} iff valid.
*/
public boolean isValid() {
return this != INVALID;
}
/**
* Checks whether this nucleotide code encloses all possible nucleotides for another code.
* @param other the other nucleotide to compare to.
* @return {@code true} iff every nucleotide in {@code other} is enclosed in this code.
*/
public boolean includes(final Nucleotide other) {
Utils.nonNull(other);
return other != INVALID && (mask & other.mask) == other.mask;
}
/**
* Checks whether this nucleotide code encloses all possible nucleotides for another code.
* @param b the other nucleotide to compare to encoded as a byte.
* @return {@code true} iff every nucleotide in {@code other} is enclosed in this code.
*/
public boolean includes(final byte b) {
return includes(decode(b));
}
/**
* Returns the nucleotide code that include all and only the nucleotides that are
* included by this another code.
* @param other the other nucleotide code.
* @throws IllegalArgumentException if {@code other} is {@code null}.
* @return never {@code null}. Returns {@link #INVALID} if the intersection does not contain
* any nucleotide.
*/
public Nucleotide intersect(final Nucleotide other) {
Utils.nonNull(other, "the other nucleotide cannot be null");
return maskToValue[mask & other.mask];
}
/**
* Checks whether two nucleotides intersect given their byte encodings.
* @param a first nucleotide.
* @param b second nucleotide.
* @return {@code true} iff the input nucleotides intersect.
*/
public static boolean intersect(final byte a, final byte b) {
return (baseToValue[0xFF & a].mask & baseToValue[0xFF & b].mask) != 0;
}
/**
* Checks whether two base encodings make reference to the same {@link #Nucleotide}
* instance regardless of their case.
*
* This method is a shorthand for:
*
{@link #decode}(a){@link #same(Nucleotide) same}({@link #decode}(b))
.
*
*
*
* The order of the inputs is not relevant, therefore {@code same(a, b) == same(b, a)} for any
* given {@code a} and {@code b}.
*
*
* Notice that if either or both input bases make reference to an invalid nucleotide (i.e.
{@link #decode}(x) == {@link #INVALID}},
* this method will return {@code false} even if {@code a == b}.
*
* @param a the first base to compare (however order is not relevant).
* @param b the second base to compare (however order is not relevant).
* @return {@code true} iff {@code {@link #decode}}.same({@link #decode}(b))}}
*/
public static boolean same(final byte a, final byte b) {
return baseToValue[a & 0xFF] == baseToValue[b & 0xFF] && baseToValue[a & 0xFF] != INVALID;
}
/**
* Checks whether this and another {@link #Nucleotide} make reference to the same nucleotide(s).
*
* In contrast with {@link #equals}, this method will return {@code false} if any of the two, this
* or the input nucleotide is the {@link #INVALID} enum value. So even
{@link #INVALID}.same({@link #INVALID})
* will return {@code false}.
*
*
* @param other the other nucleotide.
* @return {@code true} iff this and the input nucleotide make reference to the same nucleotides.
*/
public boolean same(final Nucleotide other) {
return this == other && this != INVALID;
}
/**
* Returns the complement nucleotide code for this one.
*
* For ambiguous nucleotide codes, this will return the ambiguous code that encloses the complement of
* each possible nucleotide in this code.
*
*
* The complement of the {@link #INVALID} nucleotide is itself.
*
* @return never {@code null}.
*/
public Nucleotide complement() {
return complement;
}
/**
* Returns the complement for a base code.
*
* When an invalid base is provided this method will return the input byte (lower- or upper-cased depending on that
* flag value).
*
* @param b the input base
* @param upperCase whether to return the uppercase ({@code true}) or the lower case ({@code false}) byte encoding.
* @return the complement of the input.
*/
public static byte complement(final byte b, final boolean upperCase) {
final Nucleotide compl = baseToValue[b & 0xFF].complement;
return compl != INVALID
? (upperCase ? compl.upperCaseByteEncoding : compl.lowerCaseByteEncoding)
: (byte) ( upperCase ? Character.toUpperCase(b) : Character.toLowerCase(b));
}
/**
* Returns the complement for a base code.
*
* The case of the output will match the case of the input.
*
*
* When an invalid base is provided this method will return the input base byte.
*
* @param b the input base
* @return the complement of the input.
*/
public static byte complement(final byte b) {
final Nucleotide compl = baseToValue[b & 0xFF].complement;
return compl != INVALID
? (Character.isUpperCase(b) ? compl.upperCaseByteEncoding : compl.lowerCaseByteEncoding)
: b;
}
/**
* Returns the instance that would include all possible transition mutations from this one.
* @return never {@code null}.
*/
public Nucleotide transition() {
return transition;
}
/**
* Returns the instance that would include all possible tranversion mutations from nucleotides included
* in this one.
* @return never {@code null}.
*/
public Nucleotide transversion() {
return transversion;
}
/**
* Calculate and set the complement, transition and transversion using the #maskToValue array.
*/
private void finalizeInitialization() {
// set the complement.
final int complementMask = ((mask & A.mask) != 0 ? T.mask : 0)
| ((mask & T.mask) != 0 ? A.mask : 0)
| ((mask & C.mask) != 0 ? G.mask : 0)
| ((mask & G.mask) != 0 ? C.mask : 0);
complement = maskToValue[complementMask];
// set the transversion.
final int transversionMask = ((mask & PURINE.mask) != 0 ? PYRIMIDINE.mask : 0)
| ((mask & PYRIMIDINE.mask) != 0 ? PURINE.mask : 0);
transversion = maskToValue[transversionMask];
// set the transition.
final int transitionMask = ((mask & A.mask) != 0 ? G.mask : 0)
| ((mask & G.mask) != 0 ? A.mask : 0)
| ((mask & C.mask) != 0 ? T.mask : 0)
| ((mask & T.mask) != 0 ? C.mask : 0);
transition = maskToValue[transitionMask];
}
/**
* Transversion mutation toward a strong or a weak base.
*
* This method provides a non-ambiguous alternative to {@link #transversion()} for
* concrete nucleotides.
*
*
* @param strong whether the result should be a strong ({@code S: G, C}) or weak ({@code W: A, T}) nucleotide(s).
* @return nucleotides that may emerged from such a transversion.
*/
public Nucleotide transversion(final boolean strong) {
return transversion.intersect(strong ? STRONG : WEAK);
}
/**
* Helper class to count the number of occurrences of each nucleotide code in
* a sequence.
*/
public static final class Counter {
private final long[] counts;
/**
* Creates a new counter with all counts set to 0.
*/
public Counter() {
counts = new long[NUMBER_OF_CONSTANTS];
}
/**
* Increases by 1 the count for a nucleotide.
* @param nucleotide the target nucleotide.
* @throws IllegalArgumentException if nucleotide is {@code null}.
*/
public void add(final Nucleotide nucleotide) {
counts[Utils.nonNull(nucleotide).ordinal()]++;
}
/**
* Increases the nucleotide that corresponds to the input base own count by 1.
* @param base the base code.
* @throws IllegalArgumentException if {@code base} is {@code negative}.
*/
public void add(final byte base) {
counts[baseToOrdinal[base & 0xFF]]++;
}
public void add(final char base) {
if ((base & 0xFF00) != 0) {
counts[INVALID.ordinal()]++;
} else {
counts[baseToOrdinal[base & 0xFF]]++;
}
}
/**
* Returns the current count for a given nucleotide.
* @param nucleotide the query nucleotide.
* @throws IllegalArgumentException if {@code nucleotide} is {@code null}.
* @return 0 or greater.
*/
public long get(final Nucleotide nucleotide) {
return counts[Utils.nonNull(nucleotide).ordinal()];
}
/**
* Increase by one the count for a nucleotide for each
* occurrence of such in the input byte array base codes.
* @param bases the input base codes.
* @throws IllegalArgumentException if {@code bases} is null.
*/
public final void addAll(final byte ... bases) {
Utils.nonNull(bases);
for (final byte base : bases) {
counts[baseToOrdinal[base & 0xFF]]++;
}
}
/**
* Increase by one the count for a nucleotide for each
* occurrence of such in the input char array base codes.
* @param bases the input base codes.
* @throws IllegalArgumentException if {@code bases} is null.
*/
public final void addAll(final char ... bases) {
Utils.nonNull(bases);
for (final char base : bases) {
if ((base & 0xFF00) != 0) {
counts[INVALID.ordinal()]++;
} else {
counts[baseToOrdinal[base & 0xFF]]++;
}
}
}
/**
* Increase by one the count for a nucleotide for each
* occurrence of such in the input {@link CharSequence}.
* @param bases the input bases sequence.
* @throws IllegalArgumentException if the input is {@code null}.
*/
public final void addAll(final CharSequence bases) {
Utils.nonNull(bases);
for (int i = 0; i < bases.length(); i++) {
final char base = bases.charAt(i);
if ((base & 0xFF00) != 0) {
counts[INVALID.ordinal()]++;
} else {
counts[baseToOrdinal[base & 0xFF]]++;
}
}
}
/**
* Reset all the counts to 0.
*/
public void clear() {
Arrays.fill(counts, 0);
}
/**
* Return the total count of all nucleotide constants.
* @return 0 or greater.
*/
public long sum() {
return MathUtils.sum(counts);
}
}
}