// picard.sam.util.ReadNameParser Maven / Gradle / Ivy
package picard.sam.util;
import htsjdk.samtools.util.Log;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses read names to extract the physical location of a cluster: its tile and its x and y coordinates.
 *
 * Two parsing modes are supported. When the parser was constructed with {@link #DEFAULT_READ_NAME_REGEX}
 * (compared by identity), the last three ':'-delimited fields of the read name are parsed directly without
 * using a regular expression. Otherwise the supplied regular expression is compiled on first use and its
 * first three capture groups are parsed as tile, x and y. A null regex disables parsing entirely.
 *
 * Instances keep mutable scratch state (a reusable int array, a lazily compiled pattern and a
 * "already warned" flag) without any synchronization, so a single instance should not be shared between
 * threads without external coordination.
 */
public class ReadNameParser {

    /**
     * The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
     * and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
     * (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
     * which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
     * ignoring any trailing non-digit characters.
     *
     * The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
     * three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should correctly handle read names
     * that have 5 or 7 fields with the last three fields being tile/x/y, as is the case for the majority of read names produced by
     * Illumina technology.
     *
     * This value is never compiled as a regex: it is a sentinel that {@link #addLocationInformation(String, PhysicalLocation)}
     * recognizes by reference equality, which is why it is interned.
     */
    public static final String DEFAULT_READ_NAME_REGEX = "".intern();

    /** Scratch buffer reused across calls to avoid allocating in addLocationInformation. */
    private final int[] tmpLocationFields = new int[3];

    /** The regex supplied at construction; may be null (never parse) or the default sentinel (fast path). */
    private final String readNameRegex;

    /** Lazily compiled form of {@link #readNameRegex}; only populated on the regex code path. */
    private Pattern readNamePattern;

    /** Set once the first warning has been logged so that subsequent failures stay silent. */
    private boolean warnedAboutRegexNotMatching = false;

    /** Destination for warnings; may be null, in which case nothing is logged. */
    private final Log log;

    /**
     * Creates a read name parser using the default read name regex and no log. See {@link #DEFAULT_READ_NAME_REGEX}
     * for an explanation on how the read name is parsed.
     */
    public ReadNameParser() {
        this(DEFAULT_READ_NAME_REGEX);
    }

    /**
     * Creates a read name parser using the given read name regex and no log. See {@link #DEFAULT_READ_NAME_REGEX} for an
     * explanation on how to format the regular expression (regex) string.
     *
     * @param readNameRegex the read name regular expression string to parse read names, null to never parse location information.
     */
    public ReadNameParser(final String readNameRegex) {
        this(readNameRegex, null);
    }

    /**
     * Creates a read name parser using the given read name regex. See {@link #DEFAULT_READ_NAME_REGEX} for an explanation on how to
     * format the regular expression (regex) string.
     *
     * @param readNameRegex the read name regular expression string to parse read names, null to never parse location information.
     * @param log the log to which to write messages, or null to suppress them.
     */
    public ReadNameParser(final String readNameRegex, final Log log) {
        this.readNameRegex = readNameRegex;
        this.log = log;
    }

    /**
     * Extracts tile/x/y from the read name and stores them on the given location object so that they
     * can be used later to determine optical duplication.
     *
     * At most one warning is logged per parser instance, regardless of which kind of failure triggered it.
     * The tile value is narrowed to a short before being stored.
     *
     * @param readName the name of the read/cluster
     * @param loc the object to add tile/x/y to; left untouched when this method returns false
     * @return true if the read name contained the information in parsable form, false otherwise
     */
    public boolean addLocationInformation(final String readName, final PhysicalLocation loc) {
        try {
            // Optimized version if using the default read name regex (== used on purpose):
            if (this.readNameRegex == ReadNameParser.DEFAULT_READ_NAME_REGEX) {
                final int fields = getLastThreeFields(readName, ':', tmpLocationFields);
                if (!(fields == 5 || fields == 7)) {
                    if (shouldWarn()) {
                        this.log.warn(String.format("Default READ_NAME_REGEX '%s' did not match read name '%s'. " +
                                        "You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. " +
                                        "Note that this message will not be emitted again even if other read names do not match the regex.",
                                this.readNameRegex, readName));
                        this.warnedAboutRegexNotMatching = true;
                    }
                    return false;
                }
                loc.setTile((short) tmpLocationFields[0]);
                loc.setX(tmpLocationFields[1]);
                loc.setY(tmpLocationFields[2]);
                return true;
            } else if (this.readNameRegex == null) {
                // A null regex means location parsing was explicitly disabled.
                return false;
            } else {
                // Standard version that will use the regex; the pattern is compiled once and cached.
                if (this.readNamePattern == null) {
                    this.readNamePattern = Pattern.compile(this.readNameRegex);
                }
                final Matcher m = this.readNamePattern.matcher(readName);
                if (m.matches()) {
                    loc.setTile((short) Integer.parseInt(m.group(1)));
                    loc.setX(Integer.parseInt(m.group(2)));
                    loc.setY(Integer.parseInt(m.group(3)));
                    return true;
                } else {
                    if (shouldWarn()) {
                        this.log.warn(String.format("READ_NAME_REGEX '%s' did not match read name '%s'. Your regex may not be correct. " +
                                        "Note that this message will not be emitted again even if other read names do not match the regex.",
                                this.readNameRegex, readName));
                        this.warnedAboutRegexNotMatching = true;
                    }
                    return false;
                }
            }
        } catch (NumberFormatException nfe) {
            // Raised by rapidParseInt (fast path) or Integer.parseInt (regex path) when a field is not numeric.
            if (shouldWarn()) {
                this.log.warn("A field parsed out of a read name was expected to contain an integer and did not. ",
                        "Read name: ", readName, ". Cause: ", nfe.getMessage());
                this.warnedAboutRegexNotMatching = true;
            }
            return false;
        }
    }

    /** Returns true when a log is available and no warning has been emitted yet by this parser. */
    private boolean shouldWarn() {
        return this.log != null && !this.warnedAboutRegexNotMatching;
    }

    /**
     * Given a string, splits the string by the delimiter, and returns the last three fields parsed as integers. Parsing a field
     * considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
     *
     * Fields are parsed as they are discovered while scanning from the end of the string, so a
     * NumberFormatException can be thrown even when the read name turns out to have fewer than three fields.
     *
     * @param readName the string to split
     * @param delim the field delimiter
     * @param tokens an array of at least three elements that receives the parsed values of the last three fields,
     *               in their original left-to-right order; all three are set to -1 when fewer than three fields were found
     * @return the number of fields counted in the read name, or -1 if fewer than three were found
     * @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
     */
    public static int getLastThreeFields(final String readName, final char delim, final int[] tokens) throws NumberFormatException {
        int tokensIdx = 2; // start at the last token
        int numFields = 0;
        int i, endIdx;
        endIdx = readName.length();

        // find the last three tokens only, walking backwards from the end of the string
        for (i = readName.length() - 1; 0 <= i && 0 <= tokensIdx; i--) {
            if (readName.charAt(i) == delim || 0 == i) {
                numFields++;
                // at index 0 the field starts at the character itself, otherwise just after the delimiter
                tokens[tokensIdx] = rapidParseInt(readName.substring((0 == i) ? 0 : (i + 1), endIdx));
                tokensIdx--;
                endIdx = i;
            }
        }

        // continue to find the # of fields, without parsing them
        while (0 <= i) {
            if (readName.charAt(i) == delim || 0 == i) {
                numFields++;
            }
            i--;
        }

        if (numFields < 3) {
            tokens[0] = tokens[1] = tokens[2] = -1;
            return -1;
        }
        return numFields;
    }

    /**
     * Very specialized method to rapidly parse a sequence of ASCII decimal digits from a String up until the first
     * non-digit character. Anything after the leading digits is ignored.
     *
     * Only the characters '0' through '9' are treated as digits; other Unicode digit characters end the number
     * (or cause a NumberFormatException if they are the first character). Overflow is not detected: values that
     * do not fit in an int wrap around silently.
     *
     * @param input the string to parse; must not be null
     * @return the parsed value, negated if the input started with '-'
     * @throws NumberFormatException if the String does not start with an optional - followed by at least one digit
     */
    public static int rapidParseInt(final String input) throws NumberFormatException {
        final int len = input.length();
        int val = 0;
        int i = 0;
        boolean isNegative = false;

        if (0 < len && '-' == input.charAt(0)) {
            i = 1;
            isNegative = true;
        }

        boolean hasDigits = false;
        for (; i < len; ++i) {
            final char ch = input.charAt(i);
            // Restrict to ASCII digits: the arithmetic below is only valid for '0'..'9'.
            if (ch < '0' || '9' < ch) {
                break;
            }
            val = (val * 10) + (ch - '0');
            hasDigits = true;
        }

        if (!hasDigits) throw new NumberFormatException("String '" + input + "' did not start with a parsable number.");
        if (isNegative) val = -val;
        return val;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy