// org.biojava.nbio.aaproperties.Utils Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.aaproperties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.CharBuffer;
import java.util.HashSet;
import java.util.Set;
import java.util.StringJoiner;
/**
 * Utility class with helper methods that facilitate the coding of other methods:
 * rounding of numeric results, and validation / cleaning of protein sequences
 * against a set of permitted characters.
 *
 * @author kohchuanhock
 * @version 2011.08.22
 * @since 3.0.2
 */
public class Utils {

    private final static Logger logger = LoggerFactory.getLogger(Utils.class);

    /**
     * Returns a value with the desired number of decimal places.
     *
     * @param d
     * 	value to round
     * @param c
     * 	number of decimal places desired.
     * 	Must be greater or equal to zero, otherwise, the given value d would be returned without any modification.
     * @return
     * 	a value with the given number of decimal places. {@code d} is returned unchanged when
     * 	{@code c} is negative, when {@code d} is NaN or infinite, or when {@code d * 10^c} does not
     * 	fit into a {@code long}.
     */
    public final static double roundToDecimals(double d, int c) {
        if (c < 0) {
            return d;
        }
        double p = Math.pow(10, c);
        double scaled = d * p;
        // Math.round(double) maps NaN to 0 and saturates at Long.MIN_VALUE / Long.MAX_VALUE, which
        // would silently yield an unrelated number. The negated comparison is also true for NaN.
        if (!(Math.abs(scaled) < Long.MAX_VALUE)) {
            return d;
        }
        return Math.round(scaled) / p;
    }

    /**
     * Checks if given sequence contains invalid characters. Returns true if invalid characters are found, else return false.
     * Note that any characters are deemed as valid only if it is found in cSet.
     *
     * @param sequence
     * 	protein sequence to be check. Must not be null.
     * @param cSet
     * 	the set of characters that are deemed valid. If null, {@code PeptideProperties.standardAASet} is used.
     * @return
     * 	true if invalid characters are found, else return false.
     */
    public final static boolean doesSequenceContainInvalidChar(String sequence, Set<Character> cSet) {
        final Set<Character> validChars = cSet == null ? PeptideProperties.standardAASet : cSet;
        for (int i = 0; i < sequence.length(); i++) {
            if (!validChars.contains(sequence.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Return the number of invalid characters in sequence.
     *
     * @param sequence
     * 	protein sequence to count for invalid characters. Must not be null.
     * @param cSet
     * 	the set of characters that are deemed valid. If null, {@code PeptideProperties.standardAASet} is used.
     * @param ignoreCase
     * 	indicates if cases should be ignored. When true, every character is converted to upper case
     * 	before it is looked up in the set of valid characters.
     * @return
     * 	the number of invalid characters in sequence.
     */
    public final static int getNumberOfInvalidChar(String sequence, Set<Character> cSet, boolean ignoreCase) {
        final Set<Character> validChars = cSet == null ? PeptideProperties.standardAASet : cSet;
        int total = 0;
        for (int i = 0; i < sequence.length(); i++) {
            char residue = sequence.charAt(i);
            if (ignoreCase) {
                // Character.toUpperCase is independent of the default locale and never changes the
                // number of characters, unlike String.toUpperCase() (e.g. Turkish 'i', German sharp s).
                residue = Character.toUpperCase(residue);
            }
            if (!validChars.contains(residue)) {
                total++;
            }
        }
        return total;
    }

    /**
     * Returns a new sequence with all invalid characters being replaced by '-'.
     * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
     * A single warning listing the replaced characters is logged, but only if at least one
     * character was actually replaced.
     *
     * @param sequence
     * 	protein sequence to be clean. Must not be null.
     * @param cSet
     * 	user defined characters that are valid. Can be null. If null, then 20 standard protein amino acid codes will be considered as valid.
     * @return
     * 	a new sequence with all invalid characters being replaced by '-'.
     */
    public final static String cleanSequence(String sequence, Set<Character> cSet) {
        final Set<Character> validChars = cSet == null ? PeptideProperties.standardAASet : cSet;
        Set<Character> invalidCharSet = new HashSet<>();
        StringBuilder cleanSeq = new StringBuilder(sequence.length());
        for (int i = 0; i < sequence.length(); i++) {
            char residue = sequence.charAt(i);
            if (validChars.contains(residue)) {
                cleanSeq.append(residue);
            } else {
                cleanSeq.append('-');
                invalidCharSet.add(residue);
            }
        }
        // Nothing to report (and nothing to join) when the sequence was already clean.
        if (!invalidCharSet.isEmpty()) {
            StringJoiner quoted = new StringJoiner(", ");
            for (char c : invalidCharSet) {
                quoted.add("'" + c + "'");
            }
            logger.warn("{} are being replaced with '-'", quoted.toString());
        }
        return cleanSeq.toString();
    }

    /**
     * Checks if the sequence contains invalid characters.
     * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
     * If yes, it will return a new sequence where invalid characters are replaced with '-'.
     * If no, it will simply return the input sequence.
     *
     * @param sequence
     * 	protein sequence to be check for invalid characters. May be null, in which case null is returned.
     * @return
     * 	a sequence with no invalid characters.
     */
    public static final String checkSequence(String sequence) {
        return checkSequence(sequence, null);
    }

    /**
     * Checks if the sequence contains invalid characters.
     * Note that any character outside of the 20 standard protein amino acid codes are considered as invalid.
     * If yes, it will return a new sequence where invalid characters are replaced with '-'.
     * If no, it will simply return the input sequence.
     *
     * @param sequence
     * 	protein sequence to be check for invalid characters. May be null, in which case null is returned.
     * @param cSet
     * 	character set which define the valid characters. If null, {@code PeptideProperties.standardAASet} is used.
     * @return
     * 	a sequence with no invalid characters.
     */
    public static final String checkSequence(String sequence, Set<Character> cSet) {
        final Set<Character> validChars = cSet == null ? PeptideProperties.standardAASet : cSet;
        if (sequence == null || !doesSequenceContainInvalidChar(sequence, validChars)) {
            return sequence;
        }
        String cSeq = cleanSequence(sequence, validChars);
        logger.warn("There exists invalid characters in the sequence. Computed results might not be precise.");
        logger.warn("To remove this warning: Please use org.biojava.nbio.aaproperties.Utils.cleanSequence to clean sequence.");
        return cSeq;
    }
}