All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.aaproperties.PeptideProperties Maven / Gradle / Ivy

/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.aaproperties;

import org.biojava.nbio.aaproperties.xml.AminoAcidCompositionTable;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.bind.JAXBException;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * This is an adaptor class which enable the ease of generating protein properties.
 * At least one adaptor method is written for each available properties provided in IPeptideProperties.
 *
 * @author kohchuanhock
 * @version 2011.08.22
 * @since 3.0.2
 * @see IPeptideProperties
 * @see PeptidePropertiesImpl
 */
public class PeptideProperties {

	private final static Logger logger = LoggerFactory.getLogger(PeptideProperties.class);

	/**
	 * Enumeration of 20 standard amino acid code
	 */
	public enum SingleLetterAACode { W, C, M, H, Y, F, Q, N, I, R, D, P, T, K, E, V, S, G, A, L}

	/**
	 * Contains the 20 standard AA code in a set
	 */
	public static Set standardAASet;

	/**
	 * To initialize the standardAASet
	 */
	static{
		standardAASet = new HashSet();
		for(SingleLetterAACode c:SingleLetterAACode.values()) standardAASet.add(c.toString().charAt(0));
	}

	/**
	 * An adaptor method to return the molecular weight of sequence.
	 * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
	 * This method will sum the molecular weight of each amino acid in the
	 * sequence. Molecular weights are based on here.
	 *
	 * @param sequence
	 * 	a protein sequence consisting of non-ambiguous characters only
	 * @return the total molecular weight of sequence + weight of water molecule
	 */
	public static final double getMolecularWeight(String sequence){
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getMolecularWeight(pSequence);
	}

	/**
	 * An adaptor method to return the molecular weight of sequence.
	 * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
	 * This method will sum the molecular weight of each amino acid in the
	 * sequence. Molecular weights are based on the input xml file.
	 *
	 * @param sequence
	 * 	a protein sequence consisting of non-ambiguous characters only
	 * @param elementMassFile
	 * 	xml file that details the mass of each elements and isotopes
	 * @param aminoAcidCompositionFile
	 * 	xml file that details the composition of amino acids
	 * @return the total molecular weight of sequence + weight of water molecule
	 * @throws FileNotFoundException
	 * 	thrown if either elementMassFile or aminoAcidCompositionFile are not found
	 * @throws JAXBException
	 * 	thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
	 */
	public static final double getMolecularWeight(String sequence, File elementMassFile, File aminoAcidCompositionFile)
	throws FileNotFoundException, JAXBException{
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getMolecularWeight(pSequence, elementMassFile, aminoAcidCompositionFile);
	}

	/**
	 * An adaptor method to return the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
	 * This method will sum the molecular weight of each amino acid in the
	 * sequence. Molecular weights are based on the input files. These input files must be XML using the defined schema.
	 * Note that it assumes that ElementMass.xml file can be found in default location.
	 *
	 * @param sequence
	 * 	a protein sequence consisting of non-ambiguous characters only
	 * 	xml file that details the mass of each elements and isotopes
	 * @param aminoAcidCompositionFile
	 * 	xml file that details the composition of amino acids
	 * @return the total molecular weight of sequence + weight of water molecule
	 * @throws JAXBException
	 * 	thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
	 * @throws FileNotFoundException
	 * 	thrown if either elementMassFile or aminoAcidCompositionFile are not found
	 */
	public static final double getMolecularWeight(String sequence, File aminoAcidCompositionFile) throws FileNotFoundException, JAXBException{
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getMolecularWeight(pSequence, aminoAcidCompositionFile);
	}

	/**
	 * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
	 * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
	 * Note that ElementMass.xml is assumed to be able to be seen in default location.
	 *
	 * @param aminoAcidCompositionFile
	 * 	xml file that details the composition of amino acids
	 * @return the initialized amino acid composition table
	 * @throws JAXBException
	 * 	thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
	 * @throws FileNotFoundException
	 * 	thrown if either elementMassFile or aminoAcidCompositionFile are not found
	 */
	public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File aminoAcidCompositionFile)
	throws JAXBException, FileNotFoundException{
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.obtainAminoAcidCompositionTable(aminoAcidCompositionFile);
	}

	/**
	 * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
	 * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
	 *
	 * @param elementMassFile
	 * 	xml file that details the mass of each elements and isotopes
	 * @param aminoAcidCompositionFile
	 * 	xml file that details the composition of amino acids
	 * @return the initialized amino acid composition table
	 * @throws JAXBException
	 * 	thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
	 * @throws FileNotFoundException
	 * 	thrown if either elementMassFile or aminoAcidCompositionFile are not found
	 */
	public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File elementMassFile, File aminoAcidCompositionFile)
	throws JAXBException, FileNotFoundException{
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.obtainAminoAcidCompositionTable(elementMassFile, aminoAcidCompositionFile);
	}

	/**
	 * An adaptor method that returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
	 * This method will sum the molecular weight of each amino acid in the
	 * sequence. Molecular weights are based on the AminoAcidCompositionTable.
	 * Those input files must be XML using the defined schema.
	 *
	 * @param sequence
	 * 	a protein sequence consisting of non-ambiguous characters only
	 * @param aminoAcidCompositionTable
	 * 	a amino acid composition table obtained by calling IPeptideProperties.obtainAminoAcidCompositionTable
	 * @return the total molecular weight of sequence + weight of water molecule
	 * 	thrown if the method IPeptideProperties.setMolecularWeightXML(File, File) is not successfully called before calling this method.
	 */
	public static double getMolecularWeightBasedOnXML(String sequence, AminoAcidCompositionTable aminoAcidCompositionTable){
		sequence = Utils.checkSequence(sequence, aminoAcidCompositionTable.getSymbolSet());
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence, aminoAcidCompositionTable.getAminoAcidCompoundSet());
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getMolecularWeightBasedOnXML(pSequence, aminoAcidCompositionTable);
	}

	/**
	 * An adaptor method to returns the absorbance (optical density) of sequence. The sequence argument
	 * must be a protein sequence consisting of only non-ambiguous characters.
	 * The computation of absorbance (optical density) follows the
	 * documentation in here.
	 *
	 * @param sequence
	 * 	a protein sequence consisting of non-ambiguous characters only
	 * @param assumeCysReduced
	 * 	true if Cys are assumed to be reduced and false if Cys are assumed to form cystines
	 * @return the absorbance (optical density) of sequence
	 */
	public static final double getAbsorbance(String sequence, boolean assumeCysReduced){
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getAbsorbance(pSequence, assumeCysReduced);
	}

	/**
	 * An adaptor method to return the extinction coefficient of sequence. The sequence argument
	 * must be a protein sequence consisting of only non-ambiguous characters.
	 * The extinction coefficient indicates how much light a protein absorbs at
	 * a certain wavelength. It is useful to have an estimation of this
	 * coefficient for following a protein which a spectrophotometer when
	 * purifying it. The computation of extinction coefficient follows the
	 * documentation in here.
	 *
	 * @param sequence
	 *            a protein sequence consisting of non-ambiguous characters only
	 * @param assumeCysReduced
	 *            true if Cys are assumed to be reduced and false if Cys are
	 *            assumed to form cystines
	 * @return the extinction coefficient of sequence
	 */
	public static final double getExtinctionCoefficient(String sequence, boolean assumeCysReduced) {
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getExtinctionCoefficient(pSequence, assumeCysReduced);
	}

	/**
	 * An adaptor method to return the instability index of sequence. The sequence argument must be
	 * a protein sequence consisting of only non-ambiguous characters.
	 * The instability index provides an estimate of the stability of your
	 * protein in a test tube. The computation of instability index follows the
	 * documentation in here.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @return the instability index of sequence
	 */
	public static final double getInstabilityIndex(String sequence) {
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getInstabilityIndex(pSequence);
	}

	/**
	 * An adaptor method to return the apliphatic index of sequence. The sequence argument must be a
	 * protein sequence consisting of only non-ambiguous characters.
	 * The aliphatic index of a protein is defined as the relative volume
	 * occupied by aliphatic side chains (alanine, valine, isoleucine, and
	 * leucine). It may be regarded as a positive factor for the increase of
	 * thermostability of globular proteins. The computation of aliphatic index
	 * follows the documentation in here.
	 * A protein whose instability index is smaller than 40 is predicted as stable, a value above 40 predicts that the protein may be unstable.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @return the aliphatic index of sequence
	 */
	public static final double getApliphaticIndex(String sequence) {
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}

		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getApliphaticIndex(pSequence);
	}

	/**
	 * An adaptor method to return the average hydropathy value of sequence. The sequence argument
	 * must be a protein sequence consisting of only non-ambiguous characters.
	 * The average value for a sequence is calculated as the sum of hydropathy
	 * values of all the amino acids, divided by the number of residues in the
	 * sequence. Hydropathy values are based on (Kyte, J. and Doolittle, R.F.
	 * (1982) A simple method for displaying the hydropathic character of a
	 * protein. J. Mol. Biol. 157, 105-132).
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @return the average hydropathy value of sequence
	 */
	public static final double getAvgHydropathy(String sequence) {
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getAvgHydropathy(pSequence);
	}

	/**
	 * An adaptor method to return the isoelectric point of sequence. The sequence argument must be
	 * a protein sequence consisting of only non-ambiguous characters.
	 * The isoelectric point is the pH at which the protein carries no net
	 * electrical charge. The isoelectric point will be computed based on
	 * approach stated in
	 * here
	 *
	 * pKa values used will be either
	 * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
	 * OR
	 * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @param useExpasyValues
	 * 		whether to use Expasy values (Default) or Innovagen values
	 * @return the isoelectric point of sequence
	 */
	public static final double getIsoelectricPoint(String sequence, boolean useExpasyValues) {
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getIsoelectricPoint(pSequence, useExpasyValues);
	}

	public static final double getIsoelectricPoint(String sequence){
		return getIsoelectricPoint(sequence, true);
	}

	/**
	 * An adaptor method to return the net charge of sequence at pH 7. The sequence argument must be
	 * a protein sequence consisting of only non-ambiguous characters.
	 * The net charge will be computed using the approach stated in
	 * here
	 *
	 * pKa values used will be either
	 * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
	 * OR
	 * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @param useExpasyValues
	 * 		whether to use Expasy values (Default) or Innovagen values
	 * @param pHPoint
	 * 		the pH value to use for computation of the net charge. Default at 7.
	 * @return the net charge of sequence at given pHPoint
	 */
	public static final double getNetCharge(String sequence, boolean useExpasyValues, double pHPoint){
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getNetCharge(pSequence, useExpasyValues, pHPoint);
	}

	public static final double getNetCharge(String sequence, boolean useExpasyValues) {
		return getNetCharge(sequence, useExpasyValues, 7.0);
	}

	public static final double getNetCharge(String sequence){
		return getNetCharge(sequence, true);
	}

	/**
	 * An adaptor method to return the composition of specified amino acid in the sequence. The
	 * sequence argument must be a protein sequence consisting of only
	 * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
	 * character.
	 * The composition of an amino acid is the total number of its occurrence,
	 * divided by the total length of the sequence.
	 *
	 * @param sequence
	 *            a protein sequence consisting of non-ambiguous characters only
	 * @param aminoAcidCode
	 *            the code of the amino acid to compute
	 * @return the composition of specified amino acid in the sequence
	 * @see SingleLetterAACode
	 */
	public static final double getEnrichment(String sequence, SingleLetterAACode aminoAcidCode) {
		return getEnrichment(sequence, aminoAcidCode.toString());
	}

	/**
	 * An adaptor method to return the composition of specified amino acid in the sequence. The
	 * sequence argument must be a protein sequence consisting of only
	 * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
	 * character.
	 * The composition of an amino acid is the total number of its occurrence,
	 * divided by the total length of the sequence.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @param aminoAcidCode
	 * 		the code of the amino acid to compute
	 * @return the composition of specified amino acid in the sequence
	 */
	public static final double getEnrichment(String sequence, char aminoAcidCode){
		return getEnrichment(sequence, aminoAcidCode + "");
	}

	/**
	 * An adaptor method to return the composition of specified amino acid in the sequence. The
	 * sequence argument must be a protein sequence consisting of only
	 * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
	 * character.
	 * The composition of an amino acid is the total number of its occurrence,
	 * divided by the total length of the sequence.
	 *
	 * @param sequence
	 *            a protein sequence consisting of non-ambiguous characters only
	 * @param aminoAcidCode
	 *            the code of the amino acid to compute
	 * @return the composition of specified amino acid in the sequence
	 */
	public static final double getEnrichment(String sequence, String aminoAcidCode){
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		AminoAcidCompoundSet aaSet = new AminoAcidCompoundSet();
		return pp.getEnrichment(pSequence, aaSet.getCompoundForString(aminoAcidCode));
	}

	/**
	 * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
	 * The sequence argument must be a protein sequence consisting of only
	 * non-ambiguous characters.
	 * The composition of an amino acid is the total number of its occurrence,
	 * divided by the total length of the sequence.
	 *
	 * @param sequence
	 *            a protein sequence consisting of non-ambiguous characters only
	 * @return the composition of the 20 standard amino acid in the sequence
	 * @see AminoAcidCompound
	 */
	public static final Map getAAComposition(String sequence) {
		sequence = Utils.checkSequence(sequence);
		ProteinSequence pSequence = null;
		try {
			pSequence = new ProteinSequence(sequence);
		} catch (CompoundNotFoundException e) {
			// the sequence was checked with Utils.checkSequence, this shouldn't happen
			logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
		}
		IPeptideProperties pp = new PeptidePropertiesImpl();
		return pp.getAAComposition(pSequence);
	}

	/**
	 * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
	 * The sequence argument must be a protein sequence consisting of only
	 * non-ambiguous characters.
	 * The composition of an amino acid is the total number of its occurrence,
	 * divided by the total length of the sequence.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @return the composition of the 20 standard amino acid in the sequence
	 */
	public static final Map getAACompositionString(String sequence){
		Map aa2Composition = getAAComposition(sequence);
		Map aaString2Composition = new HashMap();
		for(AminoAcidCompound aaCompound:aa2Composition.keySet()){
			aaString2Composition.put(aaCompound.getShortName(), aa2Composition.get(aaCompound));
		}
		return aaString2Composition;
	}

	/**
	 * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
	 * The sequence argument must be a protein sequence consisting of only
	 * non-ambiguous characters.
	 * The composition of an amino acid is the total number of its occurrence,
	 * divided by the total length of the sequence.
	 *
	 * @param sequence
	 * 		a protein sequence consisting of non-ambiguous characters only
	 * @return the composition of the 20 standard amino acid in the sequence
	 */
	public static final Map getAACompositionChar(String sequence){
		Map aa2Composition = getAAComposition(sequence);
		Map aaChar2Composition = new HashMap();
		for(AminoAcidCompound aaCompound:aa2Composition.keySet()){
			aaChar2Composition.put(aaCompound.getShortName().charAt(0), aa2Composition.get(aaCompound));
		}
		return aaChar2Composition;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy