All downloads are free. Search and download functionality uses the official Maven repository.

com.hfg.bio.seq.translation.CodonTable Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.translation;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
import com.hfg.exception.ProgrammingException;
import com.hfg.math.Range;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.OrderedSet;
import com.hfg.util.StringUtil;
import com.hfg.util.io.StreamUtil;
import com.hfg.xml.HfgXML;
import com.hfg.xml.XMLName;
import com.hfg.xml.XMLTag;

//------------------------------------------------------------------------------
/**
 Table that maps codons to their respective amino acids.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class CodonTable implements TranslationTable { // XML Tag names public static final XMLName XML_CODON_TABLE = new XMLName("CodonTable", HfgXML.HFG_NAMESPACE); // XML Attribute names public static final XMLName XML_NAME_ATT = new XMLName("name", HfgXML.HFG_NAMESPACE); public static final XMLName XML_SPECIES_ATT = new XMLName("species", HfgXML.HFG_NAMESPACE); public static final CodonTable HUMAN = new CodonTable("codondata/human.cod.xml.gz"); public static final CodonTable MOUSE = new CodonTable("codondata/mouse.cod.xml.gz"); private static final String sKazusaUrlTemplate = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=%d&aa=1&style=N"; private static final Pattern sKazusaLinePattern = Pattern.compile("\\s*(\\w{3})\\s+([\\w\\*])\\s+([\\.\\d]+)\\s+([\\.\\d]+)\\s+\\(\\s*(\\d+)\\)"); private static Random sRandom = new Random(System.currentTimeMillis()); private String mName; private NCBITaxon mTaxon; private Set mCodons = new OrderedSet<>(65); 
private Map mCodonToAAMap = new HashMap<>(65); private Map> mAAToCodonsMap = new HashMap<>(26); //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- public CodonTable(NCBITaxon inSpecies) { mTaxon = inSpecies; } //--------------------------------------------------------------------------- public CodonTable(XMLTag inXMLTag) { fromXMLTag(inXMLTag); init(); } //--------------------------------------------------------------------------- protected CodonTable(String inRsrcPath) { InputStream stream = getClass().getResourceAsStream(inRsrcPath); if (null == stream) { throw new ProgrammingException("The rsrc " + inRsrcPath + " couldn't be found!?"); } try { if (inRsrcPath.endsWith(".gz")) { stream = new GZIPInputStream(stream); } fromXMLTag(new XMLTag(stream)); StreamUtil.close(stream); } catch (Exception e) { throw new ProgrammingException(e); } init(); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public String name() { return mName; } //--------------------------------------------------------------------------- private CodonTable setName(String inValue) { mName = inValue; return this; } //--------------------------------------------------------------------------- /** Dynamically retrieves codon usage data from the Codon Usage Database (http://www.kazusa.or.jp/codon/) @throws Exception if the data cannot be retrieved */ public void retrieveDataFromKazusa() throws Exception { URL url = new URL(String.format(sKazusaUrlTemplate, mTaxon.getTaxonId())); URLConnection connection = url.openConnection(); parseKazusaData(connection.getInputStream()); } 
//--------------------------------------------------------------------------- public XMLTag toXMLTag() { XMLTag tag = new XMLTag(XML_CODON_TABLE); if (name() != null) { tag.setAttribute(XML_NAME_ATT, name()); } if (mTaxon != null) { tag.setAttribute(XML_SPECIES_ATT, mTaxon.getScientificName()); } if (mCodons != null) { for (Codon codon : mCodons) { tag.addSubtag(codon.toXMLTag()); } } return tag; } //--------------------------------------------------------------------------- public CodonTable addCodon(Codon inValue) { mCodons.add(inValue); mCodonToAAMap.put(inValue, inValue.getAA()); Set codons = mAAToCodonsMap.get(inValue.getAA()); if (null == codons) { codons = new HashSet<>(10); mAAToCodonsMap.put(inValue.getAA(), codons); } codons.add(inValue); return this; } //--------------------------------------------------------------------------- /** Can be used to retrieve the internal Codon object which can contain the AA mapping and codon usage data. @param inCodon the string representation of the Codon object to retrieve @return the Codon object corresponding to the specified nucleotide triplet */ public Codon getCodon(String inCodon) { Codon queryCodon = new Codon(inCodon); Codon requestedCodon = null; for (Codon codon : mCodons) { if (codon.equals(queryCodon)) { requestedCodon = codon; break; } } return requestedCodon; } //-------------------------------------------------------------------------- public Set getCodons() { return Collections.unmodifiableSet(mCodons); } //--------------------------------------------------------------------------- public Set getCodonsForAA(char inAA) { return mAAToCodonsMap.get(inAA); } //--------------------------------------------------------------------------- /** Returns a weighted randomly selected codon for the specified amino acid. 
@param inAA the amino acid character for which codons should be selected @return the randomly selected Codon */ public Codon getCodonForAA_viaWeightedSelection(char inAA) { return getCodonForAA_viaWeightedSelection(inAA, 0); } //--------------------------------------------------------------------------- /** Returns a weighted randomly selected codon meeting a specified minimum usage bias for the specified amino acid. @param inAA the amino acid character for which codons should be selected @param inMinBias the minimum usage bias (frequency) for codons to be considered @return the randomly selected Codon */ public Codon getCodonForAA_viaWeightedSelection(char inAA, float inMinBias) { // Force to uppercase char aa = Character.toUpperCase(inAA); float totalWeight = 0.0f; Set codons = mAAToCodonsMap.get(aa); if (! CollectionUtil.hasValues(codons)) { throw new RuntimeException("No codons specified for amino acid " + StringUtil.singleQuote(inAA) + "!"); } for (Codon codon : codons) { if (codon.getCodonUsage().getBias() >= inMinBias) { totalWeight += codon.getCodonUsage().getBias(); } } float randomNum = sRandom.nextFloat(); Codon selectedCodon = null; Range valueRange = new Range().setStart(0f); for (Codon codon : codons) { if (codon.getCodonUsage().getBias() >= inMinBias) { valueRange.setEnd(valueRange.getStart() + (codon.getCodonUsage().getBias() / totalWeight)); if (valueRange.contains(randomNum)) { selectedCodon = codon; break; } valueRange.setStart(valueRange.getEnd()); } } return selectedCodon; } //--------------------------------------------------------------------------- public char translateCodon(String inCodon) { return translateCodon(new Codon(inCodon)); } //--------------------------------------------------------------------------- public char translateCodon(Codon inCodon) { Character aa = mCodonToAAMap.get(inCodon); return (aa != null ? 
aa : 'X'); } //--------------------------------------------------------------------------- public boolean containsCodonUsageData() { boolean result = false; if (mAAToCodonsMap.size() > 0) { result = (mAAToCodonsMap.values().iterator().next().iterator().next().getCodonUsage() != null); } return result; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- // Calculate degenerate codons private void init() { for (Character aa : mAAToCodonsMap.keySet()) { Set codonSet = mAAToCodonsMap.get(aa); Map>> codonNucleotideMap = new HashMap<>(2); for (Codon codon : codonSet) { Nucleotide pos1Nuc = Nucleotide.valueOf(codon.toString().charAt(0)); Nucleotide pos2Nuc = Nucleotide.valueOf(codon.toString().charAt(1)); Nucleotide pos3Nuc = Nucleotide.valueOf(codon.toString().charAt(2)); Map> pos2Map = codonNucleotideMap.get(pos1Nuc); if (null == pos2Map) { pos2Map = new HashMap<>(3); codonNucleotideMap.put(pos1Nuc, pos2Map); } Set pos3Set = pos2Map.get(pos2Nuc); if (null == pos3Set) { pos3Set = new HashSet<>(4); pos2Map.put(pos2Nuc, pos3Set); } pos3Set.add(pos3Nuc); } for (Nucleotide pos1Nuc : codonNucleotideMap.keySet()) { Map> pos2Map = codonNucleotideMap.get(pos1Nuc); for (Nucleotide pos2Nuc : pos2Map.keySet()) { Set pos3Set = pos2Map.get(pos2Nuc); for (Nucleotide degenerateNuc : Nucleotide.degenerateValues()) { boolean matches = true; for (Nucleotide base : degenerateNuc.getDegeneracy()) { if (!pos3Set.contains(base)) { matches = false; break; } } if (matches) { Codon degenerateCodon = new Codon(("" + pos1Nuc.getOneLetterCode() + pos2Nuc.getOneLetterCode() + degenerateNuc.getOneLetterCode()).toUpperCase()); // Not calling addCodon() because we don't want to add degenerate codons to mAAToCodonMap. 
mCodons.add(degenerateCodon); mCodonToAAMap.put(degenerateCodon, aa); } } } } } } //--------------------------------------------------------------------------- private void fromXMLTag(XMLTag inXMLTag) { inXMLTag.verifyTagName(XML_CODON_TABLE); String name = inXMLTag.getAttributeValue(XML_NAME_ATT); if (StringUtil.isSet(name)) { setName(name); } String species = inXMLTag.getAttributeValue(XML_SPECIES_ATT); if (StringUtil.isSet(species)) { // TODO: Use the taxon id instead? Set taxons = NCBITaxon.getByName(species); if (CollectionUtil.hasValues(taxons)) { mTaxon = taxons.iterator().next(); } } List codonTags = inXMLTag.getSubtagsByName(Codon.XML_CODON); if (CollectionUtil.hasValues(codonTags)) { for (XMLTag codonTag : codonTags) { addCodon(new Codon(codonTag)); } } if (containsCodonUsageData()) { // Is bias data present? CodonUsage codonUsage = mCodonToAAMap.keySet().iterator().next().getCodonUsage(); if (null == codonUsage.getBias()) { calculateBiasValues(); } } } //--------------------------------------------------------------------------- private void clearCodonData() { mCodons.clear(); mAAToCodonsMap.clear(); mCodonToAAMap.clear(); } //--------------------------------------------------------------------------- private void parseKazusaData(InputStream inContent) throws Exception { clearCodonData(); BufferedReader reader = new BufferedReader(new InputStreamReader(inContent)); String line; while ((line = reader.readLine()) != null) { line = line.trim(); if (StringUtil.isSet(line)) { Matcher m = sKazusaLinePattern.matcher(line); int offset = 0; while (m.find(offset)) { Codon codon = new Codon(m.group(1)).setAA(m.group(2).charAt(0)); CodonUsage usage = new CodonUsage(); usage.setBias(Float.parseFloat(m.group(3))); usage.setFreqPer1000(Float.parseFloat(m.group(4))); usage.setNumber(Integer.parseInt(m.group(5))); codon.setCodonUsage(usage); addCodon(codon); if (m.end() + 1 < line.length()) { offset = m.end() + 1; } else { break; } } } } reader.close(); if 
(mCodons.size() != 64) { throw new RuntimeException("Problem retrieving codon data. " + mCodons.size() + " codons were parsed instead of 64!"); } } //--------------------------------------------------------------------------- private void calculateBiasValues() { for (Character aa : mAAToCodonsMap.keySet()) { Set codons = mAAToCodonsMap.get(aa); int totalNum = 0; for (Codon codon : codons) { totalNum += codon.getCodonUsage().getNumber(); } for (Codon codon : codons) { float bias = codon.getCodonUsage().getNumber() / (float) totalNum; codon.getCodonUsage().setBias(bias); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy