All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.taxonomy.UniprotSpecies Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.taxonomy;

import com.hfg.util.StringUtil;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;


//------------------------------------------------------------------------------
/**
 * Species class for Uniprot codes. Uses the speclist.txt provided with Uniprot
 * to map the species codes that are a part of the locus name (Ex: 'HUMAN' in the locus 'TNF_HUMAN').
 * <p>
 * The species list is parsed lazily on the first lookup, either from the file given to
 * {@link #setSpeciesListFile(File)} or, if none was given, from the bundled
 * class-path resource. Lookups are case-insensitive.
 * <p>
 * The static index is created and replaced without any synchronization.
 *
 * @author J. Alex Taylor, hairyfatguy.com
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class UniprotSpecies
{
   //**************************************************************************
   // PRIVATE FIELDS
   //**************************************************************************

   private final String mSpeciesCode;
   private char    mKingdomCode;
   private Integer mTaxonId;
   private String  mScientificName;
   private String  mCommonName;
   private String  mSynonym;

   // Species code -> species. Null until a species list has been loaded successfully.
   private static Map<String, UniprotSpecies> sCodeIndex;

   // Optional user-supplied species list; when null the bundled resource is used.
   private static File sSpeciesFile;

   private static final String SPECIES_FILE = "rsrc/speclist.txt.gz";

   // Locus names look like '<entry>_<species code>'; group 1 is the species code.
   private static final Pattern LOCUS_PATTERN  = Pattern.compile("^\\S+_(\\S+)$");

   // Entry line: code, kingdom letter, taxon id (or '?'), then 'N=<scientific name>'.
   private static final Pattern N_LINE_PATTERN = Pattern.compile("^(\\S{3,5})\\s+(\\w)\\s+(\\d+|\\?+):\\s+N=(.+)");

   // Continuation lines carrying the common name and the synonym of the current entry.
   private static final Pattern C_LINE_PATTERN = Pattern.compile("^\\s+C=(.+)");
   private static final Pattern S_LINE_PATTERN = Pattern.compile("^\\s+S=(.+)");

   //**************************************************************************
   // CONSTRUCTORS
   //**************************************************************************

   //--------------------------------------------------------------------------
   // Instances are only created by the species list parser, which also
   // registers them in the index it is building.
   private UniprotSpecies(String inCode)
   {
      mSpeciesCode = inCode;
   }

   //**************************************************************************
   // PUBLIC METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    * Used to load a newer version of the speclist.txt file. The file may be
    * gzip compressed (detected by a '.gz' file name suffix). File found at
    * ftp://www.expasy.ch/databases/uniprot/knowledgebase/docs/speclist.txt
    * <p>
    * The file is not read here; it is parsed on the next lookup.
    *
    * @param inValue the file to use as the source of Uniprot species data
    */
   public static void setSpeciesListFile(File inValue)
   {
      sSpeciesFile = inValue;

      // Clear the index so that the new file will be loaded
      sCodeIndex = null;
   }

   //--------------------------------------------------------------------------
   /**
    * Retrieves the UniprotSpecies for the specified locus (Ex: 'TNF_HUMAN').
    * The species code is taken from the text after the last underscore and is
    * matched case-insensitively.
    *
    * @param inValue the Uniprot locus name for the species object to return
    * @return the species object corresponding to the specified Uniprot locus,
    *         or null if the species code is not in the species list
    * @throws IllegalArgumentException if the locus is null or is not of the form
    *         '&lt;name&gt;_&lt;species code&gt;'
    * @throws RuntimeException if the species list cannot be found or read
    */
   public static UniprotSpecies getByLocus(String inValue)
   {
      initialize();

      Matcher m = (inValue != null ? LOCUS_PATTERN.matcher(inValue) : null);
      if (null == m || !m.matches())
      {
         throw new IllegalArgumentException("The locus '" + inValue + "' is not in the proper format!");
      }

      // Locale.ROOT keeps the upper-casing independent of the default locale
      // (a Turkish locale would otherwise turn 'i' into a dotted capital I).
      return sCodeIndex.get(m.group(1).toUpperCase(Locale.ROOT));
   }

   //--------------------------------------------------------------------------
   /**
    * Retrieves the UniprotSpecies for the specified species code
    * (Ex: 'HUMAN' from the locus 'TNF_HUMAN'). The lookup is case-insensitive.
    *
    * @param inValue the Uniprot code for the species object to return
    * @return the species object corresponding to the specified Uniprot code,
    *         or null if the code is null or not in the species list
    * @throws RuntimeException if the species list cannot be found or read
    */
   public static UniprotSpecies getByCode(String inValue)
   {
      initialize();

      // A null check is all that is needed: indexed codes are 3-5 non-whitespace
      // characters, so an empty or blank value can never match a key.
      return (inValue != null ? sCodeIndex.get(inValue.toUpperCase(Locale.ROOT)) : null);
   }

   //--------------------------------------------------------------------------
   /**
    * @return the species code (Ex: 'HUMAN')
    */
   public String getSpeciesCode()
   {
      return mSpeciesCode;
   }

   //--------------------------------------------------------------------------
   /**
    * @return the single-character kingdom code from the species list entry
    */
   public char getKingdomCode()
   {
      return mKingdomCode;
   }

   //--------------------------------------------------------------------------
   /**
    * Returns the taxon id. The value is equal to the NCBI taxon id.
    *
    * @return the taxon id, or null if the species list gives '?' for this entry
    */
   public Integer getTaxonId()
   {
      return mTaxonId;
   }

   //--------------------------------------------------------------------------
   /**
    * @return the scientific name (the 'N=' value of the species list entry)
    */
   public String getScientificName()
   {
      return mScientificName;
   }

   //--------------------------------------------------------------------------
   /**
    * @return the common name (the 'C=' value), or null if the entry has none
    */
   public String getCommonName()
   {
      return mCommonName;
   }

   //--------------------------------------------------------------------------
   /**
    * @return the synonym (the 'S=' value), or null if the entry has none
    */
   public String getSynonym()
   {
      return mSynonym;
   }

   //**************************************************************************
   // PRIVATE METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   // Loads the species list if no index is currently available.
   private static void initialize()
   {
      if (null == sCodeIndex)
      {
         // Fill a local map and publish it only once parsing has succeeded.
         // A failed load therefore leaves the index unset, so the next lookup
         // retries instead of silently answering from an empty or partial map.
         Map<String, UniprotSpecies> index = new HashMap<String, UniprotSpecies>(15000);
         parseSpeciesFile(index);
         sCodeIndex = index;

         System.out.println(sCodeIndex.size() + " Uniprot species codes loaded");
      }
   }

   //--------------------------------------------------------------------------
   // Wraps the stream for gzip decompression, closing it if the gzip header can't be read.
   private static InputStream gunzip(InputStream inStream)
      throws IOException
   {
      try
      {
         return new GZIPInputStream(inStream);
      }
      catch (IOException e)
      {
         try
         {
            inStream.close();
         }
         catch (IOException ignored)
         {
            // The gzip failure is the error worth reporting.
         }

         throw e;
      }
   }

   //--------------------------------------------------------------------------
   // Opens the specified file, decompressing it if its name ends with '.gz'.
   private static InputStream getFileStream(File inFile)
      throws IOException
   {
      if (!inFile.exists())
      {
         throw new RuntimeException("'" + inFile + "' doesn't exist!");
      }

      InputStream stream = new FileInputStream(inFile);

      return (inFile.getName().endsWith(".gz") ? gunzip(stream) : stream);
   }

   //--------------------------------------------------------------------------
   // Opens the specified class-path resource, decompressing it if its name ends with '.gz'.
   private static InputStream getResourceStream(String inResource)
      throws IOException
   {
      InputStream stream = UniprotSpecies.class.getResourceAsStream(inResource);
      if (null == stream)
      {
         throw new RuntimeException("'" + inResource + "' couldn't be found!");
      }

      return (inResource.endsWith(".gz") ? gunzip(stream) : stream);
   }

   //--------------------------------------------------------------------------
   // Returns a reader over the user-supplied species file or, failing that, the bundled resource.
   private static BufferedReader getSpeciesReader()
      throws IOException
   {
      InputStream stream = (sSpeciesFile != null ? getFileStream(sSpeciesFile)
                                                 : getResourceStream(SPECIES_FILE));

      return new BufferedReader(new InputStreamReader(stream));
   }

   //--------------------------------------------------------------------------
   // Parses the species list and registers every entry in inIndex under its species code.
   private static void parseSpeciesFile(Map<String, UniprotSpecies> inIndex)
   {
      UniprotSpecies entry = null;

      int lineCount = 0;
      try
      {
         BufferedReader fileReader = null;
         try
         {
            fileReader = getSpeciesReader();

            boolean inHeader = true;
            String line;
            while ((line = fileReader.readLine()) != null)
            {
               lineCount++;

               if (inHeader)
               {
                  // The underscore ruler marks the end of the header.
                  if (line.startsWith("_____ "))
                  {
                     inHeader = false;
                     continue;
                  }
               }
               else if (line.startsWith("--------"))
               {
                  // Hit the copyright at the end.
                  break;
               }

               Matcher m = N_LINE_PATTERN.matcher(line);
               if (m.matches())
               {
                  entry = new UniprotSpecies(m.group(1));
                  inIndex.put(m.group(1), entry);

                  entry.mKingdomCode = m.group(2).charAt(0);

                  // A taxon id of '?' is left as null.
                  if (!m.group(3).startsWith("?"))
                  {
                     entry.mTaxonId = Integer.parseInt(m.group(3));
                  }

                  entry.mScientificName = m.group(4);
               }
               else if (entry != null)
               {
                  // Continuation lines belong to the most recent entry.
                  m = C_LINE_PATTERN.matcher(line);
                  if (m.matches())
                  {
                     entry.mCommonName = m.group(1);
                  }
                  else
                  {
                     m = S_LINE_PATTERN.matcher(line);
                     if (m.matches())
                     {
                        entry.mSynonym = m.group(1);
                     }
                  }
               }
            }
         }
         finally
         {
            if (fileReader != null) fileReader.close();
         }
      }
      catch (IOException e)
      {
         throw new RuntimeException("Error parsing species file. line: " + lineCount, e);
      }
   }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy