com.hfg.bio.taxonomy.UniprotSpecies Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.taxonomy;
import com.hfg.util.StringUtil;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
//------------------------------------------------------------------------------
/**
* Species class for Uniprot codes. Uses the speclist.txt provided with Uniprot
* to map the species codes that are a part of the locus name (Ex: 'HUMAN' in the locus 'TNF_HUMAN').
*
* @author J. Alex Taylor, hairyfatguy.com
*
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class UniprotSpecies
{
   //**************************************************************************
   // PRIVATE FIELDS
   //**************************************************************************

   private String  mSpeciesCode;
   private char    mKingdomCode;
   // Null when the species file lists the taxon id as '?'.
   private Integer mTaxonId;
   private String  mScientificName;
   private String  mCommonName;
   private String  mSynonym;

   // Species code -> species. Null until the first lookup (or after
   // setSpeciesListFile() resets it); only ever assigned a fully-parsed map.
   private static Map<String, UniprotSpecies> sCodeIndex;

   // Optional user-supplied species file. When null the bundled resource is used.
   private static File sSpeciesFile;

   private static final String SPECIES_FILE = "rsrc/speclist.txt.gz";

   // Compiled once; Pattern instances are immutable and safe to share.
   private static final Pattern LOCUS_PATTERN  = Pattern.compile("^\\S+_(\\S+)$");
   private static final Pattern N_LINE_PATTERN = Pattern.compile("^(\\S{3,5})\\s+(\\w)\\s+(\\d+|\\?+):\\s+N=(.+)");
   private static final Pattern C_LINE_PATTERN = Pattern.compile("^\\s+C=(.+)");
   private static final Pattern S_LINE_PATTERN = Pattern.compile("^\\s+S=(.+)");

   //**************************************************************************
   // CONSTRUCTORS
   //**************************************************************************

   //--------------------------------------------------------------------------
   // Instances are created only by the species file parser, which is also
   // responsible for registering them in the code index.
   private UniprotSpecies(String inCode)
   {
      mSpeciesCode = inCode;
   }

   //**************************************************************************
   // PUBLIC METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    Used to load a newer version of the speclist.txt file. The file may be
    gzip compressed (detected by a '.gz' file name suffix). File found at
    ftp://www.expasy.ch/databases/uniprot/knowledgebase/docs/speclist.txt
    <p>
    The file is not read here; the current index is discarded and the new file
    is parsed on the next lookup.
    </p>
    @param inValue the file to use as the source of Uniprot species data
    */
   public static synchronized void setSpeciesListFile(File inValue)
   {
      sSpeciesFile = inValue;

      // Clear the index so that the new file will be loaded
      sCodeIndex = null;
   }

   //--------------------------------------------------------------------------
   /**
    Retrieves the UniprotSpecies for the specified locus (Ex: 'TNF_HUMAN').
    The species code is everything after the last underscore and is matched
    case-insensitively.
    @param inValue the Uniprot locus name for the species object to return
    @return the species object corresponding to the specified Uniprot locus,
            or null if the species code is not present in the species file.
    @throws RuntimeException if the locus is null or is not of the form
            'NAME_CODE', or if the species file cannot be loaded
    */
   public static UniprotSpecies getByLocus(String inValue)
   {
      // Load first so that a species file problem is reported ahead of a format problem.
      Map<String, UniprotSpecies> index = getCodeIndex();

      Matcher m = (inValue != null ? LOCUS_PATTERN.matcher(inValue) : null);
      if (null == m || !m.matches())
      {
         throw new RuntimeException("The locus '" + inValue + "' is not in the proper format!");
      }

      return index.get(m.group(1).toUpperCase(Locale.ROOT));
   }

   //--------------------------------------------------------------------------
   /**
    Retrieves the UniprotSpecies for the specified species code (Ex: 'HUMAN' from the locus 'TNF_HUMAN').
    The code is matched case-insensitively.
    @param inValue the Uniprot code for the species object to return
    @return the species object corresponding to the specified Uniprot code,
            or null if the code is null or not present in the species file.
    @throws RuntimeException if the species file cannot be loaded
    */
   public static UniprotSpecies getByCode(String inValue)
   {
      Map<String, UniprotSpecies> index = getCodeIndex();

      // Locale.ROOT keeps the upper-casing independent of the default locale
      // (ex: a Turkish default would otherwise turn 'i' into a dotted capital I).
      return (inValue != null ? index.get(inValue.toUpperCase(Locale.ROOT)) : null);
   }

   //--------------------------------------------------------------------------
   /**
    @return the species code as it appears in the species file (Ex: 'HUMAN')
    */
   public String getSpeciesCode()
   {
      return mSpeciesCode;
   }

   //--------------------------------------------------------------------------
   /**
    @return the single-character kingdom code that follows the species code in the species file
    */
   public char getKingdomCode()
   {
      return mKingdomCode;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the taxon id. The value is equal to the NCBI taxon id.
    @return the taxon id, or null if the species file gives it as '?'
    */
   public Integer getTaxonId()
   {
      return mTaxonId;
   }

   //--------------------------------------------------------------------------
   /**
    @return the value of the entry's 'N=' field
    */
   public String getScientificName()
   {
      return mScientificName;
   }

   //--------------------------------------------------------------------------
   /**
    @return the value of the entry's 'C=' field, or null if the entry has none
    */
   public String getCommonName()
   {
      return mCommonName;
   }

   //--------------------------------------------------------------------------
   /**
    @return the value of the entry's last 'S=' field, or null if the entry has none
    */
   public String getSynonym()
   {
      return mSynonym;
   }

   //**************************************************************************
   // PRIVATE METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   // Returns the code index, parsing the species file on first use.
   // Synchronized together with setSpeciesListFile() so that callers never see
   // a reset or half-built index. The index is published only after a complete
   // parse, so a failed load is retried on the next call instead of leaving an
   // empty index behind.
   private static synchronized Map<String, UniprotSpecies> getCodeIndex()
   {
      if (null == sCodeIndex)
      {
         sCodeIndex = parseSpeciesFile();
         System.out.println(sCodeIndex.size() + " Uniprot species codes loaded");
      }

      return sCodeIndex;
   }

   //--------------------------------------------------------------------------
   private void setScientificName(String inValue)
   {
      mScientificName = inValue;
   }

   //--------------------------------------------------------------------------
   private void setCommonName(String inValue)
   {
      mCommonName = inValue;
   }

   //--------------------------------------------------------------------------
   private void setSynonym(String inValue)
   {
      mSynonym = inValue;
   }

   //--------------------------------------------------------------------------
   private void setTaxonId(int inValue)
   {
      mTaxonId = inValue;
   }

   //--------------------------------------------------------------------------
   private void setKingdomCode(char inValue)
   {
      mKingdomCode = inValue;
   }

   //--------------------------------------------------------------------------
   // Wraps the stream in a gzip decoder, closing the underlying stream if the
   // gzip header can't be read so that the file handle isn't leaked.
   private static InputStream gunzip(InputStream inStream)
      throws IOException
   {
      try
      {
         return new GZIPInputStream(inStream);
      }
      catch (IOException e)
      {
         try
         {
            inStream.close();
         }
         catch (IOException ignored)
         {
            // The gzip failure is the error worth reporting.
         }

         throw e;
      }
   }

   //--------------------------------------------------------------------------
   // Opens the specified file, transparently decompressing '.gz' files.
   private static InputStream getFileStream(File inFile)
      throws IOException
   {
      if (!inFile.exists())
      {
         throw new RuntimeException("'" + inFile + "' doesn't exist!");
      }

      InputStream stream = new FileInputStream(inFile);
      if (inFile.getName().endsWith(".gz"))
      {
         stream = gunzip(stream);
      }

      return stream;
   }

   //--------------------------------------------------------------------------
   // Opens a resource located relative to this class, transparently
   // decompressing '.gz' resources.
   private static InputStream getResourceStream(String inResource)
      throws IOException
   {
      InputStream stream = UniprotSpecies.class.getResourceAsStream(inResource);
      if (null == stream)
      {
         throw new RuntimeException("'" + inResource + "' couldn't be found!");
      }

      if (inResource.endsWith(".gz"))
      {
         stream = gunzip(stream);
      }

      return stream;
   }

   //--------------------------------------------------------------------------
   // Opens a reader on the user-supplied species file if one was set,
   // otherwise on the bundled resource.
   // NOTE(review): the reader uses the platform default charset, as before;
   // confirm the species file encoding before pinning one.
   private static BufferedReader getFileStream()
      throws IOException
   {
      InputStream stream;
      if (sSpeciesFile != null)
      {
         stream = getFileStream(sSpeciesFile);
      }
      else
      {
         stream = getResourceStream(SPECIES_FILE);
      }

      return new BufferedReader(new InputStreamReader(stream));
   }

   //--------------------------------------------------------------------------
   // Parses the species file into a new code -> species map.
   //
   // Each 'CODE K TAXON: N=name' line starts a new entry; following 'C=' and
   // 'S=' lines are attached to the most recent entry. Parsing stops at the
   // first line starting with '--------' seen after the '_____ ' column
   // separator line.
   private static Map<String, UniprotSpecies> parseSpeciesFile()
   {
      Map<String, UniprotSpecies> index = new HashMap<String, UniprotSpecies>(15000);

      UniprotSpecies entry = null;
      int lineCount = 0;
      try
      {
         BufferedReader fileReader = null;
         try
         {
            fileReader = getFileStream();

            boolean inHeader = true;
            String line;
            while ((line = fileReader.readLine()) != null)
            {
               lineCount++;

               if (inHeader)
               {
                  if (line.startsWith("_____ "))
                  {
                     inHeader = false;
                     continue;
                  }
                  // Other header lines deliberately fall through to the matchers
                  // so that a file without the separator line is still parsed.
               }
               else if (line.startsWith("--------"))
               {
                  // Hit the copyright at the end. (Only checked after the
                  // header, so dashed lines in the header don't end the parse.)
                  break;
               }

               Matcher m = N_LINE_PATTERN.matcher(line);
               if (m.matches())
               {
                  entry = new UniprotSpecies(m.group(1));
                  index.put(entry.getSpeciesCode(), entry);

                  entry.setKingdomCode(m.group(2).charAt(0));
                  if (!m.group(3).startsWith("?"))
                  {
                     entry.setTaxonId(Integer.parseInt(m.group(3)));
                  }
                  entry.setScientificName(m.group(4));
               }
               else if (entry != null)
               {
                  m = C_LINE_PATTERN.matcher(line);
                  if (m.matches())
                  {
                     entry.setCommonName(m.group(1));
                  }
                  else
                  {
                     m = S_LINE_PATTERN.matcher(line);
                     if (m.matches())
                     {
                        entry.setSynonym(m.group(1));
                     }
                  }
               }
            }
         }
         finally
         {
            if (fileReader != null) fileReader.close();
         }
      }
      catch (IOException e)
      {
         throw new RuntimeException("Error parsing species file. line: " + lineCount, e);
      }

      return index;
   }
}