// com.hfg.bio.taxonomy.NCBITaxon -- source listing from the com_hfg artifact (Maven / Gradle / Ivy).
// com.hfg: xml, html, svg, and bioinformatics utility library.
package com.hfg.bio.taxonomy;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import com.hfg.units.TimeUnit;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.ThreadUtil;
//------------------------------------------------------------------------------
/**
* Species class based on the NCBI taxonomy data.
* A default set of files are included as resources.
* May take several seconds to initialize from the full data files the first time
* an uncommon species is queried.
*
* @author J. Alex Taylor, hairyfatguy.com
*
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class NCBITaxon implements Comparable
{
// Class-wide logger; handed out to callers by getLogger().
private static final Logger LOGGER = Logger.getLogger(NCBITaxon.class.getName());
static
{
// initialize() must execute ahead of the following common value declarations.
// Those declarations call getByTaxonId(), which reads the id index that
// initialize() creates and fills.
initialize();
}
// Shortcuts to some common organisms. Each constant is resolved once, at class
// initialization, by looking up the NCBI taxon id passed to getByTaxonId().
/** Human (taxon id 9606) */
public static final NCBITaxon HOMO_SAPIENS = getByTaxonId(9606);
/** Mouse (taxon id 10090) */
public static final NCBITaxon MUS_MUSCULUS = getByTaxonId(10090);
/** Rat (taxon id 10116) */
public static final NCBITaxon RATTUS_NORVEGICUS = getByTaxonId(10116);
/** Hamster (taxon id 10029) */
public static final NCBITaxon CRICETULUS_GRISEUS = getByTaxonId(10029);
/** Cow (taxon id 9913) */
public static final NCBITaxon BOS_TAURUS = getByTaxonId(9913);
/** Xenopus (African clawed frog) (taxon id 8355) */
public static final NCBITaxon XENOPUS_LAEVIS = getByTaxonId(8355);
/** Drosophila (Fruit fly) (taxon id 7227) */
public static final NCBITaxon DROSOPHILA_MELANOGASTER = getByTaxonId(7227);
/** E. Coli (taxon id 562) */
public static final NCBITaxon ESCHERICHIA_COLI = getByTaxonId(562);
/** Yeast (taxon id 4932) */
public static final NCBITaxon SACCHAROMYCES_CEREVISIAE = getByTaxonId(4932);
/** Dog (taxon id 9615) */
public static final NCBITaxon CANIS_FAMILIARIS = getByTaxonId(9615);
/** Chimpanzee (taxon id 9598) */
public static final NCBITaxon PAN_TROGLODYTES = getByTaxonId(9598);
/** Rabbit (taxon id 9986) */
public static final NCBITaxon ORYCTOLAGUS_CUNICULUS = getByTaxonId(9986);
/** Rhesus monkey (taxon id 9544) */
public static final NCBITaxon MACACA_MULATTA = getByTaxonId(9544);
/** Camel (taxon id 9838) */
public static final NCBITaxon CAMELUS_DROMEDARIUS = getByTaxonId(9838);
// If you add to the common definitions above, also add the new constant to sCommonSet
// (populated in the static initializer further down) and regenerate the short dump files.
/** Mammals (taxon id 40674) */
public static final NCBITaxon MAMMALS = getByTaxonId(40674);
/** Primates (taxon id 9443) */
public static final NCBITaxon PRIMATES = getByTaxonId(9443);
/** Rodents (taxon id 9989) */
public static final NCBITaxon RODENTS = getByTaxonId(9989);
/** Unknown / unidentified (taxon id 32644) */
public static final NCBITaxon UNKNOWN = getByTaxonId(32644);
/** Synthetic construct / artificial sequence (taxon id 32630) */
public static final NCBITaxon SYNTHETIC_CONSTRUCT = getByTaxonId(32630);
//**************************************************************************
// PRIVATE FIELDS
//**************************************************************************
// ---- Per-taxon state ----
private int mTaxonId;
// Stays 0 until set; setParentTaxonId() refuses to make a taxon its own parent.
private int mParentTaxonId;
private String mScientificName;
private String mCommonName;
private String mGenBankCommonName;
private NCBITaxonNodeRank mNodeRank = NCBITaxonNodeRank.NO_RANK;
private NCBIGenBankDivision mDivision;
// private NCBIGeneticCode mGeneticCode;
// private NCBIGeneticCode mMitochondrialGeneticCode;
// private String mComments;

// ---- Shared (static) state ----
// Polled without holding the class lock by getByName()/getByTaxonId(), so it is
// volatile to guarantee that waiting threads observe the update.
private static volatile boolean sInitializing = false;
// Lowercased name -> taxon.
private static Map<String, NCBITaxon> sNameIndex;
// Decimal string form of the taxon id -> taxon.
private static Map<String, NCBITaxon> sIdIndex;
private static File sCustomNodesFile; // A user-specified file
private static File sCustomNamesFile; // A user-specified file
private static URL sCustomNodesURL; // A user-specified URL
private static URL sCustomNamesURL; // A user-specified URL
// Can take 30-60 sec to parse the full files on a really old CPU. The values for the common
// taxons specified above are kept in these MUCH shorter files. If the user
// sticks to the common values everything will be much quicker.
private static boolean sShortFileLoaded;
private static boolean sFullFileLoaded;
private static Set<NCBITaxon> sCommonSet = new HashSet<>();
// Matches "<scientific name> (<common name>)", e.g. 'Homo sapiens (human)'.
private static final Pattern sSnCnPattern = Pattern.compile("^(.*?)\\s+\\((.*?)\\)$");
private static boolean sRemoteInitialization;
private static URL sNCBI_URL;
private static final String NODES_FILE = "rsrc/nodes.dmp.gz";
private static final String NAMES_FILE = "rsrc/names.dmp.gz";
private static final String COMMON_NODES_FILE = "rsrc/nodes_short.dmp.gz";
private static final String COMMON_NAMES_FILE = "rsrc/names_short.dmp.gz";
private static final Random sRandom = new Random(System.currentTimeMillis());
private static final String NL = System.getProperty("line.separator");
// Sets the default remote dump location and registers the common-organism
// constants in sCommonSet (exposed through getCommonSet()).
static
{
   try
   {
      sNCBI_URL = new URL("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip");
   }
   catch (MalformedURLException e)
   {
      throw new RuntimeException(e);
   }
   sCommonSet.add(HOMO_SAPIENS);
   sCommonSet.add(MUS_MUSCULUS);
   sCommonSet.add(RATTUS_NORVEGICUS);
   sCommonSet.add(CRICETULUS_GRISEUS);
   sCommonSet.add(BOS_TAURUS);
   sCommonSet.add(XENOPUS_LAEVIS);
   sCommonSet.add(DROSOPHILA_MELANOGASTER);
   sCommonSet.add(ESCHERICHIA_COLI);
   sCommonSet.add(SACCHAROMYCES_CEREVISIAE);
   sCommonSet.add(CANIS_FAMILIARIS);
   sCommonSet.add(PAN_TROGLODYTES);
   sCommonSet.add(ORYCTOLAGUS_CUNICULUS);
   sCommonSet.add(MACACA_MULATTA);
   // Previously omitted even though it is declared with the other common organisms.
   sCommonSet.add(CAMELUS_DROMEDARIUS);
}
//**************************************************************************
// CONSTRUCTORS
//**************************************************************************
//--------------------------------------------------------------------------
private NCBITaxon(int inTaxonId)
{
mTaxonId = inTaxonId;
sIdIndex.put(inTaxonId + "", this);
}
//**************************************************************************
// PUBLIC FUNCTIONS
//**************************************************************************
//---------------------------------------------------------------------------
/**
 * Returns the logger used by this class.
 * @return the class-wide Logger instance
 */
public static Logger getLogger()
{
return LOGGER;
}
//--------------------------------------------------------------------------
/**
 * Switches the class to remote initialization, i.e. the taxonomy data is read
 * from the remote zip dump URL instead of the bundled resources. Useful for
 * staying up-to-date, but much slower to initialize and potentially more
 * dangerous if the format changes or there are network problems.
 * The full-data flag is cleared so that a later lookup miss triggers a reload.
 */
public static void enableRemoteInitialization()
{
   sFullFileLoaded = false;
   sRemoteInitialization = true;
}
//--------------------------------------------------------------------------
/**
 * Specifies the URL from which the zipped taxonomy dump is read when remote
 * initialization is enabled.
 * Defaults to ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip.
 * @param inValue the URL of the file to use as source of taxonomy data
 */
public static void setRemoteZipDumpFileURL(URL inValue)
{
   sNCBI_URL = inValue;
   final String message = "Remote Zip dump file URL set to " + StringUtil.singleQuote(sNCBI_URL);
   LOGGER.log(Level.FINE, message);
}
//--------------------------------------------------------------------------
/**
 * Used to load a newer version of NCBI's nodes.dmp file (available from
 * ftp://ftp.ncbi.nih.gov/pub/taxonomy/). The file may be
 * gzip compressed (detected by a ".gz" file name suffix when it is opened).
 * The indexes are discarded so the data is re-read on the next lookup.
 @param inValue the File containing the taxonomy nodes data
 */
public static synchronized void setNodesFile(File inValue)
{
sCustomNodesFile = inValue;
// Clear the indexes so that the new file will be loaded
clearIndexes();
}
//--------------------------------------------------------------------------
/**
 * Used to load a newer version of NCBI's names.dmp file (available from
 * ftp://ftp.ncbi.nih.gov/pub/taxonomy/). The file may be
 * gzip compressed (detected by a ".gz" file name suffix when it is opened).
 * The indexes are discarded so the data is re-read on the next lookup.
 @param inValue the File containing the taxonomy names data
 */
public static synchronized void setNamesFile(File inValue)
{
sCustomNamesFile = inValue;
// Clear the indexes so that the new file will be loaded
clearIndexes();
}
//--------------------------------------------------------------------------
/**
 * Used to load a newer version of NCBI's nodes.dmp file (available from
 * ftp://ftp.ncbi.nih.gov/pub/taxonomy/) via the specified URL. The content is
 * treated as gzip compressed when the URL path ends in ".gz". Useful if
 * integrating a newer nodes.dmp file as a resource.
 * Supplying a non-null URL replaces any custom nodes File set earlier.
 @param inValue the URL of the source containing the taxonomy nodes data
 */
public static synchronized void setNodesFile(URL inValue)
{
   sCustomNodesURL = inValue;
   if (inValue != null)
   {
      // getNodesReader() prefers a custom File over a custom URL, so a File
      // configured earlier would otherwise silently win over this URL.
      sCustomNodesFile = null;
   }
   // Clear the indexes so that the new source will be loaded
   clearIndexes();
}
//--------------------------------------------------------------------------
/**
 * Used to load a newer version of NCBI's names.dmp file (available from
 * ftp://ftp.ncbi.nih.gov/pub/taxonomy/) via the specified URL. The content is
 * treated as gzip compressed when the URL path ends in ".gz". Useful if
 * integrating a newer names.dmp file as a resource.
 * Supplying a non-null URL replaces any custom names File set earlier.
 @param inValue the URL of the source containing the taxonomy names data
 */
public static synchronized void setNamesFile(URL inValue)
{
   sCustomNamesURL = inValue;
   if (inValue != null)
   {
      // getNamesReader() prefers a custom File over a custom URL, so a File
      // configured earlier would otherwise silently win over this URL.
      sCustomNamesFile = null;
   }
   // Clear the indexes so that the new source will be loaded
   clearIndexes();
}
//--------------------------------------------------------------------------
/**
 * Retrieves the NCBITaxon for the specified common name, scientific name,
 * or GenBank common name. The comparison is case-insensitive (the value is
 * lowercased before the lookup). If the name is not found and no custom names
 * file is in use, the data is (re)initialized once and the lookup retried.
 * As a last resort a value of the form 'Homo sapiens (human)' is accepted when
 * both parts resolve to the same taxon.
 @param inValue the species name (common or scientific) for the taxon object to return
 @return the taxon object corresponding to the specified name. Returns null if a match cannot be found.
 */
public static NCBITaxon getByName(String inValue)
{
   if (! StringUtil.isSet(inValue))
   {
      return null;
   }

   // Index keys are lowercased, so lowercase once here. (The former second
   // "maybe they're shouting" lookup repeated exactly this lookup and was removed.)
   final String key = inValue.toLowerCase();

   // Wait for an initialization in progress on another thread.
   while (sInitializing)
   {
      ThreadUtil.sleep(TimeUnit.second.getMilliseconds());
   }

   if (null == sNameIndex)
   {
      initialize();
   }

   NCBITaxon taxon = sNameIndex.get(key);
   if (null == taxon
       && null == sCustomNamesFile
       && ! sFullFileLoaded)
   {
      // Not in the data loaded so far; load the next (larger) data set and retry.
      initialize();
      taxon = sNameIndex.get(key);
   }

   if (null == taxon)
   {
      // Still haven't found it? Some sources have the scientific name
      // followed by the common name in parenthesis. ex: 'Homo sapiens (human)'
      // If both parts return the same taxon, call it a match.
      Matcher m = sSnCnPattern.matcher(key);
      if (m.matches())
      {
         NCBITaxon byFirstPart = sNameIndex.get(m.group(1));
         if (byFirstPart != null
             && byFirstPart == sNameIndex.get(m.group(2)))
         {
            taxon = byFirstPart;
         }
      }
   }

   return taxon;
}
//--------------------------------------------------------------------------
/**
 * Looks up a taxon by its NCBI taxon id. If the id is not in the data loaded
 * so far, no custom nodes file is set and the full data has not been loaded,
 * initialize() is invoked once more and the lookup is repeated.
 @param inValue the taxon id for the taxon object to retrieve
 @return the taxon object corresponding to the specified id, or null if the id is not in the index
 */
public static NCBITaxon getByTaxonId(int inValue)
{
   // Block (polling once per second) while another thread is initializing.
   while (sInitializing)
   {
      ThreadUtil.sleep(TimeUnit.second.getMilliseconds());
   }

   if (null == sIdIndex)
   {
      initialize();
   }

   // The id index is keyed by the decimal string form of the id.
   final String key = String.valueOf(inValue);
   NCBITaxon found = sIdIndex.get(key);
   if (found != null)
   {
      return found;
   }

   if (null == sCustomNodesFile
       && ! sFullFileLoaded)
   {
      initialize();
      found = sIdIndex.get(key);
   }

   return found;
}
//--------------------------------------------------------------------------
/**
 * Returns an unmodifiable view of the common taxons (those registered in
 * sCommonSet by the static initializer).
 @return the small collection of frequently used taxon objects
 */
public static Collection<NCBITaxon> getCommonSet()
{
   // Typed return (instead of a raw Collection) so callers need no casts.
   return Collections.unmodifiableCollection(sCommonSet);
}
//--------------------------------------------------------------------------
/**
 * Formats the taxon as "<id> <scientific name>", followed by
 * " (<GenBank common name>)" when a GenBank common name is set.
 */
@Override
public String toString()
{
   String text = mTaxonId + " " + mScientificName;
   if (mGenBankCommonName != null)
   {
      text += " (" + mGenBankCommonName + ")";
   }
   return text;
}
//--------------------------------------------------------------------------
/**
 * @return the NCBI taxon id this object was created with
 */
public int getTaxonId()
{
return mTaxonId;
}
//--------------------------------------------------------------------------
/**
 * Builds a "; "-delimited lineage string by recursing through the parent
 * taxons and then appending this taxon's rank and scientific name. Taxons
 * whose rank is NO_RANK contribute nothing of their own.
 * @return the lineage string (may be empty)
 */
public String getFullTaxonomy()
{
   StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("; ");
   // A parent id of 1 is the root and is not included. A parent id of 0 means the
   // parent was never set (e.g. the root itself, since a taxon cannot be its own parent).
   if (mParentTaxonId != 1
       && mParentTaxonId != 0)
   {
      NCBITaxon parentTaxon = getParentTaxon();
      // The parent may be absent from the loaded data; previously this threw a NullPointerException.
      if (parentTaxon != null)
      {
         buffer.append(parentTaxon.getFullTaxonomy());
      }
   }
   if (getTaxonomyRank() != NCBITaxonNodeRank.NO_RANK)
   {
      buffer.delimitedAppend(getTaxonomyRank());
      buffer.append(" ");
      buffer.append(getScientificName());
   }
   return buffer.toString();
}
//--------------------------------------------------------------------------
/**
 * Tests whether the specified taxon is a (strict) ancestor of this taxon.
 * Walks up the parent chain until a parent is missing or the taxon with id 1
 * has been reached.
 * @param inTaxon2 the candidate ancestor
 * @return true if inTaxon2 equals one of this taxon's ancestors
 */
public boolean isSubtaxonOf(NCBITaxon inTaxon2)
{
   NCBITaxon ancestor = this;
   while (ancestor != null
          && ancestor.getTaxonId() != 1)
   {
      ancestor = ancestor.getParentTaxon();
      if (ancestor != null
          && ancestor.equals(inTaxon2))
      {
         return true;
      }
   }
   return false;
}
//--------------------------------------------------------------------------
/**
 * Finds the closest taxon shared by this taxon's lineage and the lineage of
 * the specified taxon (each lineage includes the taxon itself).
 * @param inTaxon2 the taxon to compare against
 * @return the first taxon on inTaxon2's branch that is also on this taxon's branch, or null if there is none
 */
public NCBITaxon getFirstCommonTaxon(NCBITaxon inTaxon2)
{
   // Collect this taxon and all of its ancestors.
   Set<NCBITaxon> lineage = new HashSet<>();
   for (NCBITaxon step = this; step != null; step = step.getParentTaxon())
   {
      lineage.add(step);
   }

   // Now walk up the 2nd taxon's branch until we find a taxon in common.
   for (NCBITaxon step = inTaxon2; step != null; step = step.getParentTaxon())
   {
      if (lineage.contains(step))
      {
         return step;
      }
   }
   return null;
}
//--------------------------------------------------------------------------
/**
 * Resolves the parent taxon by looking up the stored parent id via getByTaxonId().
 * @return the parent taxon, or null if the parent id is not in the index
 */
public NCBITaxon getParentTaxon()
{
return getByTaxonId(mParentTaxonId);
}
//--------------------------------------------------------------------------
/**
 * @return the stored parent taxon id (0 if it was never set)
 */
public int getParentTaxonId()
{
return mParentTaxonId;
}
//--------------------------------------------------------------------------
/**
 * @return the scientific name, or null if none has been set
 */
public String getScientificName()
{
return mScientificName;
}
//--------------------------------------------------------------------------
/**
 * @return the common name, or null if none has been set
 */
public String getCommonName()
{
return mCommonName;
}
//--------------------------------------------------------------------------
/**
 * Sets the common name and registers it (lowercased) in the name index so the
 * taxon can be found via getByName().
 * @param inValue the common name; null clears the name and registers nothing
 * @return this taxon, to allow call chaining
 */
public NCBITaxon setCommonName(String inValue)
{
   mCommonName = inValue;
   // Guard against a null name and against the index having been cleared
   // (clearIndexes() sets it to null); both previously caused a NullPointerException.
   if (inValue != null
       && sNameIndex != null)
   {
      sNameIndex.put(inValue.toLowerCase(), this);
   }
   return this;
}
//--------------------------------------------------------------------------
/**
 * @return the GenBank common name, or null if none has been set
 */
public String getGenBankCommonName()
{
return mGenBankCommonName;
}
//--------------------------------------------------------------------------
/**
 * @return the taxonomy rank of this node (NO_RANK unless a rank has been set)
 */
public NCBITaxonNodeRank getTaxonomyRank()
{
return mNodeRank;
}
//--------------------------------------------------------------------------
/**
 * @return the GenBank division, or null if none has been set
 */
public NCBIGenBankDivision getDivision()
{
return mDivision;
}
/*
//--------------------------------------------------------------------------
public String getComments()
{
return mComments;
}
*/
//--------------------------------------------------------------------------
/**
 * Two taxons are equal when they are of the same class and carry the same taxon id.
 */
@Override
public boolean equals(Object inObj)
{
   if (this == inObj)
   {
      return true;
   }
   if (inObj == null || getClass() != inObj.getClass())
   {
      return false;
   }
   return mTaxonId == ((NCBITaxon) inObj).mTaxonId;
}
//--------------------------------------------------------------------------
/**
 * Uses the taxon id as the hash, consistent with equals() which compares taxon ids.
 */
@Override
public int hashCode()
{
return mTaxonId;
}
//--------------------------------------------------------------------------
/**
 * Orders taxons by GenBank division name, then by taxon id. A taxon without a
 * division (or without a division name) sorts before one that has a division.
 * <p>
 * Two taxons that both lack a division are now ordered by taxon id. Previously
 * this case always returned -1, even when comparing a taxon to itself, which
 * violated the Comparable contract.
 * @param inObj the taxon to compare to; null is treated as sorting before this taxon
 * @return a negative value, zero, or a positive value
 */
public int compareTo(NCBITaxon inObj)
{
   if (null == inObj)
   {
      return 1;
   }

   boolean hasDivision      = (mDivision != null && mDivision.name() != null);
   boolean otherHasDivision = (inObj.mDivision != null && inObj.mDivision.name() != null);

   int result = 0;
   if (hasDivision && otherHasDivision)
   {
      result = mDivision.name().compareTo(inObj.mDivision.name());
   }
   else if (hasDivision)
   {
      result = 1;
   }
   else if (otherHasDivision)
   {
      result = -1;
   }

   if (0 == result)
   {
      // Same (or no) division: fall back to the taxon id.
      result = Integer.compare(mTaxonId, inObj.mTaxonId);
   }
   return result;
}
//--------------------------------------------------------------------------
/**
 * Creates a subset of the nodes file containing just the lines whose first
 * field (the taxon id) is in the specified set. Parent taxon ids are NOT added
 * automatically; include them in the set if they are needed.
 * The source is whatever getNodesReader() currently selects.
 @param inTaxonIds the ids to extract from the taxonomy data
 @param inDestFile the nodes file to which the extracted taxon data should be written
 @throws RuntimeException wrapping any IOException encountered
 */
public static void extractFromNodesFile(Set<Integer> inTaxonIds, File inDestFile)
{
   // The reader is opened first so the destination is not truncated if the source can't be opened.
   try (BufferedReader nodeReader = getNodesReader();
        Writer fileWriter = new FileWriter(inDestFile))
   {
      String line;
      while ((line = nodeReader.readLine()) != null)
      {
         if (line.trim().isEmpty())
         {
            continue; // Tolerate blank lines
         }
         // The taxon id is the text ahead of the first '|' delimiter.
         int pipeIndex = line.indexOf('|');
         String idField = (pipeIndex >= 0 ? line.substring(0, pipeIndex) : line).trim();
         if (inTaxonIds.contains(Integer.valueOf(idField)))
         {
            fileWriter.write(line);
            fileWriter.write(NL);
         }
      }
   }
   catch (IOException e)
   {
      throw new RuntimeException("Error parsing node file.", e);
   }
}
//--------------------------------------------------------------------------
/**
 * Creates a subset of the names file containing just the lines whose first
 * field (the taxon id) is in the specified set. Parent taxon ids are NOT added
 * automatically; include them in the set if they are needed.
 * The source is whatever getNamesReader() currently selects.
 @param inTaxonIds the ids to extract from the taxonomy data
 @param inDestFile the names file to which the extracted taxon data should be written
 @throws RuntimeException wrapping any IOException encountered
 */
public static void extractFromNamesFile(Set<Integer> inTaxonIds, File inDestFile)
{
   // The reader is opened first so the destination is not truncated if the source can't be opened.
   try (BufferedReader namesReader = getNamesReader();
        Writer fileWriter = new FileWriter(inDestFile))
   {
      String line;
      while ((line = namesReader.readLine()) != null)
      {
         if (line.trim().isEmpty())
         {
            continue; // Tolerate blank lines
         }
         // The taxon id is the text ahead of the first '|' delimiter.
         int pipeIndex = line.indexOf('|');
         String idField = (pipeIndex >= 0 ? line.substring(0, pipeIndex) : line).trim();
         if (inTaxonIds.contains(Integer.valueOf(idField)))
         {
            fileWriter.write(line);
            fileWriter.write(NL);
         }
      }
   }
   catch (IOException e)
   {
      throw new RuntimeException("Error parsing names file.", e);
   }
}
//**************************************************************************
// PROTECTED METHODS
//**************************************************************************
//--------------------------------------------------------------------------
/**
 * Returns a randomly chosen taxon from the id index.
 * Not efficient (it iterates to the chosen position) but it is just for testing purposes.
 * @return a taxon picked at random from the currently loaded data
 */
protected static NCBITaxon getRandomValue()
{
   int index = sRandom.nextInt(sIdIndex.size());
   // Typed iterator so that next() yields an NCBITaxon rather than Object.
   Iterator<NCBITaxon> iter = sIdIndex.values().iterator();
   for (int skipped = 0; skipped < index; skipped++)
   {
      iter.next();
   }
   return iter.next();
}
//**************************************************************************
// PRIVATE METHODS
//**************************************************************************
//--------------------------------------------------------------------------
/**
 * Discards the name and id indexes and marks the full data set as not loaded.
 * getByName() and getByTaxonId() call initialize() when they find a null index.
 * Note that sShortFileLoaded is left untouched here.
 */
protected static void clearIndexes()
{
sNameIndex = null;
sIdIndex = null;
sFullFileLoaded = false;
}
//--------------------------------------------------------------------------
/**
 * Loads the taxonomy data into fresh indexes. When remote initialization is
 * enabled the data comes from initializeRemote(); otherwise the nodes and names
 * sources chosen by getNodesReader()/getNamesReader() are parsed. The first
 * completed call sets sShortFileLoaded, a later one sets sFullFileLoaded.
 * <p>
 * sInitializing is raised only while work is actually in progress and is always
 * lowered again. Previously it stayed true after the early return and after any
 * exception, which made every later getByName()/getByTaxonId() call wait forever.
 */
private static synchronized void initialize()
{
   if (sFullFileLoaded)
   {
      // Another thread must have just initialized the data maps.
      return;
   }

   sInitializing = true;
   try
   {
      if (sRemoteInitialization)
      {
         initializeRemote();
      }
      else
      {
         // Size the maps for the data set about to be read (full vs. short files).
         sNameIndex = new HashMap<>(sShortFileLoaded ? 270000 : 500);
         sIdIndex = new HashMap<>(sShortFileLoaded ? 200000 : 500);
         parseNodesFile();
         parseNamesFile();
      }

      if (sShortFileLoaded)
      {
         sFullFileLoaded = true;
      }
      else
      {
         sShortFileLoaded = true;
      }
   }
   finally
   {
      sInitializing = false;
   }
}
//--------------------------------------------------------------------------
/**
 * Records the parent taxon id. A value equal to this taxon's own id is ignored
 * because a taxon can't be its own parent.
 * @param inValue the parent taxon id
 */
private void setParentTaxonId(int inValue)
{
   if (inValue == mTaxonId)
   {
      return;
   }
   mParentTaxonId = inValue;
}
//--------------------------------------------------------------------------
// Stores the scientific name. Unlike setCommonName() this does not touch the name index.
private void setScientificName(String inValue)
{
mScientificName = inValue;
}
//--------------------------------------------------------------------------
// Stores the GenBank common name (shown in parentheses by toString()).
private void setGenBankCommonName(String inValue)
{
mGenBankCommonName = inValue;
}
//--------------------------------------------------------------------------
// Stores the taxonomy rank of this node.
private void setTaxonomyRank(NCBITaxonNodeRank inValue)
{
mNodeRank = inValue;
}
//--------------------------------------------------------------------------
// Stores the GenBank division (used as the primary sort key by compareTo()).
private void setDivision(NCBIGenBankDivision inValue)
{
mDivision = inValue;
}
/*
//--------------------------------------------------------------------------
private void setComments(String inValue)
{
mComments = inValue;
}
*/
//--------------------------------------------------------------------------
/**
 * Opens the specified file, transparently gunzipping it if its name ends in ".gz".
 * @param inFile the file to open
 * @return a stream over the (decompressed) file content
 * @throws IOException if the file cannot be opened or is not valid gzip data
 * @throws RuntimeException if the file does not exist
 */
private static InputStream getFileStream(File inFile)
throws IOException
{
   if (!inFile.exists())
   {
      throw new RuntimeException("'" + inFile + "' doesn't exist!");
   }
   InputStream rawStream = new FileInputStream(inFile);
   if (! inFile.getName().endsWith(".gz"))
   {
      return rawStream;
   }
   try
   {
      return new GZIPInputStream(rawStream);
   }
   catch (IOException | RuntimeException e)
   {
      // The GZIP constructor reads the header and can fail; don't leak the file handle.
      try
      {
         rawStream.close();
      }
      catch (IOException ignored)
      {
         // The original failure is the one worth reporting.
      }
      throw e;
   }
}
//--------------------------------------------------------------------------
/**
 * Opens the named class resource (resolved relative to NCBITaxon.class),
 * transparently gunzipping it if the name ends in ".gz".
 * @param inResource the resource name
 * @return a stream over the (decompressed) resource content
 * @throws IOException if the resource is not valid gzip data
 * @throws RuntimeException if the resource cannot be found
 */
private static InputStream getResourceStream(String inResource)
throws IOException
{
   InputStream rawStream = NCBITaxon.class.getResourceAsStream(inResource);
   if (null == rawStream)
   {
      throw new RuntimeException("'" + inResource + "' couldn't be found!");
   }
   if (! inResource.endsWith(".gz"))
   {
      return rawStream;
   }
   try
   {
      return new GZIPInputStream(rawStream);
   }
   catch (IOException | RuntimeException e)
   {
      // The GZIP constructor reads the header and can fail; don't leak the resource stream.
      try
      {
         rawStream.close();
      }
      catch (IOException ignored)
      {
         // The original failure is the one worth reporting.
      }
      throw e;
   }
}
//--------------------------------------------------------------------------
/**
 * Opens a reader over the nodes data. Sources are tried in this order: the
 * custom nodes file, the custom nodes URL, then the bundled resources (the full
 * file once the short file has been loaded, otherwise the short file).
 * @return a buffered reader over the selected nodes data
 * @throws IOException if the selected source cannot be opened
 */
private static BufferedReader getNodesReader()
throws IOException
{
   final InputStream source;
   if (sCustomNodesFile != null)
   {
      LOGGER.log(Level.FINE, "Initializing from the custom nodes file " + StringUtil.singleQuote(sCustomNodesFile) + " ...");
      source = getFileStream(sCustomNodesFile);
   }
   else if (sCustomNodesURL != null)
   {
      LOGGER.log(Level.FINE, "Initializing from the custom nodes url " + StringUtil.singleQuote(sCustomNodesURL) + " ...");
      InputStream urlStream = sCustomNodesURL.openStream();
      boolean gzipped = sCustomNodesURL.getPath().endsWith(".gz");
      source = (gzipped ? new GZIPInputStream(urlStream) : urlStream);
   }
   else
   {
      // Bundled resources: the short (common) file first, the full file afterwards.
      final boolean loadFull = sShortFileLoaded;
      LOGGER.log(Level.FINE, loadFull ? "Initializing from the full nodes file..."
                                      : "Initializing from the common nodes file...");
      source = getResourceStream(loadFull ? NODES_FILE : COMMON_NODES_FILE);
   }
   return new BufferedReader(new InputStreamReader(source), 8192);
}
//--------------------------------------------------------------------------
/**
 * Opens a reader over the names data. Sources are tried in this order: the
 * custom names file, the custom names URL, then the bundled resources (the full
 * file once the short file has been loaded, otherwise the short file).
 * The selected source is logged at FINE level, mirroring getNodesReader().
 * @return a buffered reader over the selected names data
 * @throws IOException if the selected source cannot be opened
 */
private static BufferedReader getNamesReader()
throws IOException
{
   InputStream stream;
   if (sCustomNamesFile != null)
   {
      LOGGER.log(Level.FINE, "Initializing from the custom names file " + StringUtil.singleQuote(sCustomNamesFile) + " ...");
      stream = getFileStream(sCustomNamesFile);
   }
   else if (sCustomNamesURL != null)
   {
      LOGGER.log(Level.FINE, "Initializing from the custom names url " + StringUtil.singleQuote(sCustomNamesURL) + " ...");
      stream = sCustomNamesURL.openStream();
      if (sCustomNamesURL.getPath().endsWith(".gz"))
      {
         stream = new GZIPInputStream(stream);
      }
   }
   else if (sShortFileLoaded)
   {
      // Load the full file
      LOGGER.log(Level.FINE, "Initializing from the full names file...");
      stream = getResourceStream(NAMES_FILE);
   }
   else
   {
      // Load the common (short) file
      LOGGER.log(Level.FINE, "Initializing from the common names file...");
      stream = getResourceStream(COMMON_NAMES_FILE);
   }
   return new BufferedReader(new InputStreamReader(stream), 1024 * 8);
}
//--------------------------------------------------------------------------
/**
 * Opens the current nodes source and parses it into the id index.
 * try-with-resources is used so that a failure while closing the reader can no
 * longer replace (mask) an exception thrown by the parse itself.
 * @throws RuntimeException wrapping any IOException encountered
 */
private static void parseNodesFile()
{
   try (BufferedReader nodeReader = getNodesReader())
   {
      innerParseNodesFile(nodeReader);
   }
   catch (IOException e)
   {
      throw new RuntimeException("Error parsing node file.", e);
   }
}
//--------------------------------------------------------------------------
/**
 * Opens the current names source and parses it into the name index.
 * try-with-resources is used so that a failure while closing the reader can no
 * longer replace (mask) an exception thrown by the parse itself.
 * @throws RuntimeException wrapping any IOException encountered
 */
private static void parseNamesFile()
{
   try (BufferedReader namesReader = getNamesReader())
   {
      innerParseNamesFile(namesReader);
   }
   catch (IOException e)
   {
      // The message previously said "node file", which pointed at the wrong input.
      throw new RuntimeException("Error parsing names file.", e);
   }
}
//--------------------------------------------------------------------------
/**
 * Parses '|'-delimited node records, one per line, creating or updating the
 * taxon for each record with its parent id, rank and GenBank division.
 * Every line must contain exactly 13 fields.
 * @param inReader the reader supplying the node records; not closed here
 * @throws IOException if reading fails
 * @throws RuntimeException on a wrong field count, unrecognized rank or unrecognized division
 */
private static void innerParseNodesFile(BufferedReader inReader)
throws IOException
{
   int lineCount = 0;
   for (String line = inReader.readLine(); line != null; line = inReader.readLine())
   {
      lineCount++;

      // The StringTokenizer actually seems to perform slightly better than split() here.
      StringTokenizer fields = new StringTokenizer(line, "|");
      final int fieldCount = fields.countTokens();
      if (fieldCount != 13)
      {
         throw new RuntimeException("Found " + fieldCount
                                    + " fields instead of 13 on line "
                                    + lineCount + " of " + NODES_FILE);
      }

      // Field 1: taxon id. Reuse an existing taxon if the id is already indexed.
      final int taxonId = Integer.parseInt(fields.nextToken().trim());
      NCBITaxon taxon = sIdIndex.get(String.valueOf(taxonId));
      if (null == taxon)
      {
         taxon = new NCBITaxon(taxonId);
      }

      // Field 2: parent taxon id.
      final int parentId = Integer.parseInt(fields.nextToken().trim());
      taxon.setParentTaxonId(parentId);

      // Field 3: rank.
      final String nodeRankString = fields.nextToken().trim();
      final NCBITaxonNodeRank nodeRank = NCBITaxonNodeRank.valueOf(nodeRankString);
      if (null == nodeRank)
      {
         throw new RuntimeException("Unrecognized taxonomy rank: " + nodeRankString
                                    + "\nNode file line " + lineCount + ": '" + line + "'");
      }
      taxon.setTaxonomyRank(nodeRank);

      // Field 4: EMBL code (skipped).
      fields.nextToken();

      // Field 5: GenBank division id.
      final int divisionId = Integer.parseInt(fields.nextToken().trim());
      final NCBIGenBankDivision division = NCBIGenBankDivision.valueOf(divisionId);
      if (null == division)
      {
         throw new RuntimeException("Unrecognized GenBank division: " + divisionId
                                    + "\nNode file line " + lineCount + ": '" + line + "'");
      }
      taxon.setDivision(division);
   }
   LOGGER.log(Level.FINE, sIdIndex.size() + " taxons loaded");
}
//--------------------------------------------------------------------------
/**
 * Parses '|'-delimited name records, one per line, and registers every name
 * (lowercased) in the name index. Scientific, common and GenBank common names
 * are additionally stored on the taxon itself. Lines that do not split into
 * exactly 4 fields are reported through the class logger and skipped.
 * @param inReader the reader supplying the name records; not closed here
 * @throws IOException if reading fails
 * @throws RuntimeException on an unrecognized name class
 */
private static void innerParseNamesFile(BufferedReader inReader)
throws IOException
{
   int lineCount = 0;
   String line;
   while ((line = inReader.readLine()) != null)
   {
      lineCount++;
      String pieces[] = line.split("\\|");
      if (pieces.length != 4)
      {
         // Reported via the logger (like the rest of the class) rather than System.err.
         LOGGER.log(Level.WARNING, "Found " + pieces.length + " fields instead of 4 on line " + lineCount
                                   + " of " + NAMES_FILE + ": " + StringUtil.singleQuote(line));
         continue;
      }
      int taxonId = Integer.parseInt(pieces[0].trim());
      NCBITaxon taxon = sIdIndex.get(taxonId + "");
      if (null == taxon)
      {
         // A name for an id without a node record: create the taxon rather than fail.
         taxon = new NCBITaxon(taxonId);
      }
      String name = pieces[1].trim();
      // pieces[2] is not used
      String nameClassString = pieces[3].trim();
      NCBITaxonNameClass nameClass = NCBITaxonNameClass.valueOf(nameClassString);
      if (null == nameClass)
      {
         throw new RuntimeException("Unrecognized name class: " + nameClassString
                                    + "\nNames file line " + lineCount + ": '" + line + "'");
      }
      if (nameClass == NCBITaxonNameClass.SCIENTIFIC_NAME)
      {
         taxon.setScientificName(name);
      }
      else if (nameClass == NCBITaxonNameClass.COMMON_NAME)
      {
         taxon.setCommonName(name);
      }
      else if (nameClass == NCBITaxonNameClass.GENBANK_COMMON_NAME)
      {
         taxon.setGenBankCommonName(name);
      }
      // Every name should be used as a reference to the taxon.
      sNameIndex.put(name.toLowerCase(), taxon);
   }
   LOGGER.log(Level.FINE, sNameIndex.size() + " names loaded");
}
//--------------------------------------------------------------------------
/**
 * Rebuilds both indexes from the zip archive at sNCBI_URL, parsing the entries
 * named "names.dmp" and "nodes.dmp" and ignoring all others.
 * The zip stream is now closed even when parsing fails, and directory entries
 * are recognized from the archive itself. Previously the check consulted the
 * local filesystem for a directory with the entry's name and, if one happened
 * to exist, stopped processing the archive.
 * @throws RuntimeException if the archive is empty or cannot be read
 */
private static synchronized void initializeRemote()
{
   clearIndexes();
   sNameIndex = new HashMap<>(500000);
   sIdIndex = new HashMap<>(400000);

   try (ZipInputStream zipInputStream = new ZipInputStream(sNCBI_URL.openStream()))
   {
      ZipEntry zipEntry = zipInputStream.getNextEntry();
      if (null == zipEntry)
      {
         throw new RuntimeException("Problem reading zipped taxonomy data from " + sNCBI_URL + " !");
      }
      LOGGER.log(Level.FINE, "Initializing from the remote URL " + StringUtil.singleQuote(sNCBI_URL) + " ...");

      while (zipEntry != null)
      {
         String entryName = zipEntry.getName();
         LOGGER.log(Level.FINE, "Zip entry:" + StringUtil.singleQuote(entryName));

         if (! zipEntry.isDirectory())
         {
            // The readers are deliberately not closed: closing them would close the
            // zip stream and prevent reading the remaining entries.
            if (entryName.equals("names.dmp"))
            {
               innerParseNamesFile(new BufferedReader(new InputStreamReader(zipInputStream)));
            }
            else if (entryName.equals("nodes.dmp"))
            {
               innerParseNodesFile(new BufferedReader(new InputStreamReader(zipInputStream)));
            }
         }

         zipInputStream.closeEntry();
         zipEntry = zipInputStream.getNextEntry();
      }
   }
   catch (IOException e)
   {
      throw new RuntimeException("Problem accessing taxonomy data from " + sNCBI_URL + " :", e);
   }
}
}