All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.taxonomy.NCBITaxon Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.taxonomy;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import com.hfg.units.TimeUnit;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.ThreadUtil;

//------------------------------------------------------------------------------
/**
 * Species class based on the NCBI taxonomy data.
 * A default set of files are included as resources.
 * May take several seconds to initialize from the full data files the first time
 * an uncommon species is queried.
 * 
* @author J. Alex Taylor, hairyfatguy.com *
*/ //------------------------------------------------------------------------------ // com.hfg XML/HTML Coding Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class NCBITaxon implements Comparable { private static final Logger LOGGER = Logger.getLogger(NCBITaxon.class.getName()); static { // initialize() must execute ahead of the following common value declarations. initialize(); } // Shortcuts to some common organisms /** Human */ public static final NCBITaxon HOMO_SAPIENS = getByTaxonId(9606); /** Mouse */ public static final NCBITaxon MUS_MUSCULUS = getByTaxonId(10090); /** Rat */ public static final NCBITaxon RATTUS_NORVEGICUS = getByTaxonId(10116); /** Hamster */ public static final NCBITaxon CRICETULUS_GRISEUS = getByTaxonId(10029); /** Cow */ public static final NCBITaxon BOS_TAURUS = getByTaxonId(9913); /** Xenopus (African clawed frog) */ public static final NCBITaxon XENOPUS_LAEVIS = getByTaxonId(8355); /** Drosophila (Fruit fly) */ public static final NCBITaxon DROSOPHILA_MELANOGASTER = getByTaxonId(7227); /** E. Coli */ public static final NCBITaxon ESCHERICHIA_COLI = getByTaxonId(562); /** Yeast */ public static final NCBITaxon SACCHAROMYCES_CEREVISIAE = getByTaxonId(4932); /** Dog */ public static final NCBITaxon CANIS_FAMILIARIS = getByTaxonId(9615); /** Chimpanzee */ public static final NCBITaxon PAN_TROGLODYTES = getByTaxonId(9598); /** Rabbit */ public static final NCBITaxon ORYCTOLAGUS_CUNICULUS = getByTaxonId(9986); /** Rhesus monkey */ public static final NCBITaxon MACACA_MULATTA = getByTaxonId(9544); /** Camel */ public static final NCBITaxon CAMELUS_DROMEDARIUS = getByTaxonId(9838); // If you add to these common defs, add to sCommonSet below and regenerate the short dump files. /** Mammals */ public static final NCBITaxon MAMMALS = getByTaxonId(40674); /** Primates */ public static final NCBITaxon PRIMATES = getByTaxonId(9443); /** Rodents */ public static final NCBITaxon RODENTS = getByTaxonId(9989); /** Unknown / unidentified */ public static final NCBITaxon UNKNOWN = getByTaxonId(32644); /** Synthetic construct / artificial sequence */ public static final NCBITaxon SYNTHETIC_CONSTRUCT = getByTaxonId(32630); //************************************************************************** // PRIVATE FIELDS //************************************************************************** private int mTaxonId; private int mParentTaxonId; private String mScientificName; private String mCommonName; private String mGenBankCommonName; private NCBITaxonNodeRank mNodeRank = NCBITaxonNodeRank.NO_RANK; private NCBIGenBankDivision mDivision; // private NCBIGeneticCode mGeneticCode; // private NCBIGeneticCode mMitochondrialGeneticCode; // private String mComments; private static boolean sInitializing = false; private static Map sNameIndex; private static Map sIdIndex; private static File sCustomNodesFile; // A user-specified file private static File sCustomNamesFile; // A user-specified file private static URL sCustomNodesURL; // A user-specified URL private static URL sCustomNamesURL; // A user-specified URL // Can take 30-60 sec to parse the full files on a really old CPU. The values for the common // taxons specified above are kept in these MUCH shorter files. If the user // sticks to the common values everything will be much quicker. private static boolean sShortFileLoaded; private static boolean sFullFileLoaded; private static Set sCommonSet = new HashSet(); private static final Pattern sSnCnPattern = Pattern.compile("^(.*?)\\s+\\((.*?)\\)$"); private static boolean sRemoteInitialization; private static URL sNCBI_URL; private static final String NODES_FILE = "rsrc/nodes.dmp.gz"; private static final String NAMES_FILE = "rsrc/names.dmp.gz"; private static final String COMMON_NODES_FILE = "rsrc/nodes_short.dmp.gz"; private static final String COMMON_NAMES_FILE = "rsrc/names_short.dmp.gz"; private static final Random sRandom = new Random(System.currentTimeMillis()); private static final String NL = System.getProperty("line.separator"); static { try { sNCBI_URL = new URL("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"); } catch (MalformedURLException e) { throw new RuntimeException(e); } sCommonSet.add(HOMO_SAPIENS); sCommonSet.add(MUS_MUSCULUS); sCommonSet.add(RATTUS_NORVEGICUS); sCommonSet.add(CRICETULUS_GRISEUS); sCommonSet.add(BOS_TAURUS); sCommonSet.add(XENOPUS_LAEVIS); sCommonSet.add(DROSOPHILA_MELANOGASTER); sCommonSet.add(ESCHERICHIA_COLI); sCommonSet.add(SACCHAROMYCES_CEREVISIAE); sCommonSet.add(CANIS_FAMILIARIS); sCommonSet.add(PAN_TROGLODYTES); sCommonSet.add(ORYCTOLAGUS_CUNICULUS); sCommonSet.add(MACACA_MULATTA); } //************************************************************************** // CONSTRUCTORS //************************************************************************** //-------------------------------------------------------------------------- private NCBITaxon(int inTaxonId) { mTaxonId = inTaxonId; sIdIndex.put(inTaxonId + "", this); } //************************************************************************** // PUBLIC FUNCTIONS //************************************************************************** //--------------------------------------------------------------------------- public static Logger getLogger() { return LOGGER; } //-------------------------------------------------------------------------- /** * Enables remote initialization of the taxonomy data. Useful for keeping up-to-date * but much slower to initialize and potentially more dangerous if the format changes * or there are network problems. */ public static void enableRemoteInitialization() { sRemoteInitialization = true; sFullFileLoaded = false; } //-------------------------------------------------------------------------- /** Specifies the URL to use for finding the taxonomy data if remote initialization is enabled. Defaults to ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip. @param inValue the URL of the file to use as source of taxonomy data */ public static void setRemoteZipDumpFileURL(URL inValue) { sNCBI_URL = inValue; LOGGER.log(Level.FINE, "Remote Zip dump file URL set to " + StringUtil.singleQuote(sNCBI_URL)); } //-------------------------------------------------------------------------- /** * Used to load a newer version of NCBI's nodes.dmp file (availible from * ftp://ftp.ncbi.nih.gov/pub/taxonomy/). The file may be * gzip compressed. @param inValue the File containing the taxonomy nodes data */ public static synchronized void setNodesFile(File inValue) { sCustomNodesFile = inValue; // Clear the indexes so that the new file will be loaded clearIndexes(); } //-------------------------------------------------------------------------- /** * Used to load a newer version of NCBI's names.dmp file. (availible from * ftp://ftp.ncbi.nih.gov/pub/taxonomy/). The file may be * gzip compressed. @param inValue the File containing the taxonomy names data */ public static synchronized void setNamesFile(File inValue) { sCustomNamesFile = inValue; // Clear the indexes so that the new file will be loaded clearIndexes(); } //-------------------------------------------------------------------------- /** * Used to load a newer version of NCBI's nodes.dmp file (availible from * ftp://ftp.ncbi.nih.gov/pub/taxonomy/) via the specified URL. The URL may be * gzip compressed. Useful if integrating a newer nodes.dmp file as a resource. @param inValue the URL of the source containing the taxonomy nodes data */ public static synchronized void setNodesFile(URL inValue) { sCustomNodesURL = inValue; // Clear the indexes so that the new file will be loaded clearIndexes(); } //-------------------------------------------------------------------------- /** * Used to load a newer version of NCBI's names.dmp file (availible from * ftp://ftp.ncbi.nih.gov/pub/taxonomy/) via the specified URL. The URL may be * gzip compressed. Useful if integrating a newer nodes.dmp file as a resource. @param inValue the URL of the source containing the taxonomy names data */ public static synchronized void setNamesFile(URL inValue) { sCustomNamesURL = inValue; // Clear the indexes so that the new file will be loaded clearIndexes(); } //-------------------------------------------------------------------------- /** * Retrieves the NCBITaxon for the specifed common name, scientific name, * or GenBank common name. @param inValue the species name (common or scientific) for the taxon object to return @return the taxon object corresponding to the specified name. Returns null if a match cannot be found. */ public static NCBITaxon getByName(String inValue) { NCBITaxon taxon = null; if (StringUtil.isSet(inValue)) { // Lowercase the value so we can compare the names case-insensitively. inValue = inValue.toLowerCase(); while (sInitializing) { ThreadUtil.sleep(TimeUnit.second.getMilliseconds()); } if (null == sNameIndex) { initialize(); } taxon = sNameIndex.get(inValue); if (null == taxon && null == sCustomNamesFile && ! sFullFileLoaded) { initialize(); taxon = sNameIndex.get(inValue); } if (null == taxon) { // Maybe they're shouting. Try lowercasing the string since many of the common names are lowercased. taxon = sNameIndex.get(inValue.toLowerCase()); if (null == taxon) { // Still haven't found it? Some sources have the scientific name // followed by the common name in parenthesis. ex: 'Homo sapiens (human)' // If both parts return the same taxon, call it a match. Matcher m = sSnCnPattern.matcher(inValue); if (m.matches()) { taxon = sNameIndex.get(m.group(1)); if (taxon != null && taxon != sNameIndex.get(m.group(2))) { taxon = null; } } } } } return taxon; } //-------------------------------------------------------------------------- /** Returns the taxon for the specified NCBI taxon id. @param inValue the taxon id for the taxon object to retrieve @return the taxon object corresponding to the specified id */ public static NCBITaxon getByTaxonId(int inValue) { while (sInitializing) { ThreadUtil.sleep(TimeUnit.second.getMilliseconds()); } if (null == sIdIndex) { initialize(); } NCBITaxon taxon = sIdIndex.get(inValue + ""); if (null == taxon && null == sCustomNodesFile && ! sFullFileLoaded) { initialize(); taxon = sIdIndex.get(inValue + ""); } return taxon; } //-------------------------------------------------------------------------- /** Returns an unmodifiable Collection of the common taxons (those defined as class constants). @return the small collection of frequently used taxon objects */ public static Collection getCommonSet() { return Collections.unmodifiableCollection(sCommonSet); } //-------------------------------------------------------------------------- @Override public String toString() { StringBuilder buffer = new StringBuilder(); buffer.append(mTaxonId); buffer.append(" "); buffer.append(mScientificName); if (mGenBankCommonName != null) { buffer.append(" ("); buffer.append(mGenBankCommonName); buffer.append(")"); } return buffer.toString(); } //-------------------------------------------------------------------------- public int getTaxonId() { return mTaxonId; } //-------------------------------------------------------------------------- public String getFullTaxonomy() { StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("; "); if (mParentTaxonId != 1) { NCBITaxon parentTaxon = getParentTaxon(); buffer.append(parentTaxon.getFullTaxonomy()); } if (getTaxonomyRank() != NCBITaxonNodeRank.NO_RANK) { buffer.delimitedAppend(getTaxonomyRank()); buffer.append(" "); buffer.append(getScientificName()); } return buffer.toString(); } //-------------------------------------------------------------------------- public boolean isSubtaxonOf(NCBITaxon inTaxon2) { boolean result = false; NCBITaxon currentTaxon = this; while (currentTaxon != null && currentTaxon.getTaxonId() != 1) { currentTaxon = currentTaxon.getParentTaxon(); if (currentTaxon != null && currentTaxon.equals(inTaxon2)) { result = true; break; } } return result; } //-------------------------------------------------------------------------- public NCBITaxon getFirstCommonTaxon(NCBITaxon inTaxon2) { NCBITaxon firstCommonTaxon = null; Set taxonSet = new HashSet<>(); NCBITaxon currentTaxon = this; while (currentTaxon != null) { taxonSet.add(currentTaxon); currentTaxon = currentTaxon.getParentTaxon(); } // Now walk up the 2nd taxon's branch until we find a taxon in common. currentTaxon = inTaxon2; while (currentTaxon != null) { if (taxonSet.contains(currentTaxon)) { firstCommonTaxon = currentTaxon; break; } currentTaxon = currentTaxon.getParentTaxon(); } return firstCommonTaxon; } //-------------------------------------------------------------------------- public NCBITaxon getParentTaxon() { return getByTaxonId(mParentTaxonId); } //-------------------------------------------------------------------------- public int getParentTaxonId() { return mParentTaxonId; } //-------------------------------------------------------------------------- public String getScientificName() { return mScientificName; } //-------------------------------------------------------------------------- public String getCommonName() { return mCommonName; } //-------------------------------------------------------------------------- public NCBITaxon setCommonName(String inValue) { mCommonName = inValue; sNameIndex.put(mCommonName.toLowerCase(), this); return this; } //-------------------------------------------------------------------------- public String getGenBankCommonName() { return mGenBankCommonName; } //-------------------------------------------------------------------------- public NCBITaxonNodeRank getTaxonomyRank() { return mNodeRank; } //-------------------------------------------------------------------------- public NCBIGenBankDivision getDivision() { return mDivision; } /* //-------------------------------------------------------------------------- public String getComments() { return mComments; } */ //-------------------------------------------------------------------------- @Override public boolean equals(Object inObj) { if (this == inObj) return true; if (inObj == null || getClass() != inObj.getClass()) return false; final NCBITaxon ncbiTaxon = (NCBITaxon) inObj; if (mTaxonId != ncbiTaxon.mTaxonId) return false; return true; } //-------------------------------------------------------------------------- @Override public int hashCode() { return mTaxonId; } //-------------------------------------------------------------------------- public int compareTo(NCBITaxon inObj) { int result = 0; if (inObj != null) { if (mDivision != null && mDivision.name() != null) { if (inObj.mDivision != null && inObj.mDivision.name() != null) { result = mDivision.name().compareTo(inObj.mDivision.name()); } else { result = 1; } } else { result = -1; } } else { result = 1; } if (0 == result) { if (mTaxonId > inObj.mTaxonId) { result = 1; } else if (mTaxonId < inObj.mTaxonId) { result = -1; } } return result; } //-------------------------------------------------------------------------- /** Creates a subset of the nodes file containing just the specified taxon ids (and their parent taxon ids). @param inTaxonIds the list of id to extract from the taxonomy data @param inDestFile the nodes file to which the extracted taxon data should be written */ public static void extractFromNodesFile(Set inTaxonIds, File inDestFile) { Writer fileWriter = null; try { try { fileWriter = new FileWriter(inDestFile); BufferedReader nodeReader = null; try { nodeReader = getNodesReader(); String line; while ((line = nodeReader.readLine()) != null) { String pieces[] = line.split("\\s*\\|\\s*"); if (inTaxonIds.contains(new Integer(pieces[0]))) { fileWriter.write(line); fileWriter.write(NL); } } } finally { if (nodeReader != null) nodeReader.close(); } } finally { if (fileWriter != null) fileWriter.close(); } } catch (IOException e) { throw new RuntimeException("Error parsing node file.", e); } } //-------------------------------------------------------------------------- /** Creates a subset of the names file containing just the specified taxon ids (and their parent taxon ids). @param inTaxonIds the list of id to extract from the taxonomy data @param inDestFile the names file to which the extracted taxon data should be written */ public static void extractFromNamesFile(Set inTaxonIds, File inDestFile) { Writer fileWriter = null; try { try { fileWriter = new FileWriter(inDestFile); BufferedReader nodeReader = null; try { nodeReader = getNamesReader(); String line; while ((line = nodeReader.readLine()) != null) { String pieces[] = line.split("\\s*\\|\\s*"); if (inTaxonIds.contains(new Integer(pieces[0]))) { fileWriter.write(line); fileWriter.write(NL); } } } finally { if (nodeReader != null) nodeReader.close(); } } finally { if (fileWriter != null) fileWriter.close(); } } catch (IOException e) { throw new RuntimeException("Error parsing names file.", e); } } //************************************************************************** // PROTECTED METHODS //************************************************************************** //-------------------------------------------------------------------------- // Not efficient but it is just for testing purposes. protected static NCBITaxon getRandomValue() { int index = sRandom.nextInt(sIdIndex.size()); Iterator iter = sIdIndex.values().iterator(); for (int i = 0; i < index; i++) { iter.next(); } return iter.next(); } //************************************************************************** // PRIVATE METHODS //************************************************************************** //-------------------------------------------------------------------------- protected static void clearIndexes() { sNameIndex = null; sIdIndex = null; sFullFileLoaded = false; } //-------------------------------------------------------------------------- private static synchronized void initialize() { sInitializing = true; if (sFullFileLoaded) { // Another thread must have just initialized the data maps. return; } if (sRemoteInitialization) { initializeRemote(); } else { sNameIndex = new HashMap<>(sShortFileLoaded ? 270000 : 500); sIdIndex = new HashMap<>(sShortFileLoaded ? 200000 : 500); parseNodesFile(); parseNamesFile(); } if (sShortFileLoaded) { sFullFileLoaded = true; } else { sShortFileLoaded = true; } sInitializing = false; } //-------------------------------------------------------------------------- private void setParentTaxonId(int inValue) { // It can't be its own parent. if (inValue != mTaxonId) mParentTaxonId = inValue; } //-------------------------------------------------------------------------- private void setScientificName(String inValue) { mScientificName = inValue; } //-------------------------------------------------------------------------- private void setGenBankCommonName(String inValue) { mGenBankCommonName = inValue; } //-------------------------------------------------------------------------- private void setTaxonomyRank(NCBITaxonNodeRank inValue) { mNodeRank = inValue; } //-------------------------------------------------------------------------- private void setDivision(NCBIGenBankDivision inValue) { mDivision = inValue; } /* //-------------------------------------------------------------------------- private void setComments(String inValue) { mComments = inValue; } */ //-------------------------------------------------------------------------- private static InputStream getFileStream(File inFile) throws IOException { if (!inFile.exists()) { throw new RuntimeException("'" + inFile + "' doesn't exist!"); } InputStream stream = new FileInputStream(inFile); if (inFile.getName().endsWith(".gz")) { stream = new GZIPInputStream(stream); } return stream; } //-------------------------------------------------------------------------- private static InputStream getResourceStream(String inResource) throws IOException { InputStream stream = NCBITaxon.class.getResourceAsStream(inResource); if (null == stream) { throw new RuntimeException("'" + inResource + "' couldn't be found!"); } if (inResource.endsWith(".gz")) { stream = new GZIPInputStream(stream); } return stream; } //-------------------------------------------------------------------------- private static BufferedReader getNodesReader() throws IOException { InputStream stream; if (sCustomNodesFile != null) { LOGGER.log(Level.FINE, "Initializing from the custom nodes file " + StringUtil.singleQuote(sCustomNodesFile) + " ..."); stream = getFileStream(sCustomNodesFile); } else if (sCustomNodesURL != null) { LOGGER.log(Level.FINE, "Initializing from the custom nodes url " + StringUtil.singleQuote(sCustomNodesURL) + " ..."); stream = sCustomNodesURL.openStream(); if (sCustomNodesURL.getPath().endsWith(".gz")) { stream = new GZIPInputStream(stream); } } else if (sShortFileLoaded) { // Load the full file LOGGER.log(Level.FINE, "Initializing from the full nodes file..."); stream = getResourceStream(NODES_FILE); } else { // Load the common (short) file LOGGER.log(Level.FINE, "Initializing from the common nodes file..."); stream = getResourceStream(COMMON_NODES_FILE); } return new BufferedReader(new InputStreamReader(stream), 1024 * 8); } //-------------------------------------------------------------------------- private static BufferedReader getNamesReader() throws IOException { InputStream stream; if (sCustomNamesFile != null) { stream = getFileStream(sCustomNamesFile); } else if (sCustomNamesURL != null) { stream = sCustomNamesURL.openStream(); if (sCustomNamesURL.getPath().endsWith(".gz")) { stream = new GZIPInputStream(stream); } } else if (sShortFileLoaded) { // Load the full file stream = getResourceStream(NAMES_FILE); } else { // Load the common (short) file stream = getResourceStream(COMMON_NAMES_FILE); } return new BufferedReader(new InputStreamReader(stream), 1024 * 8); } //-------------------------------------------------------------------------- private static void parseNodesFile() { try { BufferedReader nodeReader = null; try { nodeReader = getNodesReader(); innerParseNodesFile(nodeReader); } finally { if (nodeReader != null) nodeReader.close(); } } catch (IOException e) { throw new RuntimeException("Error parsing node file.", e); } } //-------------------------------------------------------------------------- private static void parseNamesFile() { try { BufferedReader namesReader = null; try { namesReader = getNamesReader(); innerParseNamesFile(namesReader); } finally { if (namesReader != null) namesReader.close(); } } catch (IOException e) { throw new RuntimeException("Error parsing node file.", e); } } //-------------------------------------------------------------------------- private static void innerParseNodesFile(BufferedReader inReader) throws IOException { int lineCount = 0; String line; while ((line = inReader.readLine()) != null) { lineCount++; // The StringTokenizer actually seems to perfom slightly better than split() here. StringTokenizer st = new StringTokenizer(line, "|"); if (st.countTokens() != 13) { throw new RuntimeException("Found " + st.countTokens() + " fields instead of 13 on line " + lineCount + " of " + NODES_FILE); } int taxonId = Integer.parseInt(st.nextToken().trim()); NCBITaxon taxon = sIdIndex.get(taxonId + ""); if (null == taxon) { taxon = new NCBITaxon(taxonId); } taxon.setParentTaxonId(Integer.parseInt(st.nextToken().trim())); String nodeRankString = st.nextToken().trim(); NCBITaxonNodeRank nodeRank = NCBITaxonNodeRank.valueOf(nodeRankString); if (null == nodeRank) { throw new RuntimeException("Unrecognized taxonomy rank: " + nodeRankString + "\nNode file line " + lineCount + ": '" + line + "'"); } taxon.setTaxonomyRank(nodeRank); st.nextToken(); // EMBL code int divisionId = Integer.parseInt(st.nextToken().trim()); NCBIGenBankDivision division = NCBIGenBankDivision.valueOf(divisionId); if (null == division) { throw new RuntimeException("Unrecognized GenBank division: " + divisionId + "\nNode file line " + lineCount + ": '" + line + "'"); } taxon.setDivision(division); } LOGGER.log(Level.FINE, sIdIndex.size() + " taxons loaded"); } //-------------------------------------------------------------------------- private static void innerParseNamesFile(BufferedReader inReader) throws IOException { int lineCount = 0; String line; while ((line = inReader.readLine()) != null) { lineCount++; String pieces[] = line.split("\\|"); if (pieces.length != 4) { System.err.println("Found " + pieces.length + " fields instead of 4 on line " + lineCount + " of " + NAMES_FILE + ": " + StringUtil.singleQuote(line)); continue; } int taxonId = Integer.parseInt(pieces[0].trim()); NCBITaxon taxon = sIdIndex.get(taxonId + ""); if (null == taxon) { // throw new RuntimeException("No taxon found for id " + taxonId); taxon = new NCBITaxon(taxonId); } String name = pieces[1].trim(); // pieces[2] is EMBL code String nameClassString = pieces[3].trim(); NCBITaxonNameClass nameClass = NCBITaxonNameClass.valueOf(nameClassString); if (null == nameClass) { throw new RuntimeException("Unrecognized name class: " + nameClassString + "\nNames file line " + lineCount + ": '" + line + "'"); } if (nameClass == NCBITaxonNameClass.SCIENTIFIC_NAME) { taxon.setScientificName(name); } else if (nameClass == NCBITaxonNameClass.COMMON_NAME) { taxon.setCommonName(name); } else if (nameClass == NCBITaxonNameClass.GENBANK_COMMON_NAME) { taxon.setGenBankCommonName(name); } // Every name should be used as a reference to the taxon. sNameIndex.put(name.toLowerCase(), taxon); } LOGGER.log(Level.FINE, sNameIndex.size() + " names loaded"); } //-------------------------------------------------------------------------- private static synchronized void initializeRemote() { try { clearIndexes(); sNameIndex = new HashMap<>(500000); sIdIndex = new HashMap<>(400000); ZipInputStream zipInputStream = new ZipInputStream(sNCBI_URL.openStream()); ZipEntry zipEntry = zipInputStream.getNextEntry(); if (null == zipEntry) { throw new RuntimeException("Problem reading zipped taxonomy data from " + sNCBI_URL + " !"); } LOGGER.log(Level.FINE, "Initializing from the remote URL " + StringUtil.singleQuote(sNCBI_URL) + " ..."); while (zipEntry != null) { String entryName = zipEntry.getName(); File newFile = new File(entryName); String directory = newFile.getParent(); LOGGER.log(Level.FINE, "Zip entry:" + StringUtil.singleQuote(entryName)); if (directory == null) { if (newFile.isDirectory()) break; } if (entryName.equals("names.dmp")) { innerParseNamesFile(new BufferedReader(new InputStreamReader(zipInputStream))); } else if (entryName.equals("nodes.dmp")) { innerParseNodesFile(new BufferedReader(new InputStreamReader(zipInputStream))); } zipInputStream.closeEntry(); zipEntry = zipInputStream.getNextEntry(); } zipInputStream.close(); } catch (IOException e) { throw new RuntimeException("Problem accessing taxonomy data from " + sNCBI_URL + " :", e); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy