All downloads are free. Search and download functionality uses the official Maven repository.

com.hfg.bio.taxonomy.ncbi.NCBITaxonomyDataSourceImpl Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.taxonomy.ncbi;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.util.BooleanUtil;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 Base class for implementing an NCBI taxonomy data source.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public abstract class NCBITaxonomyDataSourceImpl implements NCBITaxonomyDataSource { private static final Logger LOGGER = Logger.getLogger(NCBIRemoteTaxonomyDataSource.class.getPackage().getName()); protected static final Pattern sSnCnPattern = Pattern.compile("^(.*?)\\s+\\((.*?)\\)$"); // The name map uses an Object for the value since it could be a single NCBITaxon // or it could be a Set of multiple NCBITaxons. private Map mNameMap; private Map mIdMap; protected abstract void initialize(); //----------------------------------------------------------------------- @Override public NCBITaxon getByTaxonId(int inValue) { if (null == mIdMap) { initialize(); } return mIdMap.get(inValue); } //----------------------------------------------------------------------- @Override public Set getByName(String inValue) { if (null == mNameMap) { initialize(); } Object value = mNameMap.get(inValue); if (null == value) { // Didn't find it? 
Some sources have the scientific name // followed by the common name in parenthesis. ex: 'Homo sapiens (human)' // If both parts return the same taxon, call it a match. Matcher m = sSnCnPattern.matcher(inValue); if (m.matches()) { value = mNameMap.get(m.group(1)); if (value != null && value != mNameMap.get(m.group(2))) { value = null; } } } Set values; if (value instanceof NCBITaxon) { values = new HashSet<>(1); values.add((NCBITaxon) value); } else { values = (Set) value; } return values; } //-------------------------------------------------------------------------- protected void innerParseNodesFile(BufferedReader inReader) throws IOException { if (null == mIdMap) { mIdMap = new HashMap<>(); } int lineCount = 0; String line; while ((line = inReader.readLine()) != null) { lineCount++; // The StringTokenizer actually seems to perfom slightly better than split() here. StringTokenizer st = new StringTokenizer(line, "|"); if (st.countTokens() != 13) { throw new RuntimeException("Found " + st.countTokens() + " fields instead of 13 on line " + lineCount + ": " + StringUtil.singleQuote(line)); } try { int taxonId = Integer.parseInt(st.nextToken().trim()); NCBITaxon taxon = mIdMap.get(taxonId); if (null == taxon) { taxon = new NCBITaxon(taxonId); mIdMap.put(taxonId, taxon); } taxon.setParentTaxonId(Integer.parseInt(st.nextToken().trim())); String nodeRankString = st.nextToken().trim(); NCBITaxonNodeRank nodeRank = NCBITaxonNodeRank.valueOf(nodeRankString); if (null == nodeRank) { throw new RuntimeException("Unrecognized taxonomy rank: " + nodeRankString + "\nNode file line " + lineCount + ": '" + line + "'"); } taxon.setTaxonomyRank(nodeRank); taxon.setEMBL_Code(st.nextToken().trim()); // EMBL code int divisionId = Integer.parseInt(st.nextToken().trim()); NCBIGenBankDivision division = NCBIGenBankDivision.valueOf(divisionId); if (null == division) { throw new RuntimeException("Unrecognized GenBank division: " + divisionId + "\nNode file line " + lineCount + ": '" + line + 
"'"); } taxon.setDivision(division); taxon.setInheritedDivisionFlag(BooleanUtil.valueOf(st.nextToken().trim())); String geneticCodeString = st.nextToken().trim(); if (StringUtil.isSet(geneticCodeString)) { int geneticCodeId = Integer.parseInt(geneticCodeString); taxon.setGeneticCode(NCBIGeneticCode.getById(geneticCodeId)); } taxon.setInheritedGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim())); String mitoGeneticCodeString = st.nextToken().trim(); if (StringUtil.isSet(mitoGeneticCodeString)) { int geneticCodeId = Integer.parseInt(mitoGeneticCodeString); taxon.setMitochondrialGeneticCode(NCBIGeneticCode.getById(geneticCodeId)); } taxon.setInheritedMitochondrialGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim())); taxon.setGenBankHiddenFlag(BooleanUtil.valueOf(st.nextToken().trim())); taxon.setHiddenSubtreeRootFlag(BooleanUtil.valueOf(st.nextToken().trim())); taxon.setComments(st.nextToken().trim()); } catch (Exception e) { throw new RuntimeException("Error parsing nodes line " + lineCount + ": " + StringUtil.singleQuote(line), e); } } LOGGER.log(Level.FINE, mIdMap.size() + " taxons loaded"); } //-------------------------------------------------------------------------- protected void innerParseNamesFile(BufferedReader inReader) throws IOException { if (null == mNameMap) { mNameMap = new HashMap<>(); } if (null == mIdMap) { mIdMap = new HashMap<>(); } int lineCount = 0; String line; while ((line = inReader.readLine()) != null) { lineCount++; String pieces[] = line.split("\\|"); if (pieces.length != 4) { System.err.println("Found " + pieces.length + " fields instead of 4 on line " + lineCount + ": " + StringUtil.singleQuote(line)); continue; } try { int taxonId = Integer.parseInt(pieces[0].trim()); NCBITaxon taxon = mIdMap.get(taxonId); if (null == taxon) { // throw new RuntimeException("No taxon found for id " + taxonId); taxon = new NCBITaxon(taxonId); mIdMap.put(taxonId, taxon); } String name = pieces[1].trim(); // pieces[2] is EMBL code String 
nameClassString = pieces[3].trim(); NCBITaxonNameClass nameClass = NCBITaxonNameClass.valueOf(nameClassString); if (null == nameClass) { throw new RuntimeException("Unrecognized name class: " + nameClassString + "\nNames file line " + lineCount + ": '" + line + "'"); } if (nameClass == NCBITaxonNameClass.SCIENTIFIC_NAME) { taxon.setScientificName(name); addToNameMap(name, taxon); } else if (nameClass == NCBITaxonNameClass.COMMON_NAME) { taxon.setCommonName(name); addToNameMap(name, taxon); } else if (nameClass == NCBITaxonNameClass.GENBANK_COMMON_NAME) { taxon.setGenBankCommonName(name); addToNameMap(name, taxon); } else if (nameClass == NCBITaxonNameClass.SYNONYM) { taxon.addSynonym(name); addToNameMap(name, taxon); } // Every name should be used as a reference to the taxon. // mNameMap.put(name.toLowerCase(), taxon); } catch (Exception e) { throw new RuntimeException("Error parsing line " + lineCount + ": " + StringUtil.singleQuote(line), e); } } LOGGER.log(Level.FINE, mNameMap.size() + " names loaded"); } //-------------------------------------------------------------------------- protected void addToNameMap(String inName, NCBITaxon inTaxon) { String key = inName.toLowerCase(); Object existingValue = mNameMap.get(key); if (existingValue != null) { Set set; if (existingValue instanceof NCBITaxon) { set = new HashSet<>(2); set.add((NCBITaxon) existingValue); mNameMap.put(key, set); } else { set = (Set) existingValue; } set.add(inTaxon); } else { mNameMap.put(key, inTaxon); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy