com.hfg.bio.taxonomy.ncbi.NCBITaxonomyDataSourceImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.taxonomy.ncbi;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.util.BooleanUtil;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
Base class for implementing an NCBI taxonomy data source.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public abstract class NCBITaxonomyDataSourceImpl implements NCBITaxonomyDataSource
{
   // The logger is named after this package. The class literal used is this class itself
   // (same package as before) so the base class does not reference one particular subclass.
   private static final Logger LOGGER = Logger.getLogger(NCBITaxonomyDataSourceImpl.class.getPackage().getName());

   /**
    Matches text of the form 'Scientific name (common name)'. Group 1 is the text before the
    parentheses and group 2 is the text inside them.
    */
   protected static final Pattern sSnCnPattern = Pattern.compile("^(.*?)\\s+\\((.*?)\\)$");

   // Keyed by the normalized (lower-cased) name; see toNameKey().
   // The name map uses an Object for the value since it could be a single NCBITaxon
   // or it could be a Set of multiple NCBITaxons.
   private Map<String, Object> mNameMap;

   // Keyed by taxon id.
   private Map<Integer, NCBITaxon> mIdMap;

   /**
    Loads the taxonomy data. Invoked lazily by getByTaxonId() / getByName() the first time the
    map they need is still unset. Presumably implementations populate the maps through
    innerParseNodesFile() / innerParseNamesFile() - confirm against the subclasses.
    */
   protected abstract void initialize();

   //-----------------------------------------------------------------------
   /**
    Looks up a taxon by its id, triggering initialize() first if the id map is unset.
    @param inValue the taxon id
    @return the matching taxon or null if the id is unknown
    */
   @Override
   public NCBITaxon getByTaxonId(int inValue)
   {
      if (null == mIdMap)
      {
         initialize();
      }

      // initialize() is subclass code; don't fail with a NullPointerException if it left the map unset.
      return (mIdMap != null ? mIdMap.get(inValue) : null);
   }

   //-----------------------------------------------------------------------
   /**
    Looks up taxons by name, triggering initialize() first if the name map is unset.
    The lookup is case-insensitive because names are stored under a lower-cased key.
    If the name is not found directly and has the form 'Scientific name (common name)',
    both parts are looked up and accepted only if they resolve to the same map value.
    @param inValue the name to look up
    @return the matching taxons or null if there is no match (or inValue is null).
            The returned Set may be the internal one, so callers should treat it as read-only.
    */
   @Override
   public Set<NCBITaxon> getByName(String inValue)
   {
      if (null == mNameMap)
      {
         initialize();
      }

      if (null == inValue
          || null == mNameMap)
      {
         return null;
      }

      // Keys are stored normalized by addToNameMap(), so the lookup must be normalized the same way.
      Object value = mNameMap.get(toNameKey(inValue));
      if (null == value)
      {
         // Didn't find it? Some sources have the scientific name
         // followed by the common name in parentheses. ex: 'Homo sapiens (human)'
         // If both parts return the same taxon, call it a match.
         Matcher m = sSnCnPattern.matcher(inValue);
         if (m.matches())
         {
            value = mNameMap.get(toNameKey(m.group(1)));
            // Identity comparison: both parts must resolve to the very same stored value.
            if (value != null
                && value != mNameMap.get(toNameKey(m.group(2))))
            {
               value = null;
            }
         }
      }

      Set<NCBITaxon> values;
      if (value instanceof NCBITaxon)
      {
         values = new HashSet<>(1);
         values.add((NCBITaxon) value);
      }
      else
      {
         // Either a Set of taxons or null (no match).
         values = asTaxonSet(value);
      }

      return values;
   }

   //--------------------------------------------------------------------------
   /**
    Parses pipe-delimited taxonomy node lines (13 fields per line) and stores / updates
    a taxon in the id map for each line.
    @param inReader source of the node lines. It is read to the end but not closed here.
    @throws IOException if reading from inReader fails
    @throws RuntimeException if a line does not have 13 fields or a field cannot be parsed
    */
   protected void innerParseNodesFile(BufferedReader inReader)
      throws IOException
   {
      if (null == mIdMap)
      {
         mIdMap = new HashMap<>();
      }

      int lineCount = 0;
      String line;
      while ((line = inReader.readLine()) != null)
      {
         lineCount++;

         // The StringTokenizer actually seems to perform slightly better than split() here.
         StringTokenizer st = new StringTokenizer(line, "|");
         if (st.countTokens() != 13)
         {
            throw new RuntimeException("Found " + st.countTokens()
                                       + " fields instead of 13 on line "
                                       + lineCount + ": " + StringUtil.singleQuote(line));
         }

         try
         {
            int taxonId = Integer.parseInt(st.nextToken().trim());

            // The taxon may already exist if the names were parsed before the nodes.
            NCBITaxon taxon = mIdMap.get(taxonId);
            if (null == taxon)
            {
               taxon = new NCBITaxon(taxonId);
               mIdMap.put(taxonId, taxon);
            }

            taxon.setParentTaxonId(Integer.parseInt(st.nextToken().trim()));

            String nodeRankString = st.nextToken().trim();
            NCBITaxonNodeRank nodeRank = NCBITaxonNodeRank.valueOf(nodeRankString);
            if (null == nodeRank)
            {
               throw new RuntimeException("Unrecognized taxonomy rank: " + nodeRankString
                                          + "\nNode file line " + lineCount + ": '" + line + "'");
            }
            taxon.setTaxonomyRank(nodeRank);

            taxon.setEMBL_Code(st.nextToken().trim()); // EMBL code

            int divisionId = Integer.parseInt(st.nextToken().trim());
            NCBIGenBankDivision division = NCBIGenBankDivision.valueOf(divisionId);
            if (null == division)
            {
               throw new RuntimeException("Unrecognized GenBank division: " + divisionId
                                          + "\nNode file line " + lineCount + ": '" + line + "'");
            }
            taxon.setDivision(division);

            taxon.setInheritedDivisionFlag(BooleanUtil.valueOf(st.nextToken().trim()));

            // The genetic code fields are optional; only set them when a value is present.
            String geneticCodeString = st.nextToken().trim();
            if (StringUtil.isSet(geneticCodeString))
            {
               int geneticCodeId = Integer.parseInt(geneticCodeString);
               taxon.setGeneticCode(NCBIGeneticCode.getById(geneticCodeId));
            }

            taxon.setInheritedGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim()));

            String mitoGeneticCodeString = st.nextToken().trim();
            if (StringUtil.isSet(mitoGeneticCodeString))
            {
               int geneticCodeId = Integer.parseInt(mitoGeneticCodeString);
               taxon.setMitochondrialGeneticCode(NCBIGeneticCode.getById(geneticCodeId));
            }

            taxon.setInheritedMitochondrialGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim()));

            taxon.setGenBankHiddenFlag(BooleanUtil.valueOf(st.nextToken().trim()));

            taxon.setHiddenSubtreeRootFlag(BooleanUtil.valueOf(st.nextToken().trim()));

            taxon.setComments(st.nextToken().trim());
         }
         catch (Exception e)
         {
            // Add the line number and content for context; the original failure is kept as the cause.
            throw new RuntimeException("Error parsing nodes line " + lineCount
                                       + ": " + StringUtil.singleQuote(line), e);
         }
      }

      LOGGER.log(Level.FINE, mIdMap.size() + " taxons loaded");
   }

   //--------------------------------------------------------------------------
   /**
    Parses pipe-delimited taxonomy name lines (4 fields per line). Scientific names, common names,
    GenBank common names, and synonyms are set on the taxon and registered in the name map.
    Other name classes are ignored. Lines without exactly 4 fields are logged and skipped.
    @param inReader source of the name lines. It is read to the end but not closed here.
    @throws IOException if reading from inReader fails
    @throws RuntimeException if a field of a 4-field line cannot be parsed
    */
   protected void innerParseNamesFile(BufferedReader inReader)
      throws IOException
   {
      if (null == mNameMap)
      {
         mNameMap = new HashMap<>();
      }

      if (null == mIdMap)
      {
         mIdMap = new HashMap<>();
      }

      int lineCount = 0;
      String line;
      while ((line = inReader.readLine()) != null)
      {
         lineCount++;

         String[] pieces = line.split("\\|");
         if (pieces.length != 4)
         {
            // Best effort: report the malformed line and keep going.
            LOGGER.log(Level.WARNING, "Found " + pieces.length + " fields instead of 4 on line "
                                      + lineCount + ": " + StringUtil.singleQuote(line));
            continue;
         }

         try
         {
            int taxonId = Integer.parseInt(pieces[0].trim());

            // A name for an id that hasn't been seen yet is not treated as an error;
            // the taxon is created here so that names can be parsed before nodes.
            NCBITaxon taxon = mIdMap.get(taxonId);
            if (null == taxon)
            {
               taxon = new NCBITaxon(taxonId);
               mIdMap.put(taxonId, taxon);
            }

            String name = pieces[1].trim();

            // pieces[2] is the 'unique name' field of names.dmp (unused here)

            String nameClassString = pieces[3].trim();
            NCBITaxonNameClass nameClass = NCBITaxonNameClass.valueOf(nameClassString);
            if (null == nameClass)
            {
               throw new RuntimeException("Unrecognized name class: " + nameClassString
                                          + "\nNames file line " + lineCount + ": '" + line + "'");
            }

            // Only these four name classes are recorded and made searchable.
            if (nameClass == NCBITaxonNameClass.SCIENTIFIC_NAME)
            {
               taxon.setScientificName(name);
               addToNameMap(name, taxon);
            }
            else if (nameClass == NCBITaxonNameClass.COMMON_NAME)
            {
               taxon.setCommonName(name);
               addToNameMap(name, taxon);
            }
            else if (nameClass == NCBITaxonNameClass.GENBANK_COMMON_NAME)
            {
               taxon.setGenBankCommonName(name);
               addToNameMap(name, taxon);
            }
            else if (nameClass == NCBITaxonNameClass.SYNONYM)
            {
               taxon.addSynonym(name);
               addToNameMap(name, taxon);
            }
         }
         catch (Exception e)
         {
            // Add the line number and content for context; the original failure is kept as the cause.
            throw new RuntimeException("Error parsing line " + lineCount
                                       + ": " + StringUtil.singleQuote(line), e);
         }
      }

      LOGGER.log(Level.FINE, mNameMap.size() + " names loaded");
   }

   //--------------------------------------------------------------------------
   /**
    Registers a taxon under the specified name. The first taxon for a name is stored directly;
    when a different taxon is added under the same name the value is promoted to a Set.
    @param inName  the name to register (stored under its lower-cased form)
    @param inTaxon the taxon to associate with the name
    */
   protected void addToNameMap(String inName, NCBITaxon inTaxon)
   {
      if (null == mNameMap)
      {
         mNameMap = new HashMap<>();
      }

      String key = toNameKey(inName);

      Object existingValue = mNameMap.get(key);
      if (null == existingValue)
      {
         mNameMap.put(key, inTaxon);
      }
      else if (existingValue == inTaxon)
      {
         // The same taxon is already registered under this name (ex: the same text appears
         // under two name classes). Nothing to do - don't promote to a single-element Set.
      }
      else
      {
         Set<NCBITaxon> set;
         if (existingValue instanceof NCBITaxon)
         {
            // A second taxon for this name: promote the single value to a Set.
            set = new HashSet<>(2);
            set.add((NCBITaxon) existingValue);
            mNameMap.put(key, set);
         }
         else
         {
            set = asTaxonSet(existingValue);
         }

         set.add(inTaxon);
      }
   }

   //--------------------------------------------------------------------------
   // Single place where name map keys are normalized so that insertion and lookup always agree.
   private static String toNameKey(String inName)
   {
      return inName.toLowerCase();
   }

   //--------------------------------------------------------------------------
   // Name map values that aren't a single NCBITaxon are only ever Sets created by addToNameMap(),
   // so the unchecked cast is safe. A null value is passed through as null.
   @SuppressWarnings("unchecked")
   private static Set<NCBITaxon> asTaxonSet(Object inValue)
   {
      return (Set<NCBITaxon>) inValue;
   }
}