// com.hfg.bio.seq.genomic.assembly.NCBIGenomicAssemblyInfo Maven / Gradle / Ivy
// Show all versions of com_hfg Show documentation
package com.hfg.bio.seq.genomic.assembly;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
import com.hfg.util.BooleanUtil;
import com.hfg.util.CompareUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.DataRow;
import com.hfg.util.collection.DataTable;
import com.hfg.util.io.TSV;
import com.hfg.xml.HfgXML;
import com.hfg.xml.XMLName;
import com.hfg.xml.XMLTag;
//------------------------------------------------------------------------------
/**
* Information about an NCBI genomic assembly.
* See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the fields.
*
* @author J. Alex Taylor, hairyfatguy.com
*
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class NCBIGenomicAssemblyInfo implements Comparable
{
// Tag names
// NCBI_GENOMIC_ASSEMBLY_INFO names the root tag of the XML representation.
// The local name of each of the other names is used both as the XML subtag name
// (see toXMLTag()) and as the column name when reading values from a DataRow.
private static final XMLName NCBI_GENOMIC_ASSEMBLY_INFO = new XMLName("NCBIGenomicAssemblyInfo", HfgXML.HFG_NAMESPACE);
private static final XMLName ASSEMBLY_ACCESSION = new XMLName("assembly_accession", HfgXML.HFG_NAMESPACE);
private static final XMLName BIOPROJECT_ACCESSION = new XMLName("bioproject", HfgXML.HFG_NAMESPACE);
private static final XMLName BIOSAMPLE_ACCESSION = new XMLName("biosample", HfgXML.HFG_NAMESPACE);
private static final XMLName WGS_ACCESSION = new XMLName("wgs_master", HfgXML.HFG_NAMESPACE);
private static final XMLName REFSEQ_CATEGORY = new XMLName("refseq_category", HfgXML.HFG_NAMESPACE);
private static final XMLName ORGANISM_TAXON = new XMLName("taxid", HfgXML.HFG_NAMESPACE);
private static final XMLName SPECIES_TAXON = new XMLName("species_taxid", HfgXML.HFG_NAMESPACE);
private static final XMLName INFRASPECIFIC_NAME = new XMLName("infraspecific_name", HfgXML.HFG_NAMESPACE);
private static final XMLName ISOLATE = new XMLName("isolate", HfgXML.HFG_NAMESPACE);
private static final XMLName ASSEMBLY_VERSION_STATUS = new XMLName("version_status", HfgXML.HFG_NAMESPACE);
private static final XMLName ASSEMBLY_LEVEL = new XMLName("assembly_level", HfgXML.HFG_NAMESPACE);
private static final XMLName ASSEMBLY_RELEASE_TYPE = new XMLName("release_type", HfgXML.HFG_NAMESPACE);
private static final XMLName GENOMIC_REPRESENTATION = new XMLName("genome_rep", HfgXML.HFG_NAMESPACE);
private static final XMLName RELEASE_DATE = new XMLName("seq_rel_date", HfgXML.HFG_NAMESPACE);
private static final XMLName ASSEMBLY_NAME = new XMLName("asm_name", HfgXML.HFG_NAMESPACE);
private static final XMLName SUBMITTER = new XMLName("asm_submitter", HfgXML.HFG_NAMESPACE);
private static final XMLName GENBANK_REFSEQ_PAIRED_ASSEMBLY = new XMLName("gbrs_paired_asm", HfgXML.HFG_NAMESPACE);
private static final XMLName GENOMIC_PAIRED_ASSEMBLY_COMPARISON = new XMLName("paired_asm_comp", HfgXML.HFG_NAMESPACE);
private static final XMLName FTP_PATH = new XMLName("ftp_path", HfgXML.HFG_NAMESPACE);
private static final XMLName EXCLUDED_FROM_REFSEQ = new XMLName("excluded_from_refseq", HfgXML.HFG_NAMESPACE);
private static final XMLName GENOMIC_RELATION_TO_TYPE_MATERIAL = new XMLName("relation_to_type_material", HfgXML.HFG_NAMESPACE);
private static final XMLName ASM_NOT_LIVE_DATE = new XMLName("asm_not_live_date", HfgXML.HFG_NAMESPACE);
private static final XMLName ASSEMBLY_TYPE = new XMLName("assembly_type", HfgXML.HFG_NAMESPACE);
private static final XMLName GROUP = new XMLName("group", HfgXML.HFG_NAMESPACE);
private static final XMLName GENOME_SIZE = new XMLName("genome_size", HfgXML.HFG_NAMESPACE);
private static final XMLName GENOME_SIZE_UNGAPPED = new XMLName("genome_size_ungapped", HfgXML.HFG_NAMESPACE);
private static final XMLName GC_PERCENT = new XMLName("gc_percent", HfgXML.HFG_NAMESPACE);
private static final XMLName REPLICON_COUNT = new XMLName("replicon_count", HfgXML.HFG_NAMESPACE);
private static final XMLName SCAFFOLD_COUNT = new XMLName("scaffold_count", HfgXML.HFG_NAMESPACE);
private static final XMLName CONTIG_COUNT = new XMLName("contig_count", HfgXML.HFG_NAMESPACE);
private static final XMLName ANNOTATION_PROVIDER = new XMLName("annotation_provider", HfgXML.HFG_NAMESPACE);
private static final XMLName ANNOTATION_NAME = new XMLName("annotation_name", HfgXML.HFG_NAMESPACE);
private static final XMLName ANNOTATION_DATE = new XMLName("annotation_date", HfgXML.HFG_NAMESPACE);
private static final XMLName TOTAL_GENE_COUNT = new XMLName("total_gene_count", HfgXML.HFG_NAMESPACE);
private static final XMLName PROTEIN_CODING_GENE_COUNT = new XMLName("protein_coding_gene_count", HfgXML.HFG_NAMESPACE);
private static final XMLName NON_CODING_GENE_COUNT = new XMLName("non_coding_gene_count", HfgXML.HFG_NAMESPACE);
private static final XMLName PUBMED_ID = new XMLName("pubmed_id", HfgXML.HFG_NAMESPACE);
// The assembly's values. All are object types and remain null until a value is set.
private String mAssemblyAccession;
private String mBioprojectAccession;
private String mBiosampleAccession;
private String mWGSAccession;
private GenomicRefSeqCategory mRefseqCategory;
private NCBITaxon mOrganismNCBITaxon;
private NCBITaxon mSpeciesNCBITaxon; // The species taxid will differ from the
// organism taxid (column 6) only when the
// organism was reported at a sub-species or strain level.
private String mInfraspecificName;
private String mIsolate;
private GenomicAssemblyVersionStatus mAssemblyVersionStatus;
private GenomicAssemblyLevel mAssemblyLevel;
private GenomicAssemblyReleaseType mAssemblyReleaseType;
private GenomicRepresentation mGenomicRepresentation;
private Date mReleaseDate;
private String mAssemblyName;
private String mSubmitter;
private String mGenBankRefSeqPairedAssembly;
private GenomicPairedAssemblyComparison mGenomicPairedAssemblyComparison;
private String mFTP_Path;
private Boolean mExcludedFromRefSeq;
private GenomicRelationToTypeMaterial mGenomicRelationToTypeMaterial;
private Date mAssemblyNotLiveDate;
private String mAssemblyType;
private String mTaxonomyGroup;
private Long mGenomeSize;
private Long mUngappedGenomeSize;
private Float mGCPercent;
private Integer mRepliconCount;
private Integer mScaffoldCount;
private Integer mContigCount;
private String mAnnotationProvider;
private String mAnnotationName;
private Date mAnnotationDate;
private Integer mTotalGeneCount;
private Integer mProteinCodingGeneCount;
private Integer mNonCodingGeneCount;
private String mPubMedId;
// The DataRow given at construction or, for the other constructors, a new empty DataRow.
// Backs getField() and toTSV().
private DataRow mDataRow;
// Date format used for dates in the XML representation. Ex: 2019-07-30
// The year has to be specified with lowercase 'yyyy' (calendar year). Uppercase 'YYYY'
// is the week-based year, which gives the wrong year for dates in a week that spans
// the turn of the year (ex: 2019-12-30 would be formatted as 2020-12-30).
private SimpleDateFormat mDateFormat = new SimpleDateFormat("yyyy-MM-dd");
// Date format used for dates read from a DataRow. Ex: 2019/07/30
// NOTE(review): SimpleDateFormat is not thread-safe and this instance is static
// - confirm that DataRow construction is never done concurrently.
private static SimpleDateFormat YYYYMMDD_FORMAT = new SimpleDateFormat("yyyy/MM/dd");
//##########################################################################
// CONSTRUCTORS
//##########################################################################
//---------------------------------------------------------------------------
/**
 * Constructs an empty assembly info object backed by a new, empty DataRow.
 */
public NCBIGenomicAssemblyInfo()
{
   this.mDataRow = new DataRow();
}
//---------------------------------------------------------------------------
/**
 * Constructs the assembly info from a DataRow. Values are looked up in the row
 * by column name (ex: "assembly_accession") and the row itself is retained
 * for use by getField() and toTSV().
 * <p>
 * Values which are not set are skipped. For date and numeric columns
 * a value of "na" is also treated as not set.
 * </p>
 * @param inDataRow the row holding the values for one assembly
 * @throws ParseException if a date value cannot be parsed (expected format: yyyy/MM/dd)
 */
public NCBIGenomicAssemblyInfo(DataRow inDataRow)
   throws ParseException
{
   mDataRow = inDataRow;

   setAssemblyAccession(getColumnString(inDataRow, ASSEMBLY_ACCESSION));
   setBioprojectAccession(getColumnString(inDataRow, BIOPROJECT_ACCESSION));
   setBiosampleAccession(getColumnString(inDataRow, BIOSAMPLE_ACCESSION));
   setWGSAccession(getColumnString(inDataRow, WGS_ACCESSION));

   String stringValue = getColumnString(inDataRow, REFSEQ_CATEGORY);
   if (StringUtil.isSet(stringValue))
   {
      setRefSeqCategory(GenomicRefSeqCategory.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, ORGANISM_TAXON);
   if (StringUtil.isSet(stringValue))
   {
      setOrganismTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(stringValue)));
   }

   // The species taxon has its own column ("species_taxid"). It must not be read
   // from the organism's "taxid" column or sub-species / strain level assemblies
   // would be assigned the wrong species taxon.
   stringValue = getColumnString(inDataRow, SPECIES_TAXON);
   if (StringUtil.isSet(stringValue))
   {
      setSpeciesTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(stringValue)));
   }

   // Skipping field 7 (the species name) since is part of the taxon info

   setInfraspecificName(getColumnString(inDataRow, INFRASPECIFIC_NAME));
   setIsolate(getColumnString(inDataRow, ISOLATE));

   stringValue = getColumnString(inDataRow, ASSEMBLY_VERSION_STATUS);
   if (StringUtil.isSet(stringValue))
   {
      setAssemblyVersionStatus(GenomicAssemblyVersionStatus.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, ASSEMBLY_LEVEL);
   if (StringUtil.isSet(stringValue))
   {
      setAssemblyLevel(GenomicAssemblyLevel.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, ASSEMBLY_RELEASE_TYPE);
   if (StringUtil.isSet(stringValue))
   {
      setAssemblyReleaseType(GenomicAssemblyReleaseType.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, GENOMIC_REPRESENTATION);
   if (StringUtil.isSet(stringValue))
   {
      setGenomicRepresentation(GenomicRepresentation.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, RELEASE_DATE);
   if (isParsableValue(stringValue))
   {
      setReleaseDate(YYYYMMDD_FORMAT.parse(stringValue)); // Ex: 2019/07/30
   }

   setAssemblyName(getColumnString(inDataRow, ASSEMBLY_NAME));
   setSubmitter(getColumnString(inDataRow, SUBMITTER));
   setGenBankRefSeqPairedAssembly(getColumnString(inDataRow, GENBANK_REFSEQ_PAIRED_ASSEMBLY));

   stringValue = getColumnString(inDataRow, GENOMIC_PAIRED_ASSEMBLY_COMPARISON);
   if (StringUtil.isSet(stringValue))
   {
      setPairedAssemblyComparison(GenomicPairedAssemblyComparison.valueOf(stringValue));
   }

   setFTP_Path(getColumnString(inDataRow, FTP_PATH));
   setExcludedFromRefSeq(BooleanUtil.valueOf(getColumnString(inDataRow, EXCLUDED_FROM_REFSEQ)));

   stringValue = getColumnString(inDataRow, GENOMIC_RELATION_TO_TYPE_MATERIAL);
   if (StringUtil.isSet(stringValue))
   {
      setRelationToTypeMaterial(GenomicRelationToTypeMaterial.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, ASM_NOT_LIVE_DATE);
   if (isParsableValue(stringValue))
   {
      setAssemblyNotLiveDate(YYYYMMDD_FORMAT.parse(stringValue)); // Ex: 2019/07/30
   }

   setAssemblyType(getColumnString(inDataRow, ASSEMBLY_TYPE));
   setTaxonomyGroup(getColumnString(inDataRow, GROUP));

   stringValue = getColumnString(inDataRow, GENOME_SIZE);
   if (isParsableValue(stringValue))
   {
      setGenomeSize(Long.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, GENOME_SIZE_UNGAPPED);
   if (isParsableValue(stringValue))
   {
      setUngappedGenomeSize(Long.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, GC_PERCENT);
   if (isParsableValue(stringValue))
   {
      setGC_Percent(Float.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, REPLICON_COUNT);
   if (isParsableValue(stringValue))
   {
      setRepliconCount(Integer.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, SCAFFOLD_COUNT);
   if (isParsableValue(stringValue))
   {
      setScaffoldCount(Integer.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, CONTIG_COUNT);
   if (isParsableValue(stringValue))
   {
      setContigCount(Integer.valueOf(stringValue));
   }

   setAnnotationProvider(getColumnString(inDataRow, ANNOTATION_PROVIDER));
   setAnnotationName(getColumnString(inDataRow, ANNOTATION_NAME));

   stringValue = getColumnString(inDataRow, ANNOTATION_DATE);
   if (isParsableValue(stringValue))
   {
      setAnnotationDate(YYYYMMDD_FORMAT.parse(stringValue)); // Ex: 2019/07/30
   }

   stringValue = getColumnString(inDataRow, TOTAL_GENE_COUNT);
   if (isParsableValue(stringValue))
   {
      setTotalGeneCount(Integer.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, PROTEIN_CODING_GENE_COUNT);
   if (isParsableValue(stringValue))
   {
      setProteinCodingGeneCount(Integer.valueOf(stringValue));
   }

   stringValue = getColumnString(inDataRow, NON_CODING_GENE_COUNT);
   if (isParsableValue(stringValue))
   {
      setNonCodingGeneCount(Integer.valueOf(stringValue));
   }

   setPubMedId(getColumnString(inDataRow, PUBMED_ID));
}

//---------------------------------------------------------------------------
// Returns the String value from the row's column named with the local name of the specified XMLName.
private static String getColumnString(DataRow inDataRow, XMLName inColumnName)
{
   return inDataRow.getString(inDataRow.getDataColumn(inColumnName.getLocalName()));
}

//---------------------------------------------------------------------------
// Returns true if the value is set and is not the "na" placeholder (which cannot be parsed as a date or number).
private static boolean isParsableValue(String inValue)
{
   return StringUtil.isSet(inValue)
          && ! inValue.equalsIgnoreCase("na");
}
//---------------------------------------------------------------------------
/**
 * Constructs the assembly info from its XML representation (see toXMLTag()).
 * The object is backed by a new, empty DataRow.
 * @param inXMLTag the NCBIGenomicAssemblyInfo tag
 * @throws ParseException if a date value cannot be parsed (expected format: yyyy-MM-dd)
 */
public NCBIGenomicAssemblyInfo(XMLTag inXMLTag)
   throws ParseException
{
   mDataRow = new DataRow();

   inXMLTag.verifyTagName(NCBI_GENOMIC_ASSEMBLY_INFO);

   setAssemblyAccession(getRequiredContent(inXMLTag, ASSEMBLY_ACCESSION));
   setBioprojectAccession(getRequiredContent(inXMLTag, BIOPROJECT_ACCESSION));
   // The biosample content has to go to the biosample setter. Passing it to the
   // bioproject setter would overwrite the bioproject and leave the biosample unset.
   setBiosampleAccession(getRequiredContent(inXMLTag, BIOSAMPLE_ACCESSION));
   setWGSAccession(getRequiredContent(inXMLTag, WGS_ACCESSION));
   setRefSeqCategory(GenomicRefSeqCategory.valueOf(getRequiredContent(inXMLTag, REFSEQ_CATEGORY)));
   setOrganismTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(getRequiredContent(inXMLTag, ORGANISM_TAXON))));
   setSpeciesTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(getRequiredContent(inXMLTag, SPECIES_TAXON))));

   String content = getOptionalContent(inXMLTag, INFRASPECIFIC_NAME);
   if (content != null)
   {
      setInfraspecificName(content);
   }

   content = getOptionalContent(inXMLTag, ISOLATE);
   if (content != null)
   {
      setIsolate(content);
   }

   setAssemblyVersionStatus(GenomicAssemblyVersionStatus.valueOf(getRequiredContent(inXMLTag, ASSEMBLY_VERSION_STATUS)));
   setAssemblyLevel(GenomicAssemblyLevel.valueOf(getRequiredContent(inXMLTag, ASSEMBLY_LEVEL)));
   setAssemblyReleaseType(GenomicAssemblyReleaseType.valueOf(getRequiredContent(inXMLTag, ASSEMBLY_RELEASE_TYPE)));
   setGenomicRepresentation(GenomicRepresentation.valueOf(getRequiredContent(inXMLTag, GENOMIC_REPRESENTATION)));

   // A bad release date is reported via the declared ParseException,
   // the same way as for the other dates parsed below.
   setReleaseDate(mDateFormat.parse(getRequiredContent(inXMLTag, RELEASE_DATE)));

   setAssemblyName(getRequiredContent(inXMLTag, ASSEMBLY_NAME));
   setSubmitter(getRequiredContent(inXMLTag, SUBMITTER));

   content = getOptionalContent(inXMLTag, GENBANK_REFSEQ_PAIRED_ASSEMBLY);
   if (content != null)
   {
      setGenBankRefSeqPairedAssembly(content);
   }

   content = getOptionalContent(inXMLTag, GENOMIC_PAIRED_ASSEMBLY_COMPARISON);
   if (content != null)
   {
      setPairedAssemblyComparison(GenomicPairedAssemblyComparison.valueOf(content));
   }

   setFTP_Path(getRequiredContent(inXMLTag, FTP_PATH));
   setExcludedFromRefSeq(BooleanUtil.valueOf(getRequiredContent(inXMLTag, EXCLUDED_FROM_REFSEQ)));

   content = getOptionalContent(inXMLTag, GENOMIC_RELATION_TO_TYPE_MATERIAL);
   if (content != null)
   {
      setRelationToTypeMaterial(GenomicRelationToTypeMaterial.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, ASM_NOT_LIVE_DATE);
   if (content != null)
   {
      setAssemblyNotLiveDate(mDateFormat.parse(content));
   }

   // toXMLTag() writes the assembly type, so it needs to be read back here.
   content = getOptionalContent(inXMLTag, ASSEMBLY_TYPE);
   if (content != null)
   {
      setAssemblyType(content);
   }

   content = getOptionalContent(inXMLTag, GROUP);
   if (content != null)
   {
      setTaxonomyGroup(content);
   }

   content = getOptionalContent(inXMLTag, GENOME_SIZE);
   if (content != null)
   {
      setGenomeSize(Long.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, GENOME_SIZE_UNGAPPED);
   if (content != null)
   {
      setUngappedGenomeSize(Long.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, GC_PERCENT);
   if (content != null)
   {
      setGC_Percent(Float.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, REPLICON_COUNT);
   if (content != null)
   {
      setRepliconCount(Integer.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, SCAFFOLD_COUNT);
   if (content != null)
   {
      setScaffoldCount(Integer.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, CONTIG_COUNT);
   if (content != null)
   {
      setContigCount(Integer.valueOf(content));
   }

   setAnnotationProvider(getOptionalContent(inXMLTag, ANNOTATION_PROVIDER));
   setAnnotationName(getOptionalContent(inXMLTag, ANNOTATION_NAME));

   content = getOptionalContent(inXMLTag, ANNOTATION_DATE);
   setAnnotationDate(content != null ? mDateFormat.parse(content) : null);

   content = getOptionalContent(inXMLTag, TOTAL_GENE_COUNT);
   if (content != null)
   {
      setTotalGeneCount(Integer.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, PROTEIN_CODING_GENE_COUNT);
   if (content != null)
   {
      setProteinCodingGeneCount(Integer.valueOf(content));
   }

   content = getOptionalContent(inXMLTag, NON_CODING_GENE_COUNT);
   if (content != null)
   {
      setNonCodingGeneCount(Integer.valueOf(content));
   }

   setPubMedId(getOptionalContent(inXMLTag, PUBMED_ID));
}

//---------------------------------------------------------------------------
// Returns the trimmed content of the required subtag with the specified name.
private static String getRequiredContent(XMLTag inParentTag, XMLName inSubtagName)
{
   return inParentTag.getRequiredSubtagByName(inSubtagName).getContent().trim();
}

//---------------------------------------------------------------------------
// Returns the trimmed content of the optional subtag with the specified name or null if the subtag is not present.
private static String getOptionalContent(XMLTag inParentTag, XMLName inSubtagName)
{
   XMLTag subtag = inParentTag.getOptionalSubtagByName(inSubtagName);

   return (subtag != null ? subtag.getContent().trim() : null);
}
//##########################################################################
// PUBLIC METHODS
//##########################################################################
//---------------------------------------------------------------------------
/**
* Use for processing an assembly_summary.txt file from the NCBI ftp site.
* @param inFile the assembly_summary.txt file
* @return a Map with species taxons as the keys
* @throws IOException
* @throws ParseException
*/
public static Map> extractInfoFromAssemblySummaryFile(File inFile)
throws IOException, ParseException
{
// Need to pre-process the content so the header doesn't get skipped as a comment.
List lines = TSV.parse(new BufferedReader(new FileReader(inFile)));
if (lines.get(0)[0].startsWith("##"))
{
lines.remove(0); // ## See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
}
if (lines.get(0)[0].startsWith("#assembly_accession"))
{
lines.get(0)[0] = lines.get(0)[0].substring(1);
}
DataTable dataTable = new DataTable(lines);
Map> assemblyMap = new HashMap<>(dataTable.rowCount());
for (DataRow dataRow : dataTable.getRows())
{
NCBIGenomicAssemblyInfo assemblyInfo = new NCBIGenomicAssemblyInfo(dataRow);
List infoList = assemblyMap.get(assemblyInfo.getSpeciesTaxon());
if (null == infoList)
{
infoList = new ArrayList<>(2);
assemblyMap.put(assemblyInfo.getSpeciesTaxon(), infoList);
}
infoList.add(assemblyInfo);
}
return assemblyMap;
}
//---------------------------------------------------------------------------
/**
 * Generates a listing of the assembly's values as "name: value" pairs
 * joined with a newline delimiter.
 * Can safely be called when the organism or species taxon has not been set,
 * in which case the taxon-derived values are shown as "null".
 * @return the listing of the assembly's values
 */
@Override
public String toString()
{
   // The taxons are null until set (ex: after using the no-arg constructor),
   // so they can't be dereferenced unconditionally.
   NCBITaxon organismTaxon = getOrganismTaxon();
   NCBITaxon speciesTaxon  = getSpeciesTaxon();

   String organismTaxonId = (organismTaxon != null ? "" + organismTaxon.getTaxonId() : "null");
   String speciesTaxonId  = (speciesTaxon != null ? "" + speciesTaxon.getTaxonId() : "null");
   String organismName    = (speciesTaxon != null ? "" + speciesTaxon.getScientificName() : "null");

   StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("\n")
         .delimitedAppend("assembly_accession: " + getAssemblyAccession())
         .delimitedAppend("bioproject: " + getBioprojectAccession())
         .delimitedAppend("biosample: " + getBiosampleAccession())
         .delimitedAppend("wgs_master: " + getWGSAccession())
         .delimitedAppend("refseq_category: " + getRefSeqCategory())
         .delimitedAppend("taxid: " + organismTaxonId)
         .delimitedAppend("species_taxid: " + speciesTaxonId)
         .delimitedAppend("organism_name: " + organismName)
         .delimitedAppend("infraspecific_name: " + getInfraspecificName())
         .delimitedAppend("isolate: " + getIsolate())
         .delimitedAppend("version_status: " + getAssemblyVersionStatus())
         .delimitedAppend("assembly_level: " + getAssemblyLevel())
         .delimitedAppend("release_type: " + getAssemblyReleaseType())
         .delimitedAppend("genome_rep: " + getGenomicRepresentation())
         .delimitedAppend("seq_rel_date: " + getReleaseDate())
         .delimitedAppend("asm_name: " + getAssemblyName())
         .delimitedAppend("submitter: " + getSubmitter())
         .delimitedAppend("gbrs_paired_asm: " + getGenBankRefSeqPairedAssembly())
         .delimitedAppend("paired_asm_comp: " + getPairedAssemblyComparison())
         .delimitedAppend("ftp_path: " + getFTP_Path())
         .delimitedAppend("excluded_from_refseq: " + getExcludedFromRefSeq())
         .delimitedAppend("relation_to_type_material: " + (getRelationToTypeMaterial() != null ? getRelationToTypeMaterial() : ""))
         .delimitedAppend("asm_not_live_date: " + getAssemblyNotLiveDate())
         .delimitedAppend("assembly_type: " + getAssemblyType())
         // NOTE(review): the following labels use the String form of the XMLName rather than a literal - confirm that is intended
         .delimitedAppend(GROUP + ": " + getTaxonomyGroup())
         .delimitedAppend(GENOME_SIZE + ": " + getGenomeSize())
         .delimitedAppend(GENOME_SIZE_UNGAPPED + ": " + getUngappedGenomeSize())
         .delimitedAppend(GC_PERCENT + ": " + getGC_Percent())
         .delimitedAppend(REPLICON_COUNT + ": " + getRepliconCount())
         .delimitedAppend("pubmed_id: " + getPubMedId())
         ;

   return buffer.toString();
}
//---------------------------------------------------------------------------
/**
 * Writes the backing DataRow to the specified file via DataRow.toTSV().
 * Does nothing if there is no backing DataRow.
 * @param inFile the file to which the row should be written
 * @throws Exception if thrown by DataRow.toTSV()
 */
public void toTSV(File inFile)
   throws Exception
{
   if (null == mDataRow)
   {
      return;
   }

   mDataRow.toTSV(inFile);
}
//---------------------------------------------------------------------------
/**
 * Generates the XML representation of this assembly info.
 * <p>
 * The values which the XMLTag constructor reads as required subtags are always written.
 * Note that the RefSeq category, taxons, version status, assembly level, release type,
 * genomic representation, and release date are dereferenced without a null check,
 * so they must be set before calling this method.
 * All other values are only written if they have been set.
 * </p>
 * @return the NCBIGenomicAssemblyInfo tag
 */
public XMLTag toXMLTag()
{
   XMLTag rootTag = new XMLTag(NCBI_GENOMIC_ASSEMBLY_INFO);

   rootTag.addSubtag(ASSEMBLY_ACCESSION).setContent(getAssemblyAccession());
   rootTag.addSubtag(BIOPROJECT_ACCESSION).setContent(getBioprojectAccession());
   rootTag.addSubtag(BIOSAMPLE_ACCESSION).setContent(getBiosampleAccession());
   rootTag.addSubtag(WGS_ACCESSION).setContent(getWGSAccession());
   rootTag.addSubtag(REFSEQ_CATEGORY).setContent(getRefSeqCategory().name());
   rootTag.addSubtag(ORGANISM_TAXON).setContent(getOrganismTaxon().getTaxonId());
   rootTag.addSubtag(SPECIES_TAXON).setContent(getSpeciesTaxon().getTaxonId());

   addOptionalSubtag(rootTag, INFRASPECIFIC_NAME, getInfraspecificName());
   addOptionalSubtag(rootTag, ISOLATE, getIsolate());

   rootTag.addSubtag(ASSEMBLY_VERSION_STATUS).setContent(getAssemblyVersionStatus().name());
   rootTag.addSubtag(ASSEMBLY_LEVEL).setContent(getAssemblyLevel().name());
   rootTag.addSubtag(ASSEMBLY_RELEASE_TYPE).setContent(getAssemblyReleaseType().name());
   rootTag.addSubtag(GENOMIC_REPRESENTATION).setContent(getGenomicRepresentation().name());
   rootTag.addSubtag(RELEASE_DATE).setContent(mDateFormat.format(getReleaseDate()));
   rootTag.addSubtag(ASSEMBLY_NAME).setContent(getAssemblyName());
   rootTag.addSubtag(SUBMITTER).setContent(getSubmitter());

   addOptionalSubtag(rootTag, GENBANK_REFSEQ_PAIRED_ASSEMBLY, getGenBankRefSeqPairedAssembly());

   if (getPairedAssemblyComparison() != null)
   {
      rootTag.addSubtag(GENOMIC_PAIRED_ASSEMBLY_COMPARISON).setContent(getPairedAssemblyComparison().name());
   }

   rootTag.addSubtag(FTP_PATH).setContent(getFTP_Path());
   rootTag.addSubtag(EXCLUDED_FROM_REFSEQ).setContent(getExcludedFromRefSeq() ? "true" : "false");

   if (getRelationToTypeMaterial() != null)
   {
      rootTag.addSubtag(GENOMIC_RELATION_TO_TYPE_MATERIAL).setContent(getRelationToTypeMaterial().name());
   }

   // The XMLTag constructor reads this optional date, so it has to be written
   // or the value would be lost when round-tripping through XML.
   if (getAssemblyNotLiveDate() != null)
   {
      rootTag.addSubtag(ASM_NOT_LIVE_DATE).setContent(mDateFormat.format(getAssemblyNotLiveDate()));
   }

   addOptionalSubtag(rootTag, ASSEMBLY_TYPE, getAssemblyType());
   addOptionalSubtag(rootTag, GROUP, getTaxonomyGroup());
   addOptionalSubtag(rootTag, GENOME_SIZE, getGenomeSize());
   addOptionalSubtag(rootTag, GENOME_SIZE_UNGAPPED, getUngappedGenomeSize());
   addOptionalSubtag(rootTag, GC_PERCENT, getGC_Percent());
   addOptionalSubtag(rootTag, REPLICON_COUNT, getRepliconCount());
   addOptionalSubtag(rootTag, SCAFFOLD_COUNT, getScaffoldCount());
   addOptionalSubtag(rootTag, CONTIG_COUNT, getContigCount());
   addOptionalSubtag(rootTag, ANNOTATION_PROVIDER, getAnnotationProvider());
   addOptionalSubtag(rootTag, ANNOTATION_NAME, getAnnotationName());

   if (getAnnotationDate() != null)
   {
      rootTag.addSubtag(ANNOTATION_DATE).setContent(mDateFormat.format(getAnnotationDate()));
   }

   addOptionalSubtag(rootTag, TOTAL_GENE_COUNT, getTotalGeneCount());
   addOptionalSubtag(rootTag, PROTEIN_CODING_GENE_COUNT, getProteinCodingGeneCount());
   addOptionalSubtag(rootTag, NON_CODING_GENE_COUNT, getNonCodingGeneCount());
   addOptionalSubtag(rootTag, PUBMED_ID, getPubMedId());

   return rootTag;
}

//---------------------------------------------------------------------------
// Adds a subtag containing the String form of the value to the parent tag - but only if the value is not null.
private static void addOptionalSubtag(XMLTag inParentTag, XMLName inSubtagName, Object inValue)
{
   if (inValue != null)
   {
      inParentTag.addSubtag(inSubtagName).setContent(inValue.toString());
   }
}
//---------------------------------------------------------------------------
/**
 * Generic, name-based retrieval of a value from the backing DataRow.
 * Intended to future-proof against future changes by the NCBI.
 * @param inName the column name
 * @return the String value or null if there is no backing DataRow
 */
public String getField(String inName)
{
   if (null == mDataRow)
   {
      return null;
   }

   return mDataRow.getString(mDataRow.getDataColumn(inName));
}
//---------------------------------------------------------------------------
/**
 * Specifies the assembly accession (the 'assembly_accession' field).
 * @param inValue the assembly accession
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setAssemblyAccession(String inValue)
{
   this.mAssemblyAccession = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly accession (the 'assembly_accession' field).
 * @return the assembly accession or null if it has not been set
 */
public String getAssemblyAccession()
{
   return this.mAssemblyAccession;
}
//---------------------------------------------------------------------------
/**
 * Specifies the BioProject accession (the 'bioproject' field).
 * @param inValue the BioProject accession
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setBioprojectAccession(String inValue)
{
   this.mBioprojectAccession = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the BioProject accession (the 'bioproject' field).
 * @return the BioProject accession or null if it has not been set
 */
public String getBioprojectAccession()
{
   return this.mBioprojectAccession;
}
//---------------------------------------------------------------------------
/**
 * Specifies the BioSample accession (the 'biosample' field).
 * @param inValue the BioSample accession
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setBiosampleAccession(String inValue)
{
   this.mBiosampleAccession = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the BioSample accession (the 'biosample' field).
 * @return the BioSample accession or null if it has not been set
 */
public String getBiosampleAccession()
{
   return this.mBiosampleAccession;
}
//---------------------------------------------------------------------------
/**
 * Specifies the WGS accession (the 'wgs_master' field).
 * @param inValue the WGS accession
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setWGSAccession(String inValue)
{
   this.mWGSAccession = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the WGS accession (the 'wgs_master' field).
 * @return the WGS accession or null if it has not been set
 */
public String getWGSAccession()
{
   return this.mWGSAccession;
}
//---------------------------------------------------------------------------
/**
 * Specifies the RefSeq category (the 'refseq_category' field).
 * @param inValue the RefSeq category
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setRefSeqCategory(GenomicRefSeqCategory inValue)
{
   this.mRefseqCategory = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the RefSeq category (the 'refseq_category' field).
 * @return the RefSeq category or null if it has not been set
 */
public GenomicRefSeqCategory getRefSeqCategory()
{
   return this.mRefseqCategory;
}
//---------------------------------------------------------------------------
/**
 * Specifies the organism's taxon (the 'taxid' field).
 * @param inValue the organism's taxon
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setOrganismTaxon(NCBITaxon inValue)
{
   this.mOrganismNCBITaxon = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the organism's taxon (the 'taxid' field).
 * @return the organism's taxon or null if it has not been set
 */
public NCBITaxon getOrganismTaxon()
{
   return this.mOrganismNCBITaxon;
}
//---------------------------------------------------------------------------
/**
 * Specifies the species taxon (the 'species_taxid' field).
 * @param inValue the species taxon
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setSpeciesTaxon(NCBITaxon inValue)
{
   this.mSpeciesNCBITaxon = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the species taxon (the 'species_taxid' field).
 * @return the species taxon or null if it has not been set
 */
public NCBITaxon getSpeciesTaxon()
{
   return this.mSpeciesNCBITaxon;
}
//---------------------------------------------------------------------------
/**
 * Specifies the infraspecific name (the 'infraspecific_name' field).
 * @param inValue the infraspecific name
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setInfraspecificName(String inValue)
{
   this.mInfraspecificName = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the infraspecific name (the 'infraspecific_name' field).
 * @return the infraspecific name or null if it has not been set
 */
public String getInfraspecificName()
{
   return this.mInfraspecificName;
}
//---------------------------------------------------------------------------
/**
 * Specifies the isolate (the 'isolate' field).
 * @param inValue the isolate
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setIsolate(String inValue)
{
   this.mIsolate = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the isolate (the 'isolate' field).
 * @return the isolate or null if it has not been set
 */
public String getIsolate()
{
   return this.mIsolate;
}
//---------------------------------------------------------------------------
/**
 * Specifies the assembly's version status (the 'version_status' field).
 * @param inValue the version status
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setAssemblyVersionStatus(GenomicAssemblyVersionStatus inValue)
{
   this.mAssemblyVersionStatus = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly's version status (the 'version_status' field).
 * @return the version status or null if it has not been set
 */
public GenomicAssemblyVersionStatus getAssemblyVersionStatus()
{
   return this.mAssemblyVersionStatus;
}
//---------------------------------------------------------------------------
/**
 * Specifies the assembly level (the 'assembly_level' field).
 * @param inValue the assembly level
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setAssemblyLevel(GenomicAssemblyLevel inValue)
{
   this.mAssemblyLevel = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly level (the 'assembly_level' field).
 * @return the assembly level or null if it has not been set
 */
public GenomicAssemblyLevel getAssemblyLevel()
{
   return this.mAssemblyLevel;
}
//---------------------------------------------------------------------------
/**
 * Specifies the assembly's release type (the 'release_type' field).
 * @param inValue the release type
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setAssemblyReleaseType(GenomicAssemblyReleaseType inValue)
{
   this.mAssemblyReleaseType = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly's release type (the 'release_type' field).
 * @return the release type or null if it has not been set
 */
public GenomicAssemblyReleaseType getAssemblyReleaseType()
{
   return this.mAssemblyReleaseType;
}
//---------------------------------------------------------------------------
/**
 * Specifies the genomic representation (the 'genome_rep' field).
 * @param inValue the genomic representation
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setGenomicRepresentation(GenomicRepresentation inValue)
{
   this.mGenomicRepresentation = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the genomic representation (the 'genome_rep' field).
 * @return the genomic representation or null if it has not been set
 */
public GenomicRepresentation getGenomicRepresentation()
{
   return this.mGenomicRepresentation;
}
//---------------------------------------------------------------------------
/**
 * Specifies the release date (the 'seq_rel_date' field).
 * @param inValue the release date
 * @return this object to enable method chaining
 */
public NCBIGenomicAssemblyInfo setReleaseDate(Date inValue)
{
   this.mReleaseDate = inValue;

   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the release date (the 'seq_rel_date' field).
 * @return the release date or null if it has not been set
 */
public Date getReleaseDate()
{
   return this.mReleaseDate;
}
//---------------------------------------------------------------------------
/**
 * Stores the assembly name on this object.
 *
 * @param inValue the assembly name to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setAssemblyName(String inValue)
{
   this.mAssemblyName = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly name currently held by this object.
 *
 * @return the stored assembly name
 */
public String getAssemblyName()
{
   return this.mAssemblyName;
}
//---------------------------------------------------------------------------
/**
 * Stores the submitter on this object.
 *
 * @param inValue the submitter string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setSubmitter(String inValue)
{
   this.mSubmitter = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the submitter currently held by this object.
 *
 * @return the stored submitter string
 */
public String getSubmitter()
{
   return this.mSubmitter;
}
//---------------------------------------------------------------------------
/**
 * Stores the GenBank/RefSeq paired assembly value on this object.
 *
 * @param inValue the paired assembly string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setGenBankRefSeqPairedAssembly(String inValue)
{
   this.mGenBankRefSeqPairedAssembly = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the GenBank/RefSeq paired assembly value currently held by this object.
 *
 * @return the stored paired assembly string
 */
public String getGenBankRefSeqPairedAssembly()
{
   return this.mGenBankRefSeqPairedAssembly;
}
//---------------------------------------------------------------------------
/**
 * Stores the paired assembly comparison on this object.
 *
 * @param inValue the paired assembly comparison to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setPairedAssemblyComparison(GenomicPairedAssemblyComparison inValue)
{
   this.mGenomicPairedAssemblyComparison = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the paired assembly comparison currently held by this object.
 *
 * @return the stored {@code GenomicPairedAssemblyComparison}
 */
public GenomicPairedAssemblyComparison getPairedAssemblyComparison()
{
   return this.mGenomicPairedAssemblyComparison;
}
//---------------------------------------------------------------------------
/**
 * Stores the FTP path on this object.
 *
 * @param inValue the FTP path string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setFTP_Path(String inValue)
{
   this.mFTP_Path = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the FTP path currently held by this object.
 *
 * @return the stored FTP path string
 */
public String getFTP_Path()
{
   return this.mFTP_Path;
}
//---------------------------------------------------------------------------
/**
 * Stores the excluded-from-RefSeq flag on this object.
 *
 * @param inValue the flag value to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setExcludedFromRefSeq(boolean inValue)
{
   this.mExcludedFromRefSeq = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the excluded-from-RefSeq flag currently held by this object.
 *
 * @return the stored flag value
 */
public boolean getExcludedFromRefSeq()
{
   return this.mExcludedFromRefSeq;
}
//---------------------------------------------------------------------------
/**
 * Stores the relation-to-type-material value on this object.
 *
 * @param inValue the relation to type material to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setRelationToTypeMaterial(GenomicRelationToTypeMaterial inValue)
{
   this.mGenomicRelationToTypeMaterial = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the relation-to-type-material value currently held by this object.
 *
 * @return the stored {@code GenomicRelationToTypeMaterial}
 */
public GenomicRelationToTypeMaterial getRelationToTypeMaterial()
{
   return this.mGenomicRelationToTypeMaterial;
}
//---------------------------------------------------------------------------
/**
 * Stores the assembly-not-live date on this object.
 * The {@code Date} reference is kept as-is (no copy is made), so later
 * mutation of the argument is visible through this object.
 *
 * @param inValue the date to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setAssemblyNotLiveDate(Date inValue)
{
   this.mAssemblyNotLiveDate = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly-not-live date currently held by this object.
 * The internal {@code Date} reference itself is returned, not a copy.
 *
 * @return the stored date
 */
public Date getAssemblyNotLiveDate()
{
   return this.mAssemblyNotLiveDate;
}
//---------------------------------------------------------------------------
/**
 * Stores the assembly type on this object.
 *
 * @param inValue the assembly type string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setAssemblyType(String inValue)
{
   this.mAssemblyType = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the assembly type currently held by this object.
 *
 * @return the stored assembly type string
 */
public String getAssemblyType()
{
   return this.mAssemblyType;
}
//---------------------------------------------------------------------------
/**
 * Stores the taxonomy group on this object.
 *
 * @param inValue the taxonomy group string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setTaxonomyGroup(String inValue)
{
   this.mTaxonomyGroup = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the taxonomy group currently held by this object.
 *
 * @return the stored taxonomy group string
 */
public String getTaxonomyGroup()
{
   return this.mTaxonomyGroup;
}
//---------------------------------------------------------------------------
/**
 * Stores the genome size on this object.
 *
 * @param inValue the genome size to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setGenomeSize(Long inValue)
{
   this.mGenomeSize = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the genome size currently held by this object.
 *
 * @return the stored genome size as a boxed {@code Long}, possibly {@code null}
 */
public Long getGenomeSize()
{
   return this.mGenomeSize;
}
//---------------------------------------------------------------------------
/**
 * Stores the ungapped genome size on this object.
 *
 * @param inValue the ungapped genome size to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setUngappedGenomeSize(Long inValue)
{
   this.mUngappedGenomeSize = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the ungapped genome size currently held by this object.
 *
 * @return the stored ungapped genome size as a boxed {@code Long}, possibly {@code null}
 */
public Long getUngappedGenomeSize()
{
   return this.mUngappedGenomeSize;
}
//---------------------------------------------------------------------------
/**
 * Stores the GC percent on this object.
 *
 * @param inValue the GC percent to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setGC_Percent(Float inValue)
{
   this.mGCPercent = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the GC percent currently held by this object.
 *
 * @return the stored GC percent as a boxed {@code Float}, possibly {@code null}
 */
public Float getGC_Percent()
{
   return this.mGCPercent;
}
//---------------------------------------------------------------------------
/**
 * Stores the replicon count on this object.
 *
 * @param inValue the replicon count to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setRepliconCount(Integer inValue)
{
   this.mRepliconCount = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the replicon count currently held by this object.
 *
 * @return the stored replicon count as a boxed {@code Integer}, possibly {@code null}
 */
public Integer getRepliconCount()
{
   return this.mRepliconCount;
}
//---------------------------------------------------------------------------
/**
 * Stores the scaffold count on this object.
 *
 * @param inValue the scaffold count to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setScaffoldCount(Integer inValue)
{
   this.mScaffoldCount = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the scaffold count currently held by this object.
 *
 * @return the stored scaffold count as a boxed {@code Integer}, possibly {@code null}
 */
public Integer getScaffoldCount()
{
   return this.mScaffoldCount;
}
//---------------------------------------------------------------------------
/**
 * Stores the contig count on this object.
 *
 * @param inValue the contig count to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setContigCount(Integer inValue)
{
   this.mContigCount = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the contig count currently held by this object.
 *
 * @return the stored contig count as a boxed {@code Integer}, possibly {@code null}
 */
public Integer getContigCount()
{
   return this.mContigCount;
}
//---------------------------------------------------------------------------
/**
 * Stores the annotation provider on this object.
 *
 * @param inValue the annotation provider string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setAnnotationProvider(String inValue)
{
   this.mAnnotationProvider = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the annotation provider currently held by this object.
 *
 * @return the stored annotation provider string
 */
public String getAnnotationProvider()
{
   return this.mAnnotationProvider;
}
//---------------------------------------------------------------------------
/**
 * Stores the annotation name on this object.
 *
 * @param inValue the annotation name string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setAnnotationName(String inValue)
{
   this.mAnnotationName = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the annotation name currently held by this object.
 *
 * @return the stored annotation name string
 */
public String getAnnotationName()
{
   return this.mAnnotationName;
}
//---------------------------------------------------------------------------
/**
 * Stores the annotation date on this object.
 * The {@code Date} reference is kept as-is (no copy is made), so later
 * mutation of the argument is visible through this object.
 *
 * @param inValue the annotation date to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setAnnotationDate(Date inValue)
{
   this.mAnnotationDate = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the annotation date currently held by this object.
 * The internal {@code Date} reference itself is returned, not a copy.
 *
 * @return the stored annotation date
 */
public Date getAnnotationDate()
{
   return this.mAnnotationDate;
}
//---------------------------------------------------------------------------
/**
 * Stores the total gene count on this object.
 *
 * @param inValue the total gene count to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setTotalGeneCount(Integer inValue)
{
   this.mTotalGeneCount = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the total gene count currently held by this object.
 *
 * @return the stored total gene count as a boxed {@code Integer}, possibly {@code null}
 */
public Integer getTotalGeneCount()
{
   return this.mTotalGeneCount;
}
//---------------------------------------------------------------------------
/**
 * Stores the protein-coding gene count on this object.
 *
 * @param inValue the protein-coding gene count to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setProteinCodingGeneCount(Integer inValue)
{
   this.mProteinCodingGeneCount = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the protein-coding gene count currently held by this object.
 *
 * @return the stored protein-coding gene count as a boxed {@code Integer}, possibly {@code null}
 */
public Integer getProteinCodingGeneCount()
{
   return this.mProteinCodingGeneCount;
}
//---------------------------------------------------------------------------
/**
 * Stores the non-coding gene count on this object.
 *
 * @param inValue the non-coding gene count to store (boxed, so {@code null} is accepted)
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setNonCodingGeneCount(Integer inValue)
{
   this.mNonCodingGeneCount = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the non-coding gene count currently held by this object.
 *
 * @return the stored non-coding gene count as a boxed {@code Integer}, possibly {@code null}
 */
public Integer getNonCodingGeneCount()
{
   return this.mNonCodingGeneCount;
}
//---------------------------------------------------------------------------
/**
 * Stores the PubMed id on this object.
 *
 * @param inValue the PubMed id string to store
 * @return this object, so that setter calls can be chained
 */
public NCBIGenomicAssemblyInfo setPubMedId(String inValue)
{
   this.mPubMedId = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 * Returns the PubMed id currently held by this object.
 *
 * @return the stored PubMed id string
 */
public String getPubMedId()
{
   return this.mPubMedId;
}
//---------------------------------------------------------------------------
/**
* Compares this object with the specified object for order. Returns a
* negative integer, zero, or a positive integer as this object is less
* than, equal to, or greater than the specified object.
*
* The implementor must ensure
* {@code sgn(x.compareTo(y)) == -sgn(y.compareTo(x))}
* for all {@code x} and {@code y}. (This
* implies that {@code x.compareTo(y)} must throw an exception iff
* {@code y.compareTo(x)} throws an exception.)
*
*
* <p>The implementor must also ensure that the relation is transitive:
* {@code (x.compareTo(y) > 0 && y.compareTo(z) > 0)} implies
* {@code x.compareTo(z) > 0}.
*
*
* <p>Finally, the implementor must ensure that {@code x.compareTo(y)==0}
* implies that {@code sgn(x.compareTo(z)) == sgn(y.compareTo(z))}, for
* all {@code z}.
*
*
* <p>It is strongly recommended, but not strictly required that
* {@code (x.compareTo(y)==0) == (x.equals(y))}. Generally speaking, any
* class that implements the {@code Comparable} interface and violates
* this condition should clearly indicate this fact. The recommended
* language is "Note: this class has a natural ordering that is
* inconsistent with equals."
*
*
* <p>In the foregoing description, the notation
* {@code sgn(}expression{@code )} designates the mathematical
* signum function, which is defined to return one of {@code -1},
* {@code 0}, or {@code 1} according to whether the value of
* expression is negative, zero, or positive, respectively.
*
* @param inObj2 the object to be compared.
* @return a negative integer, zero, or a positive integer as this object
* is less than, equal to, or greater than the specified object.
* @throws ClassCastException if the specified object's type prevents it
* from being compared to this object.
*/
@Override
public int compareTo(NCBIGenomicAssemblyInfo inObj2)
{
   // A null argument does not throw: -1 is returned, i.e. this object
   // is reported as less than null.
   if (null == inObj2)
   {
      return -1;
   }

   // Sort keys are applied in order; the first key that differs decides the result.
   int cmp = CompareUtil.compare(getSpeciesTaxon(), inObj2.getSpeciesTaxon());
   if (cmp != 0)
   {
      return cmp;
   }

   cmp = CompareUtil.compare(getOrganismTaxon(), inObj2.getOrganismTaxon());
   if (cmp != 0)
   {
      return cmp;
   }

   cmp = CompareUtil.compare(getRefSeqCategory(), inObj2.getRefSeqCategory());
   if (cmp != 0)
   {
      return cmp;
   }

   cmp = CompareUtil.compare(getGenomicRepresentation(), inObj2.getGenomicRepresentation());
   if (cmp != 0)
   {
      return cmp;
   }

   cmp = CompareUtil.compare(getAssemblyReleaseType(), inObj2.getAssemblyReleaseType());
   if (cmp != 0)
   {
      return cmp;
   }

   // Final tie-breaker: whatever this comparison yields (including 0) is the result.
   return CompareUtil.compare(getAssemblyVersionStatus(), inObj2.getAssemblyVersionStatus());
}
}