All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.genomic.assembly.NCBIGenomicAssemblyInfo Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.genomic.assembly;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
import com.hfg.util.BooleanUtil;
import com.hfg.util.CompareUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.DataRow;
import com.hfg.util.collection.DataTable;
import com.hfg.util.io.TSV;
import com.hfg.xml.HfgXML;
import com.hfg.xml.XMLName;
import com.hfg.xml.XMLTag;

//------------------------------------------------------------------------------
/**
 * Information about an NCBI genomic assembly.
 * See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the fields.
 * 
 * @author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg XML/HTML Coding Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class NCBIGenomicAssemblyInfo implements Comparable { // Tag names private static final XMLName NCBI_GENOMIC_ASSEMBLY_INFO = new XMLName("NCBIGenomicAssemblyInfo", HfgXML.HFG_NAMESPACE); private static final XMLName ASSEMBLY_ACCESSION = new XMLName("assembly_accession", HfgXML.HFG_NAMESPACE); private static final XMLName BIOPROJECT_ACCESSION = new XMLName("bioproject", HfgXML.HFG_NAMESPACE); private static final XMLName BIOSAMPLE_ACCESSION = new XMLName("biosample", HfgXML.HFG_NAMESPACE); private static final XMLName WGS_ACCESSION = new XMLName("wgs_master", HfgXML.HFG_NAMESPACE); private static final XMLName REFSEQ_CATEGORY = new XMLName("refseq_category", HfgXML.HFG_NAMESPACE); private static final XMLName ORGANISM_TAXON = new XMLName("taxid", HfgXML.HFG_NAMESPACE); private static final XMLName SPECIES_TAXON = new XMLName("species_taxid", HfgXML.HFG_NAMESPACE); private static final XMLName INFRASPECIFIC_NAME = new 
XMLName("infraspecific_name", HfgXML.HFG_NAMESPACE); private static final XMLName ISOLATE = new XMLName("isolate", HfgXML.HFG_NAMESPACE); private static final XMLName ASSEMBLY_VERSION_STATUS = new XMLName("version_status", HfgXML.HFG_NAMESPACE); private static final XMLName ASSEMBLY_LEVEL = new XMLName("assembly_level", HfgXML.HFG_NAMESPACE); private static final XMLName ASSEMBLY_RELEASE_TYPE = new XMLName("release_type", HfgXML.HFG_NAMESPACE); private static final XMLName GENOMIC_REPRESENTATION = new XMLName("genome_rep", HfgXML.HFG_NAMESPACE); private static final XMLName RELEASE_DATE = new XMLName("seq_rel_date", HfgXML.HFG_NAMESPACE); private static final XMLName ASSEMBLY_NAME = new XMLName("asm_name", HfgXML.HFG_NAMESPACE); private static final XMLName SUBMITTER = new XMLName("asm_submitter", HfgXML.HFG_NAMESPACE); private static final XMLName GENBANK_REFSEQ_PAIRED_ASSEMBLY = new XMLName("gbrs_paired_asm", HfgXML.HFG_NAMESPACE); private static final XMLName GENOMIC_PAIRED_ASSEMBLY_COMPARISON = new XMLName("paired_asm_comp", HfgXML.HFG_NAMESPACE); private static final XMLName FTP_PATH = new XMLName("ftp_path", HfgXML.HFG_NAMESPACE); private static final XMLName EXCLUDED_FROM_REFSEQ = new XMLName("excluded_from_refseq", HfgXML.HFG_NAMESPACE); private static final XMLName GENOMIC_RELATION_TO_TYPE_MATERIAL = new XMLName("relation_to_type_material", HfgXML.HFG_NAMESPACE); private static final XMLName ASM_NOT_LIVE_DATE = new XMLName("asm_not_live_date", HfgXML.HFG_NAMESPACE); private static final XMLName ASSEMBLY_TYPE = new XMLName("assembly_type", HfgXML.HFG_NAMESPACE); private static final XMLName GROUP = new XMLName("group", HfgXML.HFG_NAMESPACE); private static final XMLName GENOME_SIZE = new XMLName("genome_size", HfgXML.HFG_NAMESPACE); private static final XMLName GENOME_SIZE_UNGAPPED = new XMLName("genome_size_ungapped", HfgXML.HFG_NAMESPACE); private static final XMLName GC_PERCENT = new XMLName("gc_percent", HfgXML.HFG_NAMESPACE); private static final 
XMLName REPLICON_COUNT = new XMLName("replicon_count", HfgXML.HFG_NAMESPACE); private static final XMLName SCAFFOLD_COUNT = new XMLName("scaffold_count", HfgXML.HFG_NAMESPACE); private static final XMLName CONTIG_COUNT = new XMLName("contig_count", HfgXML.HFG_NAMESPACE); private static final XMLName ANNOTATION_PROVIDER = new XMLName("annotation_provider", HfgXML.HFG_NAMESPACE); private static final XMLName ANNOTATION_NAME = new XMLName("annotation_name", HfgXML.HFG_NAMESPACE); private static final XMLName ANNOTATION_DATE = new XMLName("annotation_date", HfgXML.HFG_NAMESPACE); private static final XMLName TOTAL_GENE_COUNT = new XMLName("total_gene_count", HfgXML.HFG_NAMESPACE); private static final XMLName PROTEIN_CODING_GENE_COUNT = new XMLName("protein_coding_gene_count", HfgXML.HFG_NAMESPACE); private static final XMLName NON_CODING_GENE_COUNT = new XMLName("non_coding_gene_count", HfgXML.HFG_NAMESPACE); private static final XMLName PUBMED_ID = new XMLName("pubmed_id", HfgXML.HFG_NAMESPACE); private String mAssemblyAccession; private String mBioprojectAccession; private String mBiosampleAccession; private String mWGSAccession; private GenomicRefSeqCategory mRefseqCategory; private NCBITaxon mOrganismNCBITaxon; private NCBITaxon mSpeciesNCBITaxon; // The species taxid will differ from the // organism taxid (column 6) only when the // organism was reported at a sub-species or strain level. 
private String mInfraspecificName;
private String mIsolate;
private GenomicAssemblyVersionStatus mAssemblyVersionStatus;
private GenomicAssemblyLevel mAssemblyLevel;
private GenomicAssemblyReleaseType mAssemblyReleaseType;
private GenomicRepresentation mGenomicRepresentation;
private Date mReleaseDate;
private String mAssemblyName;
private String mSubmitter;
private String mGenBankRefSeqPairedAssembly;
private GenomicPairedAssemblyComparison mGenomicPairedAssemblyComparison;
private String mFTP_Path;
private Boolean mExcludedFromRefSeq;
private GenomicRelationToTypeMaterial mGenomicRelationToTypeMaterial;
private Date mAssemblyNotLiveDate;
private String mAssemblyType;
private String mTaxonomyGroup;
private Long mGenomeSize;
private Long mUngappedGenomeSize;
private Float mGCPercent;
private Integer mRepliconCount;
private Integer mScaffoldCount;
private Integer mContigCount;
private String mAnnotationProvider;
private String mAnnotationName;
private Date mAnnotationDate;
private Integer mTotalGeneCount;
private Integer mProteinCodingGeneCount;
private Integer mNonCodingGeneCount;
private String mPubMedId;

// The row this object was built from (empty for the other constructors); backs getField() and toTSV().
private DataRow mDataRow;

// Date format used for the XML representation.
// NOTE: the year must be lowercase 'yyyy'. Uppercase 'YYYY' means "week year" and
// makes SimpleDateFormat.parse() ignore the month and day fields.
private SimpleDateFormat mDateFormat = new SimpleDateFormat("yyyy-MM-dd");

// Date format used by the assembly summary file. Ex: 2019/07/30
private static SimpleDateFormat YYYYMMDD_FORMAT = new SimpleDateFormat("yyyy/MM/dd");

// Marker checked by readValueColumn() to detect a missing value in the assembly summary file.
private static final String NOT_AVAILABLE = "na";

//##########################################################################
// CONSTRUCTORS
//##########################################################################

//---------------------------------------------------------------------------
/**
 * Creates an empty instance backed by an empty DataRow.
 */
public NCBIGenomicAssemblyInfo()
{
   mDataRow = new DataRow();
}

//---------------------------------------------------------------------------
/**
 * Creates an instance from one row of an assembly summary table. Columns are
 * looked up by the local names of the tag constants (ex: "assembly_accession").
 * Numeric and date columns that are unset or contain "na" are left null.
 *
 * @param inDataRow the row to read; it is retained for {@link #getField(String)} and {@link #toTSV(File)}
 * @throws ParseException if a date column cannot be parsed as yyyy/MM/dd
 */
public NCBIGenomicAssemblyInfo(DataRow inDataRow)
   throws ParseException
{
   mDataRow = inDataRow;

   setAssemblyAccession(readColumn(inDataRow, ASSEMBLY_ACCESSION));
   setBioprojectAccession(readColumn(inDataRow, BIOPROJECT_ACCESSION));
   setBiosampleAccession(readColumn(inDataRow, BIOSAMPLE_ACCESSION));
   setWGSAccession(readColumn(inDataRow, WGS_ACCESSION));

   String stringValue = readColumn(inDataRow, REFSEQ_CATEGORY);
   if (StringUtil.isSet(stringValue))
   {
      setRefSeqCategory(GenomicRefSeqCategory.valueOf(stringValue));
   }

   stringValue = readColumn(inDataRow, ORGANISM_TAXON);
   if (StringUtil.isSet(stringValue))
   {
      setOrganismTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(stringValue)));
   }

   // Read the species taxon from its own column ("species_taxid"), not from "taxid".
   stringValue = readColumn(inDataRow, SPECIES_TAXON);
   if (StringUtil.isSet(stringValue))
   {
      setSpeciesTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(stringValue)));
   }

   // Skipping field 7 (the species name) since is part of the taxon info

   setInfraspecificName(readColumn(inDataRow, INFRASPECIFIC_NAME));
   setIsolate(readColumn(inDataRow, ISOLATE));

   stringValue = readColumn(inDataRow, ASSEMBLY_VERSION_STATUS);
   if (StringUtil.isSet(stringValue))
   {
      setAssemblyVersionStatus(GenomicAssemblyVersionStatus.valueOf(stringValue));
   }

   stringValue = readColumn(inDataRow, ASSEMBLY_LEVEL);
   if (StringUtil.isSet(stringValue))
   {
      setAssemblyLevel(GenomicAssemblyLevel.valueOf(stringValue));
   }

   stringValue = readColumn(inDataRow, ASSEMBLY_RELEASE_TYPE);
   if (StringUtil.isSet(stringValue))
   {
      setAssemblyReleaseType(GenomicAssemblyReleaseType.valueOf(stringValue));
   }

   stringValue = readColumn(inDataRow, GENOMIC_REPRESENTATION);
   if (StringUtil.isSet(stringValue))
   {
      setGenomicRepresentation(GenomicRepresentation.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, RELEASE_DATE);
   if (stringValue != null)
   {
      setReleaseDate(YYYYMMDD_FORMAT.parse(stringValue)); // Ex: 2019/07/30
   }

   setAssemblyName(readColumn(inDataRow, ASSEMBLY_NAME));
   setSubmitter(readColumn(inDataRow, SUBMITTER));
   setGenBankRefSeqPairedAssembly(readColumn(inDataRow, GENBANK_REFSEQ_PAIRED_ASSEMBLY));

   stringValue = readColumn(inDataRow, GENOMIC_PAIRED_ASSEMBLY_COMPARISON);
   if (StringUtil.isSet(stringValue))
   {
      setPairedAssemblyComparison(GenomicPairedAssemblyComparison.valueOf(stringValue));
   }

   setFTP_Path(readColumn(inDataRow, FTP_PATH));
   setExcludedFromRefSeq(BooleanUtil.valueOf(readColumn(inDataRow, EXCLUDED_FROM_REFSEQ)));

   stringValue = readColumn(inDataRow, GENOMIC_RELATION_TO_TYPE_MATERIAL);
   if (StringUtil.isSet(stringValue))
   {
      setRelationToTypeMaterial(GenomicRelationToTypeMaterial.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, ASM_NOT_LIVE_DATE);
   if (stringValue != null)
   {
      setAssemblyNotLiveDate(YYYYMMDD_FORMAT.parse(stringValue)); // Ex: 2019/07/30
   }

   setAssemblyType(readColumn(inDataRow, ASSEMBLY_TYPE));
   setTaxonomyGroup(readColumn(inDataRow, GROUP));

   stringValue = readValueColumn(inDataRow, GENOME_SIZE);
   if (stringValue != null)
   {
      setGenomeSize(Long.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, GENOME_SIZE_UNGAPPED);
   if (stringValue != null)
   {
      setUngappedGenomeSize(Long.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, GC_PERCENT);
   if (stringValue != null)
   {
      setGC_Percent(Float.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, REPLICON_COUNT);
   if (stringValue != null)
   {
      setRepliconCount(Integer.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, SCAFFOLD_COUNT);
   if (stringValue != null)
   {
      setScaffoldCount(Integer.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, CONTIG_COUNT);
   if (stringValue != null)
   {
      setContigCount(Integer.valueOf(stringValue));
   }

   setAnnotationProvider(readColumn(inDataRow, ANNOTATION_PROVIDER));
   setAnnotationName(readColumn(inDataRow, ANNOTATION_NAME));

   stringValue = readValueColumn(inDataRow, ANNOTATION_DATE);
   if (stringValue != null)
   {
      setAnnotationDate(YYYYMMDD_FORMAT.parse(stringValue)); // Ex: 2019/07/30
   }

   stringValue = readValueColumn(inDataRow, TOTAL_GENE_COUNT);
   if (stringValue != null)
   {
      setTotalGeneCount(Integer.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, PROTEIN_CODING_GENE_COUNT);
   if (stringValue != null)
   {
      setProteinCodingGeneCount(Integer.valueOf(stringValue));
   }

   stringValue = readValueColumn(inDataRow, NON_CODING_GENE_COUNT);
   if (stringValue != null)
   {
      setNonCodingGeneCount(Integer.valueOf(stringValue));
   }

   setPubMedId(readColumn(inDataRow, PUBMED_ID));
}

//---------------------------------------------------------------------------
/**
 * Creates an instance from the XML representation produced by {@link #toXMLTag()}.
 * Subtag content is trimmed before use; optional subtags that are absent leave
 * the corresponding value null.
 *
 * @param inXMLTag the NCBIGenomicAssemblyInfo tag
 * @throws ParseException if an optional date subtag cannot be parsed as yyyy-MM-dd
 */
public NCBIGenomicAssemblyInfo(XMLTag inXMLTag)
   throws ParseException
{
   mDataRow = new DataRow();

   inXMLTag.verifyTagName(NCBI_GENOMIC_ASSEMBLY_INFO);

   setAssemblyAccession(readRequiredContent(inXMLTag, ASSEMBLY_ACCESSION));
   setBioprojectAccession(readRequiredContent(inXMLTag, BIOPROJECT_ACCESSION));
   // The biosample value belongs in the biosample field (it used to overwrite the bioproject).
   setBiosampleAccession(readRequiredContent(inXMLTag, BIOSAMPLE_ACCESSION));
   setWGSAccession(readRequiredContent(inXMLTag, WGS_ACCESSION));

   setRefSeqCategory(GenomicRefSeqCategory.valueOf(readRequiredContent(inXMLTag, REFSEQ_CATEGORY)));

   setOrganismTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(readRequiredContent(inXMLTag, ORGANISM_TAXON))));
   setSpeciesTaxon(NCBITaxon.getByTaxonId(Integer.parseInt(readRequiredContent(inXMLTag, SPECIES_TAXON))));

   setInfraspecificName(readOptionalContent(inXMLTag, INFRASPECIFIC_NAME));
   setIsolate(readOptionalContent(inXMLTag, ISOLATE));

   setAssemblyVersionStatus(GenomicAssemblyVersionStatus.valueOf(readRequiredContent(inXMLTag, ASSEMBLY_VERSION_STATUS)));
   setAssemblyLevel(GenomicAssemblyLevel.valueOf(readRequiredContent(inXMLTag, ASSEMBLY_LEVEL)));
   setAssemblyReleaseType(GenomicAssemblyReleaseType.valueOf(readRequiredContent(inXMLTag, ASSEMBLY_RELEASE_TYPE)));
   setGenomicRepresentation(GenomicRepresentation.valueOf(readRequiredContent(inXMLTag, GENOMIC_REPRESENTATION)));

   String releaseDateString = readRequiredContent(inXMLTag, RELEASE_DATE);
   try
   {
      setReleaseDate(mDateFormat.parse(releaseDateString));
   }
   catch (ParseException e)
   {
      // Kept as an unchecked exception for backward compatibility with existing callers.
      throw new RuntimeException(e);
   }

   setAssemblyName(readRequiredContent(inXMLTag, ASSEMBLY_NAME));
   setSubmitter(readRequiredContent(inXMLTag, SUBMITTER));

   setGenBankRefSeqPairedAssembly(readOptionalContent(inXMLTag, GENBANK_REFSEQ_PAIRED_ASSEMBLY));

   String content = readOptionalContent(inXMLTag, GENOMIC_PAIRED_ASSEMBLY_COMPARISON);
   if (content != null)
   {
      setPairedAssemblyComparison(GenomicPairedAssemblyComparison.valueOf(content));
   }

   setFTP_Path(readRequiredContent(inXMLTag, FTP_PATH));
   setExcludedFromRefSeq(BooleanUtil.valueOf(readRequiredContent(inXMLTag, EXCLUDED_FROM_REFSEQ)));

   content = readOptionalContent(inXMLTag, GENOMIC_RELATION_TO_TYPE_MATERIAL);
   if (content != null)
   {
      setRelationToTypeMaterial(GenomicRelationToTypeMaterial.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, ASM_NOT_LIVE_DATE);
   if (content != null)
   {
      setAssemblyNotLiveDate(mDateFormat.parse(content));
   }

   // toXMLTag() writes the assembly type, so it has to be read back here.
   setAssemblyType(readOptionalContent(inXMLTag, ASSEMBLY_TYPE));
   setTaxonomyGroup(readOptionalContent(inXMLTag, GROUP));

   content = readOptionalContent(inXMLTag, GENOME_SIZE);
   if (content != null)
   {
      setGenomeSize(Long.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, GENOME_SIZE_UNGAPPED);
   if (content != null)
   {
      setUngappedGenomeSize(Long.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, GC_PERCENT);
   if (content != null)
   {
      setGC_Percent(Float.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, REPLICON_COUNT);
   if (content != null)
   {
      setRepliconCount(Integer.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, SCAFFOLD_COUNT);
   if (content != null)
   {
      setScaffoldCount(Integer.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, CONTIG_COUNT);
   if (content != null)
   {
      setContigCount(Integer.valueOf(content));
   }

   setAnnotationProvider(readOptionalContent(inXMLTag, ANNOTATION_PROVIDER));
   setAnnotationName(readOptionalContent(inXMLTag, ANNOTATION_NAME));

   content = readOptionalContent(inXMLTag, ANNOTATION_DATE);
   setAnnotationDate(content != null ? mDateFormat.parse(content) : null);

   content = readOptionalContent(inXMLTag, TOTAL_GENE_COUNT);
   if (content != null)
   {
      setTotalGeneCount(Integer.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, PROTEIN_CODING_GENE_COUNT);
   if (content != null)
   {
      setProteinCodingGeneCount(Integer.valueOf(content));
   }

   content = readOptionalContent(inXMLTag, NON_CODING_GENE_COUNT);
   if (content != null)
   {
      setNonCodingGeneCount(Integer.valueOf(content));
   }

   setPubMedId(readOptionalContent(inXMLTag, PUBMED_ID));
}

//---------------------------------------------------------------------------
// Returns the raw string stored in the row for the column named after the specified tag.
private static String readColumn(DataRow inDataRow, XMLName inColumn)
{
   return inDataRow.getString(inDataRow.getDataColumn(inColumn.getLocalName()));
}

//---------------------------------------------------------------------------
// Like readColumn() but returns null when the value is unset or is the "na" placeholder,
// so that callers only attempt to parse real numbers / dates.
private static String readValueColumn(DataRow inDataRow, XMLName inColumn)
{
   String value = readColumn(inDataRow, inColumn);
   if (! StringUtil.isSet(value)
       || value.equalsIgnoreCase(NOT_AVAILABLE))
   {
      return null;
   }

   return value;
}

//---------------------------------------------------------------------------
// Returns the trimmed content of a required subtag.
private static String readRequiredContent(XMLTag inParentTag, XMLName inSubtagName)
{
   return inParentTag.getRequiredSubtagByName(inSubtagName).getContent().trim();
}

//---------------------------------------------------------------------------
// Returns the trimmed content of an optional subtag or null if the subtag is absent.
private static String readOptionalContent(XMLTag inParentTag, XMLName inSubtagName)
{
   XMLTag subtag = inParentTag.getOptionalSubtagByName(inSubtagName);
   return (subtag != null ? subtag.getContent().trim() : null);
}

//##########################################################################
// PUBLIC METHODS
//##########################################################################

//---------------------------------------------------------------------------
/**
 * Use for processing an assembly_summary.txt file from the NCBI ftp site.
 * @param inFile the assembly_summary.txt file
 * @return a Map with species taxons as the keys; empty if the file contains no lines
 * @throws IOException if the file cannot be read
 * @throws ParseException if a date column cannot be parsed
 */
public static Map<NCBITaxon, List<NCBIGenomicAssemblyInfo>> extractInfoFromAssemblySummaryFile(File inFile)
   throws IOException, ParseException
{
   // Need to pre-process the content so the header doesn't get skipped as a comment.
   List<String[]> lines;
   try (BufferedReader reader = new BufferedReader(new FileReader(inFile)))
   {
      lines = TSV.parse(reader);
   }

   if (! lines.isEmpty()
       && lines.get(0)[0].startsWith("##"))
   {
      // ## See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
      lines.remove(0);
   }

   if (lines.isEmpty())
   {
      return new HashMap<>();
   }

   if (lines.get(0)[0].startsWith("#assembly_accession"))
   {
      // Strip the leading '#' so that the first column gets its proper name.
      lines.get(0)[0] = lines.get(0)[0].substring(1);
   }

   DataTable dataTable = new DataTable(lines);

   Map<NCBITaxon, List<NCBIGenomicAssemblyInfo>> assemblyMap = new HashMap<>(dataTable.rowCount());
   for (DataRow dataRow : dataTable.getRows())
   {
      NCBIGenomicAssemblyInfo assemblyInfo = new NCBIGenomicAssemblyInfo(dataRow);

      List<NCBIGenomicAssemblyInfo> infoList = assemblyMap.get(assemblyInfo.getSpeciesTaxon());
      if (null == infoList)
      {
         infoList = new ArrayList<>(2);
         assemblyMap.put(assemblyInfo.getSpeciesTaxon(), infoList);
      }

      infoList.add(assemblyInfo);
   }

   return assemblyMap;
}

//---------------------------------------------------------------------------
/**
 * Returns a multi-line "name: value" listing of the assembly's values.
 * Unset values (including unset taxa) are rendered as "null" rather than
 * causing a NullPointerException.
 */
@Override
public String toString()
{
   NCBITaxon organismTaxon = getOrganismTaxon();
   NCBITaxon speciesTaxon  = getSpeciesTaxon();

   StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("\n")
         .delimitedAppend("assembly_accession: " + getAssemblyAccession())
         .delimitedAppend("bioproject: " + getBioprojectAccession())
         .delimitedAppend("biosample: " + getBiosampleAccession())
         .delimitedAppend("wgs_master: " + getWGSAccession())
         .delimitedAppend("refseq_category: " + getRefSeqCategory())
         .delimitedAppend("taxid: " + (organismTaxon != null ? String.valueOf(organismTaxon.getTaxonId()) : null))
         .delimitedAppend("species_taxid: " + (speciesTaxon != null ? String.valueOf(speciesTaxon.getTaxonId()) : null))
         .delimitedAppend("organism_name: " + (speciesTaxon != null ? speciesTaxon.getScientificName() : null))
         .delimitedAppend("infraspecific_name: " + getInfraspecificName())
         .delimitedAppend("isolate: " + getIsolate())
         .delimitedAppend("version_status: " + getAssemblyVersionStatus())
         .delimitedAppend("assembly_level: " + getAssemblyLevel())
         .delimitedAppend("release_type: " + getAssemblyReleaseType())
         .delimitedAppend("genome_rep: " + getGenomicRepresentation())
         .delimitedAppend("seq_rel_date: " + getReleaseDate())
         .delimitedAppend("asm_name: " + getAssemblyName())
         .delimitedAppend("submitter: " + getSubmitter())
         .delimitedAppend("gbrs_paired_asm: " + getGenBankRefSeqPairedAssembly())
         .delimitedAppend("paired_asm_comp: " + getPairedAssemblyComparison())
         .delimitedAppend("ftp_path: " + getFTP_Path())
         // Uses the field directly since the flag may not have been set yet.
         .delimitedAppend("excluded_from_refseq: " + mExcludedFromRefSeq)
         .delimitedAppend("relation_to_type_material: " + (getRelationToTypeMaterial() != null ? getRelationToTypeMaterial() : ""))
         .delimitedAppend("asm_not_live_date: " + getAssemblyNotLiveDate())
         .delimitedAppend("assembly_type: " + getAssemblyType())
         .delimitedAppend(GROUP + ": " + getTaxonomyGroup())
         .delimitedAppend(GENOME_SIZE + ": " + getGenomeSize())
         .delimitedAppend(GENOME_SIZE_UNGAPPED + ": " + getUngappedGenomeSize())
         .delimitedAppend(GC_PERCENT + ": " + getGC_Percent())
         .delimitedAppend(REPLICON_COUNT + ": " + getRepliconCount())
         .delimitedAppend("pubmed_id: " + getPubMedId())
         ;

   return buffer.toString();
}

//---------------------------------------------------------------------------
/**
 * Writes the backing DataRow to the specified file via DataRow.toTSV().
 * Does nothing if there is no backing row.
 * @param inFile the file to write
 * @throws Exception if the DataRow fails to write
 */
public void toTSV(File inFile)
   throws Exception
{
   if (mDataRow != null)
   {
      mDataRow.toTSV(inFile);
   }
}

//---------------------------------------------------------------------------
/**
 * Builds the XML representation that {@link #NCBIGenomicAssemblyInfo(XMLTag)} reads back.
 * The accessions, RefSeq category, taxa, status / level / release type / representation,
 * release date, assembly name, submitter, FTP path and excluded flag are always written
 * (and are dereferenced without null checks); all other values are written only when set.
 * @return the root NCBIGenomicAssemblyInfo tag
 */
public XMLTag toXMLTag()
{
   XMLTag rootTag = new XMLTag(NCBI_GENOMIC_ASSEMBLY_INFO);

   rootTag.addSubtag(ASSEMBLY_ACCESSION).setContent(getAssemblyAccession());
   rootTag.addSubtag(BIOPROJECT_ACCESSION).setContent(getBioprojectAccession());
   rootTag.addSubtag(BIOSAMPLE_ACCESSION).setContent(getBiosampleAccession());
   rootTag.addSubtag(WGS_ACCESSION).setContent(getWGSAccession());
   rootTag.addSubtag(REFSEQ_CATEGORY).setContent(getRefSeqCategory().name());
   rootTag.addSubtag(ORGANISM_TAXON).setContent(getOrganismTaxon().getTaxonId());
   rootTag.addSubtag(SPECIES_TAXON).setContent(getSpeciesTaxon().getTaxonId());

   if (getInfraspecificName() != null)
   {
      rootTag.addSubtag(INFRASPECIFIC_NAME).setContent(getInfraspecificName());
   }

   if (getIsolate() != null)
   {
      rootTag.addSubtag(ISOLATE).setContent(getIsolate());
   }

   rootTag.addSubtag(ASSEMBLY_VERSION_STATUS).setContent(getAssemblyVersionStatus().name());
   rootTag.addSubtag(ASSEMBLY_LEVEL).setContent(getAssemblyLevel().name());
   rootTag.addSubtag(ASSEMBLY_RELEASE_TYPE).setContent(getAssemblyReleaseType().name());
   rootTag.addSubtag(GENOMIC_REPRESENTATION).setContent(getGenomicRepresentation().name());
   rootTag.addSubtag(RELEASE_DATE).setContent(mDateFormat.format(getReleaseDate()));
   rootTag.addSubtag(ASSEMBLY_NAME).setContent(getAssemblyName());
   rootTag.addSubtag(SUBMITTER).setContent(getSubmitter());

   if (getGenBankRefSeqPairedAssembly() != null)
   {
      rootTag.addSubtag(GENBANK_REFSEQ_PAIRED_ASSEMBLY).setContent(getGenBankRefSeqPairedAssembly());
   }

   if (getPairedAssemblyComparison() != null)
   {
      rootTag.addSubtag(GENOMIC_PAIRED_ASSEMBLY_COMPARISON).setContent(getPairedAssemblyComparison().name());
   }

   rootTag.addSubtag(FTP_PATH).setContent(getFTP_Path());
   rootTag.addSubtag(EXCLUDED_FROM_REFSEQ).setContent(getExcludedFromRefSeq() ? "true" : "false");

   if (getRelationToTypeMaterial() != null)
   {
      rootTag.addSubtag(GENOMIC_RELATION_TO_TYPE_MATERIAL).setContent(getRelationToTypeMaterial().name());
   }

   // The XML constructor reads this optional tag, so write it to keep the round trip lossless.
   if (getAssemblyNotLiveDate() != null)
   {
      rootTag.addSubtag(ASM_NOT_LIVE_DATE).setContent(mDateFormat.format(getAssemblyNotLiveDate()));
   }

   if (getAssemblyType() != null)
   {
      rootTag.addSubtag(ASSEMBLY_TYPE).setContent(getAssemblyType());
   }

   if (getTaxonomyGroup() != null)
   {
      rootTag.addSubtag(GROUP).setContent(getTaxonomyGroup());
   }

   if (getGenomeSize() != null)
   {
      rootTag.addSubtag(GENOME_SIZE).setContent(getGenomeSize() + "");
   }

   if (getUngappedGenomeSize() != null)
   {
      rootTag.addSubtag(GENOME_SIZE_UNGAPPED).setContent(getUngappedGenomeSize() + "");
   }

   if (getGC_Percent() != null)
   {
      rootTag.addSubtag(GC_PERCENT).setContent(getGC_Percent() + "");
   }

   if (getRepliconCount() != null)
   {
      rootTag.addSubtag(REPLICON_COUNT).setContent(getRepliconCount() + "");
   }

   if (getScaffoldCount() != null)
   {
      rootTag.addSubtag(SCAFFOLD_COUNT).setContent(getScaffoldCount() + "");
   }

   if (getContigCount() != null)
   {
      rootTag.addSubtag(CONTIG_COUNT).setContent(getContigCount() + "");
   }

   if (getAnnotationProvider() != null)
   {
      rootTag.addSubtag(ANNOTATION_PROVIDER).setContent(getAnnotationProvider());
   }

   if (getAnnotationName() != null)
   {
      rootTag.addSubtag(ANNOTATION_NAME).setContent(getAnnotationName());
   }

   if (getAnnotationDate() != null)
   {
      rootTag.addSubtag(ANNOTATION_DATE).setContent(mDateFormat.format(getAnnotationDate()));
   }

   if (getTotalGeneCount() != null)
   {
      rootTag.addSubtag(TOTAL_GENE_COUNT).setContent(getTotalGeneCount() + "");
   }

   if (getProteinCodingGeneCount() != null)
   {
      rootTag.addSubtag(PROTEIN_CODING_GENE_COUNT).setContent(getProteinCodingGeneCount() + "");
   }

   if (getNonCodingGeneCount() != null)
   {
      rootTag.addSubtag(NON_CODING_GENE_COUNT).setContent(getNonCodingGeneCount() + "");
   }

   if (getPubMedId() != null)
   {
      rootTag.addSubtag(PUBMED_ID).setContent(getPubMedId());
   }

   return rootTag;
}

//---------------------------------------------------------------------------
/**
 * Generic field retrieval to future-proof against future changes by the NCBI.
 * @param inName the column name
 * @return the String value from the backing DataRow or null if there is no backing row
 */
public String getField(String inName)
{
   return mDataRow != null ? mDataRow.getString(mDataRow.getDataColumn(inName)) : null;
}

//---------------------------------------------------------------------------
/** Sets the assembly accession. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setAssemblyAccession(String inValue)
{
   mAssemblyAccession = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the assembly accession */
public String getAssemblyAccession()
{
   return mAssemblyAccession;
}

//---------------------------------------------------------------------------
/** Sets the bioproject accession. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setBioprojectAccession(String inValue)
{
   mBioprojectAccession = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the bioproject accession */
public String getBioprojectAccession()
{
   return mBioprojectAccession;
}

//---------------------------------------------------------------------------
/** Sets the biosample accession. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setBiosampleAccession(String inValue)
{
   mBiosampleAccession = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the biosample accession */
public String getBiosampleAccession()
{
   return mBiosampleAccession;
}

//---------------------------------------------------------------------------
/** Sets the WGS master accession. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setWGSAccession(String inValue)
{
   mWGSAccession = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the WGS master accession */
public String getWGSAccession()
{
   return mWGSAccession;
}

//---------------------------------------------------------------------------
/** Sets the RefSeq category. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setRefSeqCategory(GenomicRefSeqCategory inValue)
{
   mRefseqCategory = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the RefSeq category */
public GenomicRefSeqCategory getRefSeqCategory()
{
   return mRefseqCategory;
}

//---------------------------------------------------------------------------
/** Sets the organism taxon (the "taxid" value). @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setOrganismTaxon(NCBITaxon inValue)
{
   mOrganismNCBITaxon = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the organism taxon (the "taxid" value) */
public NCBITaxon getOrganismTaxon()
{
   return mOrganismNCBITaxon;
}

//---------------------------------------------------------------------------
/** Sets the species taxon (the "species_taxid" value). @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setSpeciesTaxon(NCBITaxon inValue)
{
   mSpeciesNCBITaxon = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the species taxon (the "species_taxid" value) */
public NCBITaxon getSpeciesTaxon()
{
   return mSpeciesNCBITaxon;
}

//---------------------------------------------------------------------------
/** Sets the infraspecific name. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setInfraspecificName(String inValue)
{
   mInfraspecificName = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the infraspecific name */
public String getInfraspecificName()
{
   return mInfraspecificName;
}

//---------------------------------------------------------------------------
/** Sets the isolate. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setIsolate(String inValue)
{
   mIsolate = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the isolate */
public String getIsolate()
{
   return mIsolate;
}

//---------------------------------------------------------------------------
/** Sets the assembly version status. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setAssemblyVersionStatus(GenomicAssemblyVersionStatus inValue)
{
   mAssemblyVersionStatus = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the assembly version status */
public GenomicAssemblyVersionStatus getAssemblyVersionStatus()
{
   return mAssemblyVersionStatus;
}

//---------------------------------------------------------------------------
/** Sets the assembly level. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setAssemblyLevel(GenomicAssemblyLevel inValue)
{
   mAssemblyLevel = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the assembly level */
public GenomicAssemblyLevel getAssemblyLevel()
{
   return mAssemblyLevel;
}

//---------------------------------------------------------------------------
/** Sets the assembly release type. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setAssemblyReleaseType(GenomicAssemblyReleaseType inValue)
{
   mAssemblyReleaseType = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the assembly release type */
public GenomicAssemblyReleaseType getAssemblyReleaseType()
{
   return mAssemblyReleaseType;
}

//---------------------------------------------------------------------------
/** Sets the genomic representation. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setGenomicRepresentation(GenomicRepresentation inValue)
{
   mGenomicRepresentation = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the genomic representation */
public GenomicRepresentation getGenomicRepresentation()
{
   return mGenomicRepresentation;
}

//---------------------------------------------------------------------------
/** Sets the sequence release date. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setReleaseDate(Date inValue)
{
   mReleaseDate = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the sequence release date */
public Date getReleaseDate()
{
   return mReleaseDate;
}

//---------------------------------------------------------------------------
/** Sets the assembly name. @return this object to allow method chaining */
public NCBIGenomicAssemblyInfo setAssemblyName(String inValue)
{
   mAssemblyName = inValue;
   return this;
}

//---------------------------------------------------------------------------
/** @return the assembly name */
public String getAssemblyName()
{
   return mAssemblyName;
}
//--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setSubmitter(String inValue) { mSubmitter = inValue; return this; } //--------------------------------------------------------------------------- public String getSubmitter() { return mSubmitter; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setGenBankRefSeqPairedAssembly(String inValue) { mGenBankRefSeqPairedAssembly = inValue; return this; } //--------------------------------------------------------------------------- public String getGenBankRefSeqPairedAssembly() { return mGenBankRefSeqPairedAssembly; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setPairedAssemblyComparison(GenomicPairedAssemblyComparison inValue) { mGenomicPairedAssemblyComparison = inValue; return this; } //--------------------------------------------------------------------------- public GenomicPairedAssemblyComparison getPairedAssemblyComparison() { return mGenomicPairedAssemblyComparison; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setFTP_Path(String inValue) { mFTP_Path = inValue; return this; } //--------------------------------------------------------------------------- public String getFTP_Path() { return mFTP_Path; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setExcludedFromRefSeq(boolean inValue) { mExcludedFromRefSeq = inValue; return this; } //--------------------------------------------------------------------------- public boolean getExcludedFromRefSeq() { return mExcludedFromRefSeq; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setRelationToTypeMaterial(GenomicRelationToTypeMaterial inValue) { mGenomicRelationToTypeMaterial = inValue; return this; } 
//--------------------------------------------------------------------------- public GenomicRelationToTypeMaterial getRelationToTypeMaterial() { return mGenomicRelationToTypeMaterial; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setAssemblyNotLiveDate(Date inValue) { mAssemblyNotLiveDate = inValue; return this; } //--------------------------------------------------------------------------- public Date getAssemblyNotLiveDate() { return mAssemblyNotLiveDate; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setAssemblyType(String inValue) { mAssemblyType = inValue; return this; } //--------------------------------------------------------------------------- public String getAssemblyType() { return mAssemblyType; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setTaxonomyGroup(String inValue) { mTaxonomyGroup = inValue; return this; } //--------------------------------------------------------------------------- public String getTaxonomyGroup() { return mTaxonomyGroup; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setGenomeSize(Long inValue) { mGenomeSize = inValue; return this; } //--------------------------------------------------------------------------- public Long getGenomeSize() { return mGenomeSize; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setUngappedGenomeSize(Long inValue) { mUngappedGenomeSize = inValue; return this; } //--------------------------------------------------------------------------- public Long getUngappedGenomeSize() { return mUngappedGenomeSize; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setGC_Percent(Float inValue) { mGCPercent = inValue; return this; } 
//--------------------------------------------------------------------------- public Float getGC_Percent() { return mGCPercent; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setRepliconCount(Integer inValue) { mRepliconCount = inValue; return this; } //--------------------------------------------------------------------------- public Integer getRepliconCount() { return mRepliconCount; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setScaffoldCount(Integer inValue) { mScaffoldCount = inValue; return this; } //--------------------------------------------------------------------------- public Integer getScaffoldCount() { return mScaffoldCount; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setContigCount(Integer inValue) { mContigCount = inValue; return this; } //--------------------------------------------------------------------------- public Integer getContigCount() { return mContigCount; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setAnnotationProvider(String inValue) { mAnnotationProvider = inValue; return this; } //--------------------------------------------------------------------------- public String getAnnotationProvider() { return mAnnotationProvider; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setAnnotationName(String inValue) { mAnnotationName = inValue; return this; } //--------------------------------------------------------------------------- public String getAnnotationName() { return mAnnotationName; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setAnnotationDate(Date inValue) { mAnnotationDate = inValue; return this; } 
//--------------------------------------------------------------------------- public Date getAnnotationDate() { return mAnnotationDate; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setTotalGeneCount(Integer inValue) { mTotalGeneCount = inValue; return this; } //--------------------------------------------------------------------------- public Integer getTotalGeneCount() { return mTotalGeneCount; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setProteinCodingGeneCount(Integer inValue) { mProteinCodingGeneCount = inValue; return this; } //--------------------------------------------------------------------------- public Integer getProteinCodingGeneCount() { return mProteinCodingGeneCount; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setNonCodingGeneCount(Integer inValue) { mNonCodingGeneCount = inValue; return this; } //--------------------------------------------------------------------------- public Integer getNonCodingGeneCount() { return mNonCodingGeneCount; } //--------------------------------------------------------------------------- public NCBIGenomicAssemblyInfo setPubMedId(String inValue) { mPubMedId = inValue; return this; } //--------------------------------------------------------------------------- public String getPubMedId() { return mPubMedId; } //--------------------------------------------------------------------------- /** * Compares this object with the specified object for order. Returns a * negative integer, zero, or a positive integer as this object is less * than, equal to, or greater than the specified object. * *

The implementor must ensure * {@code sgn(x.compareTo(y)) == -sgn(y.compareTo(x))} * for all {@code x} and {@code y}. (This * implies that {@code x.compareTo(y)} must throw an exception iff * {@code y.compareTo(x)} throws an exception.) * *

The implementor must also ensure that the relation is transitive: * {@code (x.compareTo(y) > 0 && y.compareTo(z) > 0)} implies * {@code x.compareTo(z) > 0}. * *

Finally, the implementor must ensure that {@code x.compareTo(y)==0} * implies that {@code sgn(x.compareTo(z)) == sgn(y.compareTo(z))}, for * all {@code z}. * *

It is strongly recommended, but not strictly required that * {@code (x.compareTo(y)==0) == (x.equals(y))}. Generally speaking, any * class that implements the {@code Comparable} interface and violates * this condition should clearly indicate this fact. The recommended * language is "Note: this class has a natural ordering that is * inconsistent with equals." * *

In the foregoing description, the notation * {@code sgn(}expression{@code )} designates the mathematical * signum function, which is defined to return one of {@code -1}, * {@code 0}, or {@code 1} according to whether the value of * expression is negative, zero, or positive, respectively. * * @param inObj2 the object to be compared. * @return a negative integer, zero, or a positive integer as this object * is less than, equal to, or greater than the specified object. * @throws ClassCastException if the specified object's type prevents it * from being compared to this object. */ @Override public int compareTo(NCBIGenomicAssemblyInfo inObj2) { int result = -1; if (inObj2 != null) { result = CompareUtil.compare(getSpeciesTaxon(), inObj2.getSpeciesTaxon()); if (0 == result) { result = CompareUtil.compare(getOrganismTaxon(), inObj2.getOrganismTaxon()); if (0 == result) { result = CompareUtil.compare(getRefSeqCategory(), inObj2.getRefSeqCategory()); if (0 == result) { result = CompareUtil.compare(getGenomicRepresentation(), inObj2.getGenomicRepresentation()); if (0 == result) { result = CompareUtil.compare(getAssemblyReleaseType(), inObj2.getAssemblyReleaseType()); if (0 == result) { result = CompareUtil.compare(getAssemblyVersionStatus(), inObj2.getAssemblyVersionStatus()); } } } } } } return result; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy