All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.format.EMBL Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.format;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.DbXref;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.bio.seq.Clone;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.bio.seq.SeqTopology;
import com.hfg.bio.seq.format.feature.FeatureQualifier;
import com.hfg.bio.seq.format.feature.SeqFeature;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeature;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureKey;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureLocation;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifier;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifierName;
import com.hfg.bio.seq.format.feature.qualifier.MolType;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeature;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureKey;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureLocation;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifier;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifierName;
import com.hfg.bio.taxonomy.EMBL_TaxonDivision;
import com.hfg.bio.taxonomy.NCBITaxon;
import com.hfg.citation.Author;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.io.LettersOnlyReader;

//------------------------------------------------------------------------------
/**
 EMBL sequence format.
 
See ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
See http://web.expasy.org/docs/userman.html for info on the Uniprot format variant.
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ /* Example record from ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt : ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. XX AC X56734; S46826; XX DT 12-SEP-1991 (Rel. 29, Created) DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) XX DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase XX KW beta-glucosidase. XX OS Trifolium repens (white clover) OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. XX RN [5] RP 1-1859 RX DOI; 10.1007/BF00039495. RX PUBMED; 1907511. RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; RT "Nucleotide and derived amino acid sequence of the cyanogenic RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; RL Plant Mol. Biol. 17(2):209-219(1991). XX RN [6] RP 1-1859 RA Hughes M.A.; RT ; RL Submitted (19-NOV-1990) to the INSDC. RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle RL Upon Tyne, NE2 4HH, UK XX DR EuropePMC; PMC99098; 11752244. XX FH Key Location/Qualifiers FH FT source 1..1859 FT /organism="Trifolium repens" FT /mol_type="mRNA" FT /clone_lib="lambda gt10" FT /clone="TRE361" FT /tissue_type="leaves" FT /db_xref="taxon:3899" FT mRNA 1..1859 FT /experiment="experimental evidence, no additional details FT recorded" FT CDS 14..1495 FT /product="beta-glucosidase" FT /EC_number="3.2.1.21" FT /note="non-cyanogenic" FT /db_xref="GOA:P26204" FT /db_xref="InterPro:IPR001360" FT /db_xref="InterPro:IPR013781" FT /db_xref="InterPro:IPR017853" FT /db_xref="InterPro:IPR018120" FT /db_xref="UniProtKB/Swiss-Prot:P26204" FT /protein_id="CAA40058.1" FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" XX SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 // Figure 1 - A sample entry from the database */ public class EMBL extends ReadableSeqFormatBase { // Variables used during parsing private T mCurrentSeq; private FormatVariant mFormatVariant; private String mCurrentLineCode; private GenBankFeature mCurrentGenBankFeature; private GenBankFeatureQualifier mCurrentGenBankFeatureQualifier; private UniProtFeature mCurrentUniProtFeature; private UniProtFeatureQualifier mCurrentUniProtFeatureQualifier; private SeqCitation mCurrentCitation; private Integer mSeqLengthFromIdLine; private String mCurrentKeywords; // Valid line codes public static final String AC_LINE_CODE = "AC"; // AC - accession number (>=1 per entry) public static final String AH_LINE_CODE = "AH"; // AH - assembly header (0 or 1 per entry) public static final String AS_LINE_CODE = "AS"; // AS - assembly information (0 or >=1 per entry) public static final String CC_LINE_CODE = "CC"; // CC - comments or notes (>=0 per entry) public static final String CO_LINE_CODE = "CO"; // CO - contig/construct line (0 or >=1 per entry) public static final String DE_LINE_CODE = "DE"; // DE - description (>=1 per entry) public static final String DR_LINE_CODE = "DR"; // DR - database cross-reference (>=0 per entry) public static final String DT_LINE_CODE = "DT"; // DT - date (2 per entry) public static final String FH_LINE_CODE = "FH"; // FH - feature table header (2 per entry) public static final String FT_LINE_CODE = "FT"; // FT - feature table data (>=2 per entry) public static final String ID_LINE_CODE = "ID"; // ID - identification (begins each entry; 1 per entry) public static final String KW_LINE_CODE = "KW"; // KW - keyword (>=1 per entry) public static final String OC_LINE_CODE = "OC"; // OC - organism classification (>=1 per entry) public static final String OG_LINE_CODE = "OG"; // OG - organelle (0 or 1 per entry) public static final String OS_LINE_CODE = "OS"; // OS - organism species (>=1 per entry) public static final String PR_LINE_CODE = "PR"; // PR - project identifier (0 or 1 per entry) public static final String RA_LINE_CODE = "RA"; // RA - reference author(s) (>=0 per entry) public static final String RC_LINE_CODE = "RC"; // RC - reference comment (>=0 per entry) public static final String RG_LINE_CODE = "RG"; // RG - reference group (>=0 per entry) public static final String RL_LINE_CODE = "RL"; // RL - reference location (>=1 per entry) public static final String RN_LINE_CODE = "RN"; // RN - reference number (>=1 per entry) public static final String RP_LINE_CODE = "RP"; // RP - reference positions (>=1 per entry) public static final String RT_LINE_CODE = "RT"; // RT - reference title (>=1 per entry) public static final String RX_LINE_CODE = "RX"; // RX - reference cross-reference (>=0 per entry) public static final String SQ_LINE_CODE = "SQ"; // SQ - sequence header (1 per entry) // EMBL-specific line codes public static final String XX_LINE_CODE = "XX"; // XX - spacer line (many per entry) // UniProt-specific line codes public static final String GN_LINE_CODE = "GN"; // public static final String OH_LINE_CODE = "OH"; // Organism host taxonomy cross-reference public static final String OX_LINE_CODE = "OX"; // Organism taxonomy cross-reference public static final String PE_LINE_CODE = "PE"; // // Attributes populated into the sequence object public static final String CLONE_ATTR = "Clone"; public static final String EMBL_DATA_CLASS_ATTR = "EMBL Data Class"; public static final String SPECIES_SCIENTIFIC_NAME_ATTR = "Species Scientific Name"; public static final String ORGANISM_CLASSIFICATION_ATTR = "Organism Classification"; public static final String ORGANISM_NCBI_TAXON_ID_ATTR = "Organism NCBI Taxon ID"; enum FormatVariant { EMBL, UniProt } private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName()); static { LOGGER.setLevel(Level.WARNING); LOGGER.setUseParentHandlers(true); } //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- public EMBL(BioSequenceFactory inSeqFactory) { super(inSeqFactory); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public static Logger getLogger() { return LOGGER; } //--------------------------------------------------------------------------- public boolean isEndOfRecord(String inLine) { return inLine.trim().equals("//"); } //--------------------------------------------------------------------------- public boolean hasJanusDelimiter() { return false; } //--------------------------------------------------------------------------- public T readRecord(BufferedReader inReader) throws SeqIOException { initRecordParsing(); boolean idLineFound = false; try { mCurrentSeq = getBioSequenceFactory().createSeqObj(); String line; while ((line = inReader.readLine()) != null) { if (! idLineFound) { String lineCode = line.substring(0, 2); if (ID_LINE_CODE.equals(lineCode)) { idLineFound = true; } else { continue; } } else if (isEndOfRecord(line)) { break; } if (idLineFound && StringUtil.isSet(line)) { parseLine(line); if (SQ_LINE_CODE.equals(mCurrentLineCode)) { break; } } } // The rest of the record is assumed to be sequence // Cleanup the sequence to remove spaces and numbers Reader filterReader = new SeqFilterReader(inReader); mCurrentSeq.setSequence(filterReader); filterReader.close(); // Cleanup if (mCurrentGenBankFeatureQualifier != null) { // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES if (mCurrentGenBankFeatureQualifier.getValue().startsWith("\"")) { mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue())); } mCurrentGenBankFeatureQualifier = null; if (mCurrentSeq instanceof BioSequencePlus) { List sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source); if (CollectionUtil.hasValues(sourceFeatures)) { SeqFeature source = sourceFeatures.get(0); List cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name()); if (CollectionUtil.hasValues(cloneQualifiers)) { Clone clone = new Clone(cloneQualifiers.get(0).getValue()); List subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name()); if (CollectionUtil.hasValues(subcloneQualifiers)) { clone.setSubcloneName(subcloneQualifiers.get(0).getValue()); } mCurrentSeq.setAttribute(CLONE_ATTR, clone); } } } } // TODO: Clean the '.' off the end of feature descriptions } catch (Exception e) { throw new SeqIOException("Problem parsing EMBL record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e); } if (!idLineFound) { throw new SeqIOException("No " + ID_LINE_CODE + " line detected in the EMBL record!"); } return mCurrentSeq; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private void initRecordParsing() { mCurrentSeq = null; mCurrentLineCode = null; mCurrentUniProtFeature = null; mCurrentCitation = null; mCurrentKeywords = ""; } //--------------------------------------------------------------------------- private void parseLine(String inLine) { mCurrentLineCode = inLine.substring(0, 2); switch (mCurrentLineCode) { case XX_LINE_CODE: // Blank line break; case CC_LINE_CODE: // Comment line break; case ID_LINE_CODE: parseID(inLine); break; case AC_LINE_CODE: // Accessions parseAC(inLine); break; case DT_LINE_CODE: // Date //TODO: parseDT(inLine); break; case DE_LINE_CODE: // Description parseDE(inLine); break; case KW_LINE_CODE: // Keywords parseKW(inLine); break; case OS_LINE_CODE: // Organism species parseOS(inLine); break; case OC_LINE_CODE: // Organism classification parseOC(inLine); break; case RN_LINE_CODE: // Reference number (start of a new reference) mCurrentCitation = new SeqCitation(); if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addReference(mCurrentCitation); } break; case RA_LINE_CODE: // Reference author(s) parseRA(inLine); break; case RT_LINE_CODE: // Reference title parseRT(inLine); break; case RX_LINE_CODE: // Reference cross-reference parseRX(inLine); break; case RP_LINE_CODE: // Reference positions parseRP(inLine); break; case RL_LINE_CODE: // Reference location parseRL(inLine); break; case RC_LINE_CODE: // Reference comment //TODO break; case DR_LINE_CODE: // Database cross-reference parseDR(inLine); break; case AH_LINE_CODE: case AS_LINE_CODE: // Assembly info // TODO break; case FH_LINE_CODE: // Feature table header. Ignore break; case FT_LINE_CODE: // Features parseFT(inLine); break; case SQ_LINE_CODE: // Sequence data // TODO break; case GN_LINE_CODE: if (mFormatVariant != FormatVariant.UniProt) { throw new SeqIOException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!"); } break; case OX_LINE_CODE: if (mFormatVariant != FormatVariant.UniProt) { throw new SeqIOException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!"); } parseOX(inLine); break; case PE_LINE_CODE: if (mFormatVariant != FormatVariant.UniProt) { throw new SeqIOException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!"); } break; default: throw new SeqIOException("Unrecognized line code: " + StringUtil.quote(mCurrentLineCode) + "!"); } /* GenBankKeyword keyword = getLineKeyword(inLine); if (keyword != null) { // Found the start of a new keyword field finishPreviousKeyword(); mCurrentLineCode = keyword; mCurrentSubkeyword = null; parseField(inLine); } else { // Continuation of an existing field if (GenBankKeyword.FEATURES.equals(mCurrentLineCode)) { // Features have a special set of feature keys parseFeatures(inLine); } else { mCurrentSubkeyword = getLineSubkeyword(inLine); if (mCurrentSubkeyword != null) { // Start of a new subfield } else { // Continuation of an existing subfield } parseField(inLine); } } */ } //--------------------------------------------------------------------------- // Parse the ID line // Ex #1: ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. // Ex #2: ID FMT_ANADE Reviewed; 312 AA. // // The ID (IDentification) line is always the first line of an entry. The // format of the ID line is: // ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. // The tokens represent: // 1. Primary accession number // 2. Sequence version number // 3. Topology: 'circular' or 'linear' // 4. Molecule type (see note 1 below) // 5. Data class (see section 3.1) // 6. Taxonomic division (see section 3.2) // 7. Sequence length (see note 2 below) private static final Pattern sEMBL_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+);\\s+SV\\s+(\\w+);\\s+(\\w+);\\s+(.+?);\\s+(\\w+);\\s+(\\w+);\\s+(\\w+) BP."); private static final Pattern sUniProt_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+)\\s+(?:Reviewed|Unreviewd);\\s+(\\w+) AA."); private static final Pattern sSeqLengthPattern = Pattern.compile("(\\w+) (AA|BP)."); private void parseID(String inLine) { Matcher m = sEMBL_ID_Pattern1.matcher(inLine); if (m.matches()) { mFormatVariant = FormatVariant.EMBL; if (StringUtil.isSet(m.group(1))) { mCurrentSeq.setID(m.group(1) + '.' + m.group(2)); } if (mCurrentSeq instanceof BioSequencePlus) { if (m.group(3) != null) { ((BioSequencePlus) mCurrentSeq).setSeqTopology(SeqTopology.valueOf(m.group(3))); } if (m.group(4) != null) { ((BioSequencePlus) mCurrentSeq).setMolType(MolType.retrieveOrCreateValueOf(m.group(4))); } mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, EMBL_DataClass.valueOf(m.group(5))); ((BioSequencePlus) mCurrentSeq).setSeqRepositoryDivision(EMBL_TaxonDivision.valueOf(m.group(6))); } mSeqLengthFromIdLine = Integer.parseInt(m.group(7)); } else { m = sUniProt_ID_Pattern1.matcher(inLine); if (m.matches()) { mFormatVariant = FormatVariant.UniProt; } else { // throw new SeqIOException("The " + ID_LINE_CODE + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!"); // Non-standard ID line format. Do the best we can. String[] pieces = inLine.substring(2).split(";"); int pieceIndex = 1; String id = pieces[0].trim().split("\\s+")[0].trim(); if (pieces.length > 1 && pieces[1].trim().startsWith("SV")) { id += "." + pieces[1].trim().substring(2).trim(); pieceIndex++; } mCurrentSeq.setID(id); if (mCurrentSeq instanceof BioSequencePlus) { BioSequencePlus bioSequencePlus = (BioSequencePlus) mCurrentSeq; boolean topologyFound = false; boolean molTypeFound = false; boolean dataClassFound = false; boolean divisionFound = false; for (; pieceIndex < pieces.length; pieceIndex++) { String field = pieces[pieceIndex].trim(); if (! topologyFound) { SeqTopology seqTopology = SeqTopology.valueOf(field); if (seqTopology != null) { bioSequencePlus.setSeqTopology(seqTopology); topologyFound = true; continue; } } if (! molTypeFound) { MolType molType = MolType.valueOf(field); if (molType != null) { bioSequencePlus.setMolType(MolType.valueOf(field)); molTypeFound = true; continue; } } if (! dataClassFound) { EMBL_DataClass dataClass = EMBL_DataClass.valueOf(field); if (dataClass != null) { mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, dataClass); dataClassFound = true; continue; } } if (! divisionFound) { EMBL_TaxonDivision div = EMBL_TaxonDivision.valueOf(field); if (div != null) { bioSequencePlus.setSeqRepositoryDivision(div); divisionFound = true; continue; } } if (pieceIndex == pieces.length - 1) { Matcher seqLengthMatcher = sSeqLengthPattern.matcher(field); if (seqLengthMatcher.matches()) { mSeqLengthFromIdLine = Integer.parseInt(seqLengthMatcher.group(1)); if (seqLengthMatcher.group(2).equalsIgnoreCase("BP")) { mFormatVariant = FormatVariant.EMBL; } else { mFormatVariant = FormatVariant.UniProt; } } } } } } } } //--------------------------------------------------------------------------- private void parseAC(String inLine) { String[] pieces = inLine.substring(2).split(";\\s*"); if (null == mCurrentSeq.getID()) { mCurrentSeq.setID(pieces[0].trim()); } // TODO: handle additional ids } //--------------------------------------------------------------------------- private void parseDE(String inLine) { String description = inLine.substring(2).trim(); if (StringUtil.isSet(mCurrentSeq.getDescription())) { description = mCurrentSeq.getDescription() + " " + description; } mCurrentSeq.setDescription(description); } //--------------------------------------------------------------------------- // Parse keywords private void parseKW(String inLine) { String keywordString = inLine.substring(2).trim(); if (mCurrentSeq instanceof BioSequencePlus) { mCurrentKeywords += keywordString; if (mCurrentKeywords.endsWith(".")) { String[] keywords = mCurrentKeywords.split("[;\\.]"); for (String keyword : keywords) { if (StringUtil.isSet(keyword)) { ((BioSequencePlus) mCurrentSeq).addKeyword(keyword.trim()); } } } } } //--------------------------------------------------------------------------- // Parse the reference authors private void parseRA(String inLine) { String[] authors = null; if (mFormatVariant.equals(FormatVariant.EMBL)) { authors = inLine.substring(2).split(";\\s*"); } else if (mFormatVariant.equals(FormatVariant.UniProt)) { authors = inLine.substring(2).split(",\\s*"); } if (authors != null) { for (String author : authors) { if (StringUtil.isSet(author)) { mCurrentCitation.addAuthor(new Author(author.trim())); } } } } //--------------------------------------------------------------------------- // Parse the reference title private void parseRT(String inLine) { String title = inLine.substring(2).trim(); if (title.endsWith(";")) { title = title.substring(0, title.length() - 1); } if (StringUtil.isSet(title)) { if (mCurrentCitation.getTitle() != null) { title = mCurrentCitation.getTitle() + " " + title; } if (StringUtil.isQuoted(title)) { title = StringUtil.unquote(title); } mCurrentCitation.setTitle(title); } } //--------------------------------------------------------------------------- private void parseRX(String inLine) { String xref = inLine.substring(2).trim(); if (xref.endsWith(".")) { xref = xref.substring(0, xref.length() - 1); } String[] pieces = xref.split(";\\s*"); if (pieces[0].equals("PUBMED")) { mCurrentCitation.setPubMedId(pieces[1]); } // TODO: handle other x-refs } //--------------------------------------------------------------------------- // Parse reference position value private void parseRP(String inLine) { String positionString = inLine.substring(2).trim(); if (StringUtil.isSet(positionString)) { String[] positionStrings = positionString.split("\\s*-\\s*"); if (2 == positionStrings.length) { mCurrentCitation.setSeqLocation(new SeqLocation().setStart(Integer.parseInt(positionStrings[0].trim())).setEnd(Integer.parseInt(positionStrings[1].trim()))); } } } //--------------------------------------------------------------------------- // Parse reference location private void parseRL(String inLine) { String stringValue = inLine.substring(2).trim(); if (StringUtil.isSet(stringValue)) { mCurrentCitation.appendRawContent(stringValue); } } //--------------------------------------------------------------------------- private void parseOS(String inLine) { String text = inLine.substring(2).trim(); if (null == mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR)) { mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, text); } else { mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR) + " " + text); } } //--------------------------------------------------------------------------- private void parseOC(String inLine) { String text = inLine.substring(2).trim(); if (null == mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR)) { mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, text); } else { mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR) + " " + text); } } //--------------------------------------------------------------------------- // Parse organism taxonomy cross-reference // OX Taxonomy_database_Qualifier=Taxonomic code; // Example: // OX NCBI_TaxID=9606; private void parseOX(String inLine) { String[] pieces = inLine.substring(2).trim().split("="); // Remove trailing ';' if (pieces[1].endsWith(";")) { pieces[1] = pieces[1].substring(0, pieces[1].length() - 1); } mCurrentSeq.setAttribute(ORGANISM_NCBI_TAXON_ID_ATTR, Integer.parseInt(pieces[1])); if (mCurrentSeq instanceof BioSequencePlus) { NCBITaxon taxon = NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1])); if (taxon != null) { ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon); } } } //--------------------------------------------------------------------------- // Parse database cross-reference // The format of the DR line is: // DR RESOURCE_ABBREVIATION; RESOURCE_IDENTIFIER; OPTIONAL_INFORMATION_1[; OPTIONAL_INFORMATION_2][; OPTIONAL_INFORMATION_3]. // Example: // DR EMBL; U29082; AAA68403.1; -; Genomic_DNA. private void parseDR(String inLine) { String[] pieces = inLine.substring(2).trim().split(";\\s*"); // Remove trailing '.' from the last piece if (pieces[pieces.length - 1].endsWith(".")) { pieces[pieces.length - 1] = pieces[pieces.length - 1].substring(0, pieces[pieces.length - 1].length() - 1); } DbXref xref = new DbXref(pieces[0], pieces[1]); if (pieces.length > 2) { StringBuilderPlus description = new StringBuilderPlus().setDelimiter("; "); for (int i = 2; i < pieces.length; i++) { description.delimitedAppend(pieces[i]); } xref.setDescription(description.toString()); } if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addDbXref(xref); } } //--------------------------------------------------------------------------- // Parse feature private void parseFT(String inLine) { if (mFormatVariant.equals(FormatVariant.EMBL)) { pareGenBankFeatureTableLine(inLine); } else if (mFormatVariant.equals(FormatVariant.UniProt)) { pareUniProtFeatureTableLine(inLine); } } private static final Pattern sGenBankFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?"); //--------------------------------------------------------------------------- private void pareGenBankFeatureTableLine(String inLine) { // Is there a feature key on this line? String featureKeyString = inLine.substring(5, 20).trim(); if (StringUtil.isSet(featureKeyString)) { GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString); if (null == featureKey) { throw new SeqIOException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!"); } String locationString = inLine.substring(21).trim(); mCurrentGenBankFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString)); if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentGenBankFeature); } // Unquote the previous qualifier if necessary if (mCurrentGenBankFeatureQualifier != null && mCurrentGenBankFeatureQualifier.getValue().startsWith("\"")) { mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue())); } mCurrentGenBankFeatureQualifier = null; } else { String content = inLine.substring(21).trim(); Matcher m = sGenBankFeatureQualifierPattern.matcher(content); if (m.matches()) { // New qualifier // Unquote the previous qualifier if necessary if (mCurrentGenBankFeatureQualifier != null && mCurrentGenBankFeatureQualifier.getValue().startsWith("\"")) { mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue())); } GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1)); if (null == qualifierName) { throw new SeqIOException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!"); } mCurrentGenBankFeatureQualifier = new GenBankFeatureQualifier(qualifierName); mCurrentGenBankFeature.addQualifier(mCurrentGenBankFeatureQualifier); String value = m.group(2); if (value != null) { mCurrentGenBankFeatureQualifier.appendToValue(value); } } else if (mCurrentGenBankFeatureQualifier != null) { // Continuation of a previous qualifier mCurrentGenBankFeatureQualifier.appendToValue(content); } else if (mCurrentGenBankFeature != null) { // Continuation of a feature location mCurrentGenBankFeature.getLocation().append(content); } } } // Example UniProt FT entry: // FT CHAIN 1 312 Methionyl-tRNA formyltransferase. // FT /FTId=PRO_1000077286. private static final Pattern sUniProt_FT_FirstLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{1,5}(\\w+)\\s{1,10}([\\?\\<]?\\d*)\\s{1,10}([\\?\\>]?\\d*)\\s+(.+)"); private static final Pattern sUniProt_FT_AdditionalLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{20,}(.+)"); //--------------------------------------------------------------------------- private void pareUniProtFeatureTableLine(String inLine) { Matcher m = sUniProt_FT_FirstLinePattern.matcher(inLine); if (m.matches()) { UniProtFeatureKey featureKey = UniProtFeatureKey.valueOf(m.group(1)); UniProtFeatureLocation location = new UniProtFeatureLocation(m.group(2), m.group(3)); mCurrentUniProtFeature = new UniProtFeature(featureKey, location).setDescription(m.group(4)); if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentUniProtFeature); } } else { m = sUniProt_FT_AdditionalLinePattern.matcher(inLine); if (m.matches()) { if (m.group(1).startsWith("/")) { String[] pieces = m.group(1).substring(1).split("="); if (2 != pieces.length) { throw new SeqIOException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!"); } UniProtFeatureQualifierName qualifierName = UniProtFeatureQualifierName.valueOf(pieces[0]); if (null == qualifierName) { throw new SeqIOException(StringUtil.singleQuote(pieces[0]) + " is not a recognized qualifier name!"); } mCurrentUniProtFeatureQualifier = new UniProtFeatureQualifier(qualifierName); mCurrentUniProtFeature.addQualifier(mCurrentUniProtFeatureQualifier); // Trim trailing period if (pieces[1].endsWith(".")) { pieces[1] = pieces[1].substring(0, pieces[1].length() - 1); } mCurrentUniProtFeatureQualifier.appendToValue(pieces[1]); } else { mCurrentUniProtFeature.appendDescription(m.group(1)); } } } } //########################################################################### // INNER CLASS //########################################################################### class SeqFilterReader extends LettersOnlyReader { //--------------------------------------------------------------------------- public SeqFilterReader(Reader inReader) { super(inReader); } //--------------------------------------------------------------------------- @Override public int read() throws IOException { int returnChar; do { returnChar = innerRead(); } while (returnChar >= 0 && (Character.isWhitespace(returnChar) || Character.isDigit(returnChar) || returnChar == '/')); return returnChar; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy