com.hfg.bio.seq.format.EMBL Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.DbXref;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.bio.seq.Clone;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.bio.seq.SeqTopology;
import com.hfg.bio.seq.format.feature.FeatureQualifier;
import com.hfg.bio.seq.format.feature.SeqFeature;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeature;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureKey;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureLocation;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifier;
import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifierName;
import com.hfg.bio.seq.format.feature.qualifier.MolType;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeature;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureKey;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureLocation;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifier;
import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifierName;
import com.hfg.bio.taxonomy.EMBL_TaxonDivision;
import com.hfg.bio.taxonomy.NCBITaxon;
import com.hfg.citation.Author;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.io.LettersOnlyReader;
//------------------------------------------------------------------------------
/**
EMBL sequence format.
See http://web.expasy.org/docs/userman.html for info on the UniProt format variant.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
/*
Example record from ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt :
ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
XX
AC X56734; S46826;
XX
DT 12-SEP-1991 (Rel. 29, Created)
DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
XX
DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
XX
KW beta-glucosidase.
XX
OS Trifolium repens (white clover)
OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
XX
RN [5]
RP 1-1859
RX DOI; 10.1007/BF00039495.
RX PUBMED; 1907511.
RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
RT "Nucleotide and derived amino acid sequence of the cyanogenic
RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
RL Plant Mol. Biol. 17(2):209-219(1991).
XX
RN [6]
RP 1-1859
RA Hughes M.A.;
RT ;
RL Submitted (19-NOV-1990) to the INSDC.
RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
RL Upon Tyne, NE2 4HH, UK
XX
DR EuropePMC; PMC99098; 11752244.
XX
FH Key Location/Qualifiers
FH
FT source 1..1859
FT /organism="Trifolium repens"
FT /mol_type="mRNA"
FT /clone_lib="lambda gt10"
FT /clone="TRE361"
FT /tissue_type="leaves"
FT /db_xref="taxon:3899"
FT mRNA 1..1859
FT /experiment="experimental evidence, no additional details
FT recorded"
FT CDS 14..1495
FT /product="beta-glucosidase"
FT /EC_number="3.2.1.21"
FT /note="non-cyanogenic"
FT /db_xref="GOA:P26204"
FT /db_xref="InterPro:IPR001360"
FT /db_xref="InterPro:IPR013781"
FT /db_xref="InterPro:IPR017853"
FT /db_xref="InterPro:IPR018120"
FT /db_xref="UniProtKB/Swiss-Prot:P26204"
FT /protein_id="CAA40058.1"
FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
XX
SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
//
Figure 1 - A sample entry from the database
*/
public class EMBL extends ReadableSeqFormatBase
{
// Variables used during parsing
private T mCurrentSeq;
private FormatVariant mFormatVariant;
private String mCurrentLineCode;
private GenBankFeature mCurrentGenBankFeature;
private GenBankFeatureQualifier mCurrentGenBankFeatureQualifier;
private UniProtFeature mCurrentUniProtFeature;
private UniProtFeatureQualifier mCurrentUniProtFeatureQualifier;
private SeqCitation mCurrentCitation;
private Integer mSeqLengthFromIdLine;
private String mCurrentKeywords;
// Valid line codes
public static final String AC_LINE_CODE = "AC"; // AC - accession number (>=1 per entry)
public static final String AH_LINE_CODE = "AH"; // AH - assembly header (0 or 1 per entry)
public static final String AS_LINE_CODE = "AS"; // AS - assembly information (0 or >=1 per entry)
public static final String CC_LINE_CODE = "CC"; // CC - comments or notes (>=0 per entry)
public static final String CO_LINE_CODE = "CO"; // CO - contig/construct line (0 or >=1 per entry)
public static final String DE_LINE_CODE = "DE"; // DE - description (>=1 per entry)
public static final String DR_LINE_CODE = "DR"; // DR - database cross-reference (>=0 per entry)
public static final String DT_LINE_CODE = "DT"; // DT - date (2 per entry)
public static final String FH_LINE_CODE = "FH"; // FH - feature table header (2 per entry)
public static final String FT_LINE_CODE = "FT"; // FT - feature table data (>=2 per entry)
public static final String ID_LINE_CODE = "ID"; // ID - identification (begins each entry; 1 per entry)
public static final String KW_LINE_CODE = "KW"; // KW - keyword (>=1 per entry)
public static final String OC_LINE_CODE = "OC"; // OC - organism classification (>=1 per entry)
public static final String OG_LINE_CODE = "OG"; // OG - organelle (0 or 1 per entry)
public static final String OS_LINE_CODE = "OS"; // OS - organism species (>=1 per entry)
public static final String PR_LINE_CODE = "PR"; // PR - project identifier (0 or 1 per entry)
public static final String RA_LINE_CODE = "RA"; // RA - reference author(s) (>=0 per entry)
public static final String RC_LINE_CODE = "RC"; // RC - reference comment (>=0 per entry)
public static final String RG_LINE_CODE = "RG"; // RG - reference group (>=0 per entry)
public static final String RL_LINE_CODE = "RL"; // RL - reference location (>=1 per entry)
public static final String RN_LINE_CODE = "RN"; // RN - reference number (>=1 per entry)
public static final String RP_LINE_CODE = "RP"; // RP - reference positions (>=1 per entry)
public static final String RT_LINE_CODE = "RT"; // RT - reference title (>=1 per entry)
public static final String RX_LINE_CODE = "RX"; // RX - reference cross-reference (>=0 per entry)
public static final String SQ_LINE_CODE = "SQ"; // SQ - sequence header (1 per entry)
// EMBL-specific line codes
public static final String XX_LINE_CODE = "XX"; // XX - spacer line (many per entry)
// UniProt-specific line codes
public static final String GN_LINE_CODE = "GN"; //
public static final String OH_LINE_CODE = "OH"; // Organism host taxonomy cross-reference
public static final String OX_LINE_CODE = "OX"; // Organism taxonomy cross-reference
public static final String PE_LINE_CODE = "PE"; //
// Attributes populated into the sequence object
public static final String CLONE_ATTR = "Clone";
public static final String EMBL_DATA_CLASS_ATTR = "EMBL Data Class";
public static final String SPECIES_SCIENTIFIC_NAME_ATTR = "Species Scientific Name";
public static final String ORGANISM_CLASSIFICATION_ATTR = "Organism Classification";
public static final String ORGANISM_NCBI_TAXON_ID_ATTR = "Organism NCBI Taxon ID";
// The flavors of the flat-file format that this reader distinguishes.
// parseID() picks one per record; parseFT() and the UniProt-only line codes
// consult it to decide how (or whether) a line may be parsed.
enum FormatVariant
{
   // Nucleotide record (ID line ends with a length in "BP")
   EMBL,

   // Protein record (ID line ends with a length in "AA")
   UniProt
}
private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName());
static
{
LOGGER.setLevel(Level.WARNING);
LOGGER.setUseParentHandlers(true);
}
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
public EMBL(BioSequenceFactory inSeqFactory)
{
super(inSeqFactory);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
public static Logger getLogger()
{
return LOGGER;
}
//---------------------------------------------------------------------------
public boolean isEndOfRecord(String inLine)
{
return inLine.trim().equals("//");
}
//---------------------------------------------------------------------------
public boolean hasJanusDelimiter()
{
return false;
}
//---------------------------------------------------------------------------
public T readRecord(BufferedReader inReader)
throws SeqIOException
{
initRecordParsing();
boolean idLineFound = false;
try
{
mCurrentSeq = getBioSequenceFactory().createSeqObj();
String line;
while ((line = inReader.readLine()) != null)
{
if (! idLineFound)
{
String lineCode = line.substring(0, 2);
if (ID_LINE_CODE.equals(lineCode))
{
idLineFound = true;
}
else
{
continue;
}
}
else if (isEndOfRecord(line))
{
break;
}
if (idLineFound
&& StringUtil.isSet(line))
{
parseLine(line);
if (SQ_LINE_CODE.equals(mCurrentLineCode))
{
break;
}
}
}
// The rest of the record is assumed to be sequence
// Cleanup the sequence to remove spaces and numbers
Reader filterReader = new SeqFilterReader(inReader);
mCurrentSeq.setSequence(filterReader);
filterReader.close();
// Cleanup
if (mCurrentGenBankFeatureQualifier != null)
{
// The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES
if (mCurrentGenBankFeatureQualifier.getValue().startsWith("\""))
{
mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue()));
}
mCurrentGenBankFeatureQualifier = null;
if (mCurrentSeq instanceof BioSequencePlus)
{
List sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source);
if (CollectionUtil.hasValues(sourceFeatures))
{
SeqFeature source = sourceFeatures.get(0);
List cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
if (CollectionUtil.hasValues(cloneQualifiers))
{
Clone clone = new Clone(cloneQualifiers.get(0).getValue());
List subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
if (CollectionUtil.hasValues(subcloneQualifiers))
{
clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
}
mCurrentSeq.setAttribute(CLONE_ATTR, clone);
}
}
}
}
// TODO: Clean the '.' off the end of feature descriptions
}
catch (Exception e)
{
throw new SeqIOException("Problem parsing EMBL record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
}
if (!idLineFound)
{
throw new SeqIOException("No " + ID_LINE_CODE + " line detected in the EMBL record!");
}
return mCurrentSeq;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
private void initRecordParsing()
{
mCurrentSeq = null;
mCurrentLineCode = null;
mCurrentUniProtFeature = null;
mCurrentCitation = null;
mCurrentKeywords = "";
}
//---------------------------------------------------------------------------
private void parseLine(String inLine)
{
mCurrentLineCode = inLine.substring(0, 2);
switch (mCurrentLineCode)
{
case XX_LINE_CODE:
// Blank line
break;
case CC_LINE_CODE:
// Comment line
break;
case ID_LINE_CODE:
parseID(inLine);
break;
case AC_LINE_CODE:
// Accessions
parseAC(inLine);
break;
case DT_LINE_CODE:
// Date
//TODO: parseDT(inLine);
break;
case DE_LINE_CODE:
// Description
parseDE(inLine);
break;
case KW_LINE_CODE:
// Keywords
parseKW(inLine);
break;
case OS_LINE_CODE:
// Organism species
parseOS(inLine);
break;
case OC_LINE_CODE:
// Organism classification
parseOC(inLine);
break;
case RN_LINE_CODE:
// Reference number (start of a new reference)
mCurrentCitation = new SeqCitation();
if (mCurrentSeq instanceof BioSequencePlus)
{
((BioSequencePlus) mCurrentSeq).addReference(mCurrentCitation);
}
break;
case RA_LINE_CODE:
// Reference author(s)
parseRA(inLine);
break;
case RT_LINE_CODE:
// Reference title
parseRT(inLine);
break;
case RX_LINE_CODE:
// Reference cross-reference
parseRX(inLine);
break;
case RP_LINE_CODE:
// Reference positions
parseRP(inLine);
break;
case RL_LINE_CODE:
// Reference location
parseRL(inLine);
break;
case RC_LINE_CODE:
// Reference comment
//TODO
break;
case DR_LINE_CODE:
// Database cross-reference
parseDR(inLine);
break;
case AH_LINE_CODE:
case AS_LINE_CODE:
// Assembly info
// TODO
break;
case FH_LINE_CODE:
// Feature table header. Ignore
break;
case FT_LINE_CODE:
// Features
parseFT(inLine);
break;
case SQ_LINE_CODE:
// Sequence data
// TODO
break;
case GN_LINE_CODE:
if (mFormatVariant != FormatVariant.UniProt)
{
throw new SeqIOException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
}
break;
case OX_LINE_CODE:
if (mFormatVariant != FormatVariant.UniProt)
{
throw new SeqIOException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
}
parseOX(inLine);
break;
case PE_LINE_CODE:
if (mFormatVariant != FormatVariant.UniProt)
{
throw new SeqIOException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
}
break;
default:
throw new SeqIOException("Unrecognized line code: " + StringUtil.quote(mCurrentLineCode) + "!");
}
/*
GenBankKeyword keyword = getLineKeyword(inLine);
if (keyword != null)
{
// Found the start of a new keyword field
finishPreviousKeyword();
mCurrentLineCode = keyword;
mCurrentSubkeyword = null;
parseField(inLine);
}
else
{
// Continuation of an existing field
if (GenBankKeyword.FEATURES.equals(mCurrentLineCode))
{
// Features have a special set of feature keys
parseFeatures(inLine);
}
else
{
mCurrentSubkeyword = getLineSubkeyword(inLine);
if (mCurrentSubkeyword != null)
{
// Start of a new subfield
}
else
{
// Continuation of an existing subfield
}
parseField(inLine);
}
}
*/
}
//---------------------------------------------------------------------------
// Parse the ID line
// Ex #1: ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
// Ex #2: ID FMT_ANADE Reviewed; 312 AA.
//
// The ID (IDentification) line is always the first line of an entry. The
// format of the ID line is:
// ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
// The tokens represent:
// 1. Primary accession number
// 2. Sequence version number
// 3. Topology: 'circular' or 'linear'
// 4. Molecule type (see note 1 below)
// 5. Data class (see section 3.1)
// 6. Taxonomic division (see section 3.2)
// 7. Sequence length (see note 2 below)
private static final Pattern sEMBL_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+);\\s+SV\\s+(\\w+);\\s+(\\w+);\\s+(.+?);\\s+(\\w+);\\s+(\\w+);\\s+(\\w+) BP.");
private static final Pattern sUniProt_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+)\\s+(?:Reviewed|Unreviewd);\\s+(\\w+) AA.");
private static final Pattern sSeqLengthPattern = Pattern.compile("(\\w+) (AA|BP).");
private void parseID(String inLine)
{
Matcher m = sEMBL_ID_Pattern1.matcher(inLine);
if (m.matches())
{
mFormatVariant = FormatVariant.EMBL;
if (StringUtil.isSet(m.group(1)))
{
mCurrentSeq.setID(m.group(1) + '.' + m.group(2));
}
if (mCurrentSeq instanceof BioSequencePlus)
{
if (m.group(3) != null)
{
((BioSequencePlus) mCurrentSeq).setSeqTopology(SeqTopology.valueOf(m.group(3)));
}
if (m.group(4) != null)
{
((BioSequencePlus) mCurrentSeq).setMolType(MolType.retrieveOrCreateValueOf(m.group(4)));
}
mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, EMBL_DataClass.valueOf(m.group(5)));
((BioSequencePlus) mCurrentSeq).setSeqRepositoryDivision(EMBL_TaxonDivision.valueOf(m.group(6)));
}
mSeqLengthFromIdLine = Integer.parseInt(m.group(7));
}
else
{
m = sUniProt_ID_Pattern1.matcher(inLine);
if (m.matches())
{
mFormatVariant = FormatVariant.UniProt;
}
else
{
// throw new SeqIOException("The " + ID_LINE_CODE + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!");
// Non-standard ID line format. Do the best we can.
String[] pieces = inLine.substring(2).split(";");
int pieceIndex = 1;
String id = pieces[0].trim().split("\\s+")[0].trim();
if (pieces.length > 1
&& pieces[1].trim().startsWith("SV"))
{
id += "." + pieces[1].trim().substring(2).trim();
pieceIndex++;
}
mCurrentSeq.setID(id);
if (mCurrentSeq instanceof BioSequencePlus)
{
BioSequencePlus bioSequencePlus = (BioSequencePlus) mCurrentSeq;
boolean topologyFound = false;
boolean molTypeFound = false;
boolean dataClassFound = false;
boolean divisionFound = false;
for (; pieceIndex < pieces.length; pieceIndex++)
{
String field = pieces[pieceIndex].trim();
if (! topologyFound)
{
SeqTopology seqTopology = SeqTopology.valueOf(field);
if (seqTopology != null)
{
bioSequencePlus.setSeqTopology(seqTopology);
topologyFound = true;
continue;
}
}
if (! molTypeFound)
{
MolType molType = MolType.valueOf(field);
if (molType != null)
{
bioSequencePlus.setMolType(MolType.valueOf(field));
molTypeFound = true;
continue;
}
}
if (! dataClassFound)
{
EMBL_DataClass dataClass = EMBL_DataClass.valueOf(field);
if (dataClass != null)
{
mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, dataClass);
dataClassFound = true;
continue;
}
}
if (! divisionFound)
{
EMBL_TaxonDivision div = EMBL_TaxonDivision.valueOf(field);
if (div != null)
{
bioSequencePlus.setSeqRepositoryDivision(div);
divisionFound = true;
continue;
}
}
if (pieceIndex == pieces.length - 1)
{
Matcher seqLengthMatcher = sSeqLengthPattern.matcher(field);
if (seqLengthMatcher.matches())
{
mSeqLengthFromIdLine = Integer.parseInt(seqLengthMatcher.group(1));
if (seqLengthMatcher.group(2).equalsIgnoreCase("BP"))
{
mFormatVariant = FormatVariant.EMBL;
}
else
{
mFormatVariant = FormatVariant.UniProt;
}
}
}
}
}
}
}
}
//---------------------------------------------------------------------------
private void parseAC(String inLine)
{
String[] pieces = inLine.substring(2).split(";\\s*");
if (null == mCurrentSeq.getID())
{
mCurrentSeq.setID(pieces[0].trim());
}
// TODO: handle additional ids
}
//---------------------------------------------------------------------------
private void parseDE(String inLine)
{
String description = inLine.substring(2).trim();
if (StringUtil.isSet(mCurrentSeq.getDescription()))
{
description = mCurrentSeq.getDescription() + " " + description;
}
mCurrentSeq.setDescription(description);
}
//---------------------------------------------------------------------------
// Parse keywords
private void parseKW(String inLine)
{
String keywordString = inLine.substring(2).trim();
if (mCurrentSeq instanceof BioSequencePlus)
{
mCurrentKeywords += keywordString;
if (mCurrentKeywords.endsWith("."))
{
String[] keywords = mCurrentKeywords.split("[;\\.]");
for (String keyword : keywords)
{
if (StringUtil.isSet(keyword))
{
((BioSequencePlus) mCurrentSeq).addKeyword(keyword.trim());
}
}
}
}
}
//---------------------------------------------------------------------------
// Parse the reference authors
private void parseRA(String inLine)
{
String[] authors = null;
if (mFormatVariant.equals(FormatVariant.EMBL))
{
authors = inLine.substring(2).split(";\\s*");
}
else if (mFormatVariant.equals(FormatVariant.UniProt))
{
authors = inLine.substring(2).split(",\\s*");
}
if (authors != null)
{
for (String author : authors)
{
if (StringUtil.isSet(author))
{
mCurrentCitation.addAuthor(new Author(author.trim()));
}
}
}
}
//---------------------------------------------------------------------------
// Parse the reference title
private void parseRT(String inLine)
{
String title = inLine.substring(2).trim();
if (title.endsWith(";"))
{
title = title.substring(0, title.length() - 1);
}
if (StringUtil.isSet(title))
{
if (mCurrentCitation.getTitle() != null)
{
title = mCurrentCitation.getTitle() + " " + title;
}
if (StringUtil.isQuoted(title))
{
title = StringUtil.unquote(title);
}
mCurrentCitation.setTitle(title);
}
}
//---------------------------------------------------------------------------
private void parseRX(String inLine)
{
String xref = inLine.substring(2).trim();
if (xref.endsWith("."))
{
xref = xref.substring(0, xref.length() - 1);
}
String[] pieces = xref.split(";\\s*");
if (pieces[0].equals("PUBMED"))
{
mCurrentCitation.setPubMedId(pieces[1]);
}
// TODO: handle other x-refs
}
//---------------------------------------------------------------------------
// Parse reference position value
private void parseRP(String inLine)
{
String positionString = inLine.substring(2).trim();
if (StringUtil.isSet(positionString))
{
String[] positionStrings = positionString.split("\\s*-\\s*");
if (2 == positionStrings.length)
{
mCurrentCitation.setSeqLocation(new SeqLocation().setStart(Integer.parseInt(positionStrings[0].trim())).setEnd(Integer.parseInt(positionStrings[1].trim())));
}
}
}
//---------------------------------------------------------------------------
// Parse reference location
private void parseRL(String inLine)
{
String stringValue = inLine.substring(2).trim();
if (StringUtil.isSet(stringValue))
{
mCurrentCitation.appendRawContent(stringValue);
}
}
//---------------------------------------------------------------------------
private void parseOS(String inLine)
{
String text = inLine.substring(2).trim();
if (null == mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR))
{
mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, text);
}
else
{
mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR) + " " + text);
}
}
//---------------------------------------------------------------------------
private void parseOC(String inLine)
{
String text = inLine.substring(2).trim();
if (null == mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR))
{
mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, text);
}
else
{
mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR) + " " + text);
}
}
//---------------------------------------------------------------------------
// Parse organism taxonomy cross-reference
// OX Taxonomy_database_Qualifier=Taxonomic code;
// Example:
// OX NCBI_TaxID=9606;
private void parseOX(String inLine)
{
String[] pieces = inLine.substring(2).trim().split("=");
// Remove trailing ';'
if (pieces[1].endsWith(";"))
{
pieces[1] = pieces[1].substring(0, pieces[1].length() - 1);
}
mCurrentSeq.setAttribute(ORGANISM_NCBI_TAXON_ID_ATTR, Integer.parseInt(pieces[1]));
if (mCurrentSeq instanceof BioSequencePlus)
{
NCBITaxon taxon = NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1]));
if (taxon != null)
{
((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon);
}
}
}
//---------------------------------------------------------------------------
// Parse database cross-reference
// The format of the DR line is:
// DR RESOURCE_ABBREVIATION; RESOURCE_IDENTIFIER; OPTIONAL_INFORMATION_1[; OPTIONAL_INFORMATION_2][; OPTIONAL_INFORMATION_3].
// Example:
// DR EMBL; U29082; AAA68403.1; -; Genomic_DNA.
private void parseDR(String inLine)
{
String[] pieces = inLine.substring(2).trim().split(";\\s*");
// Remove trailing '.' from the last piece
if (pieces[pieces.length - 1].endsWith("."))
{
pieces[pieces.length - 1] = pieces[pieces.length - 1].substring(0, pieces[pieces.length - 1].length() - 1);
}
DbXref xref = new DbXref(pieces[0], pieces[1]);
if (pieces.length > 2)
{
StringBuilderPlus description = new StringBuilderPlus().setDelimiter("; ");
for (int i = 2; i < pieces.length; i++)
{
description.delimitedAppend(pieces[i]);
}
xref.setDescription(description.toString());
}
if (mCurrentSeq instanceof BioSequencePlus)
{
((BioSequencePlus) mCurrentSeq).addDbXref(xref);
}
}
//---------------------------------------------------------------------------
// Parse feature
private void parseFT(String inLine)
{
if (mFormatVariant.equals(FormatVariant.EMBL))
{
pareGenBankFeatureTableLine(inLine);
}
else if (mFormatVariant.equals(FormatVariant.UniProt))
{
pareUniProtFeatureTableLine(inLine);
}
}
private static final Pattern sGenBankFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?");
//---------------------------------------------------------------------------
private void pareGenBankFeatureTableLine(String inLine)
{
// Is there a feature key on this line?
String featureKeyString = inLine.substring(5, 20).trim();
if (StringUtil.isSet(featureKeyString))
{
GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString);
if (null == featureKey)
{
throw new SeqIOException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!");
}
String locationString = inLine.substring(21).trim();
mCurrentGenBankFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString));
if (mCurrentSeq instanceof BioSequencePlus)
{
((BioSequencePlus) mCurrentSeq).addFeature(mCurrentGenBankFeature);
}
// Unquote the previous qualifier if necessary
if (mCurrentGenBankFeatureQualifier != null
&& mCurrentGenBankFeatureQualifier.getValue().startsWith("\""))
{
mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue()));
}
mCurrentGenBankFeatureQualifier = null;
}
else
{
String content = inLine.substring(21).trim();
Matcher m = sGenBankFeatureQualifierPattern.matcher(content);
if (m.matches())
{
// New qualifier
// Unquote the previous qualifier if necessary
if (mCurrentGenBankFeatureQualifier != null
&& mCurrentGenBankFeatureQualifier.getValue().startsWith("\""))
{
mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue()));
}
GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1));
if (null == qualifierName)
{
throw new SeqIOException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
}
mCurrentGenBankFeatureQualifier = new GenBankFeatureQualifier(qualifierName);
mCurrentGenBankFeature.addQualifier(mCurrentGenBankFeatureQualifier);
String value = m.group(2);
if (value != null)
{
mCurrentGenBankFeatureQualifier.appendToValue(value);
}
}
else if (mCurrentGenBankFeatureQualifier != null)
{
// Continuation of a previous qualifier
mCurrentGenBankFeatureQualifier.appendToValue(content);
}
else if (mCurrentGenBankFeature != null)
{
// Continuation of a feature location
mCurrentGenBankFeature.getLocation().append(content);
}
}
}
// Example UniProt FT entry:
// FT CHAIN 1 312 Methionyl-tRNA formyltransferase.
// FT /FTId=PRO_1000077286.
private static final Pattern sUniProt_FT_FirstLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{1,5}(\\w+)\\s{1,10}([\\?\\<]?\\d*)\\s{1,10}([\\?\\>]?\\d*)\\s+(.+)");
private static final Pattern sUniProt_FT_AdditionalLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{20,}(.+)");
//---------------------------------------------------------------------------
private void pareUniProtFeatureTableLine(String inLine)
{
Matcher m = sUniProt_FT_FirstLinePattern.matcher(inLine);
if (m.matches())
{
UniProtFeatureKey featureKey = UniProtFeatureKey.valueOf(m.group(1));
UniProtFeatureLocation location = new UniProtFeatureLocation(m.group(2), m.group(3));
mCurrentUniProtFeature = new UniProtFeature(featureKey, location).setDescription(m.group(4));
if (mCurrentSeq instanceof BioSequencePlus)
{
((BioSequencePlus) mCurrentSeq).addFeature(mCurrentUniProtFeature);
}
}
else
{
m = sUniProt_FT_AdditionalLinePattern.matcher(inLine);
if (m.matches())
{
if (m.group(1).startsWith("/"))
{
String[] pieces = m.group(1).substring(1).split("=");
if (2 != pieces.length)
{
throw new SeqIOException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
}
UniProtFeatureQualifierName qualifierName = UniProtFeatureQualifierName.valueOf(pieces[0]);
if (null == qualifierName)
{
throw new SeqIOException(StringUtil.singleQuote(pieces[0]) + " is not a recognized qualifier name!");
}
mCurrentUniProtFeatureQualifier = new UniProtFeatureQualifier(qualifierName);
mCurrentUniProtFeature.addQualifier(mCurrentUniProtFeatureQualifier);
// Trim trailing period
if (pieces[1].endsWith("."))
{
pieces[1] = pieces[1].substring(0, pieces[1].length() - 1);
}
mCurrentUniProtFeatureQualifier.appendToValue(pieces[1]);
}
else
{
mCurrentUniProtFeature.appendDescription(m.group(1));
}
}
}
}
//###########################################################################
// INNER CLASS
//###########################################################################
class SeqFilterReader extends LettersOnlyReader
{
//---------------------------------------------------------------------------
public SeqFilterReader(Reader inReader)
{
super(inReader);
}
//---------------------------------------------------------------------------
@Override
public int read()
throws IOException
{
int returnChar;
do
{
returnChar = innerRead();
}
while (returnChar >= 0
&& (Character.isWhitespace(returnChar)
|| Character.isDigit(returnChar)
|| returnChar == '/'));
return returnChar;
}
}
}