com.hfg.bio.seq.format.GenBank Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.DbXref;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.Clone;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.bio.seq.SeqTopology;
import com.hfg.bio.seq.format.feature.FeatureQualifier;
import com.hfg.bio.seq.format.feature.SeqFeature;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.format.feature.genbank.*;
import com.hfg.bio.seq.format.feature.qualifier.MolType;
import com.hfg.bio.seq.format.genbank.GenBankKeyword;
import com.hfg.bio.seq.format.genbank.GenBankSubkeyword;
import com.hfg.bio.seq.format.genbank.InvalidGenBankKeywordException;
import com.hfg.bio.seq.format.genbank.InvalidGenBankSubkeywordException;
import com.hfg.bio.taxonomy.NCBIGenBankDivision;
import com.hfg.bio.taxonomy.NCBITaxon;
import com.hfg.citation.Author;
import com.hfg.citation.CitationType;
import com.hfg.citation.PatentData;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.OrderedMap;
import com.hfg.util.io.LettersOnlyReader;
//------------------------------------------------------------------------------
/**
GenBank sequence format.
See ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt
See http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class GenBank extends ReadableSeqFormatBase implements WritableSeqFormat
{
// Variables used during parsing. They hold the state of the record currently being read.
private T mCurrentSeq;
private GenBankKeyword mCurrentKeyword;
private GenBankSubkeyword mCurrentSubkeyword;
private GenBankFeature mCurrentFeature;
private GenBankFeatureQualifier mCurrentFeatureQualifier;
private SeqCitation mCurrentReference;
private Integer mSeqLengthFromLocusLine;
// GenBank dates use English month abbreviations (e.g. 16-AUG-1995), so the format is pinned
// to Locale.ENGLISH instead of depending on the JVM's default locale.
private SimpleDateFormat mDateFormat = new SimpleDateFormat("dd-MMM-yyyy", java.util.Locale.ENGLISH);
// Maximum number of per-line parse problems tolerated for a record (see setMaxExceptionsPerRecord()).
private int mMaxExceptionsPerRecord = 0;
// Earlier, stricter versions of the LOCUS pattern:
// private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)\\s+(?:[\\-\\w]+)?(?:\\s+(\\w+))?\\s+(\\w{3})\\s+(\\S{11})");
// private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?");
// Groups: 1 = locus name, 2 = length, 3 = molecule type, 4 = topology, 5 = division code, 6 = date
private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s+(\\S+)?\\s+(?:\\S+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA|cRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?");
// Groups: 1 = qualifier name, 2 = optional qualifier value
private static final Pattern sFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?");
// Groups: 1 = first base, 2 = last base
private static final Pattern sReferenceLocationPattern = Pattern.compile("\\(bases (\\d+) to (\\d+)\\)");
// Groups: 1 = publication number, 2 = sequence id number, 3 = publication date
private static final Pattern sReferencePatentPattern = Pattern.compile("Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});");
// Names of the sequence attributes populated while parsing
public static final String COMMENT_ATTR = "Comment";
public static final String NCBI_GI_ATTR = "NCBI GI";
public static final String CONTIG_ATTR = "Contig";
private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName());
static
{
   LOGGER.setLevel(Level.WARNING);
   LOGGER.setUseParentHandlers(true);
}
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Creates a GenBank format object.
 @param inSeqFactory the factory handed to the superclass; readRecord() obtains it via
                     getBioSequenceFactory() to create the sequence object for each record
 */
public GenBank(BioSequenceFactory inSeqFactory)
{
   super(inSeqFactory);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Returns the class-level logger so that callers can adjust its level or handlers.
 @return the Logger used by this class
 */
public static Logger getLogger()
{
   return LOGGER;
}
//---------------------------------------------------------------------------
/**
 Tests whether the specified line is the GenBank record terminator.
 @param inLine the line to examine
 @return true if the line, ignoring leading and trailing whitespace, is exactly "//"
 */
public boolean isEndOfRecord(String inLine)
{
   String trimmedLine = inLine.trim();
   return "//".equals(trimmedLine);
}
//---------------------------------------------------------------------------
/**
 Always returns false for the GenBank format.
 NOTE(review): presumably a "Janus" delimiter is one that both ends a record and starts
 the next one - confirm against the contract of the base class.
 @return false
 */
public boolean hasJanusDelimiter()
{
   return false;
}
//---------------------------------------------------------------------------
/**
 Sets the maximum number of Exceptions to tolerate per record. The default is zero,
 in which case the first problem encountered while parsing a line is thrown.
 Tolerated Exceptions are only recorded for sequence objects that implement the
 BioSequencePlus interface; they are added to the sequence object via addParseException()
 and can be retrieved afterwards via its getParseExceptions() method.
 * @param inValue the maximum number of Exceptions to tolerate per record
 * @return this format object to facilitate method chaining.
 */
public GenBank setMaxExceptionsPerRecord(int inValue)
{
   this.mMaxExceptionsPerRecord = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 Reads a single GenBank record from the reader and returns it as a sequence object.
 Lines ahead of the LOCUS line are skipped. Header lines are then parsed one at a time
 until either the ORIGIN keyword or the end-of-record line is reached, after which the
 reader is handed to a GenBankSeqFilterReader to supply the sequence itself.
 @param inReader the reader positioned at (or before) the start of a record
 @return the sequence object created by the sequence factory and populated from the record
 @throws SeqIOException if a line can't be parsed (and isn't tolerated), or if no LOCUS line was found
 */
public T readRecord(BufferedReader inReader)
throws SeqIOException
{
// Clear the state left over from any previous record
initRecordParsing();
boolean locusLineFound = false;
mCurrentSeq = getBioSequenceFactory().createSeqObj();
try
{
int lineCount = 0;
String line;
while ((line = inReader.readLine()) != null)
{
lineCount++;
try
{
if (!locusLineFound)
{
// Skip everything until the LOCUS line that starts the record.
// NOTE(review): getLineKeyword() indexes the first character of the line, so a
// zero-length line here depends on that method tolerating empty input.
try
{
GenBankKeyword keyword = getLineKeyword(line);
if (GenBankKeyword.LOCUS.equals(keyword))
{
locusLineFound = true;
}
else
{
continue;
}
}
catch (InvalidGenBankKeywordException e)
{
// Ignore
continue;
}
}
else if (isEndOfRecord(line))
{
break;
}
// The LOCUS line itself falls through to here and is parsed like any other line
if (locusLineFound
&& StringUtil.isSet(line))
{
parseLine(line);
// Everything after the ORIGIN line is sequence data
if (GenBankKeyword.ORIGIN.equals(mCurrentKeyword))
{
break;
}
}
}
catch(Exception e)
{
SeqIOException seqIOException = new SeqIOException("Problem parsing "
+ (StringUtil.isSet(mCurrentSeq.getID()) ? mCurrentSeq.getID() + " " : "")
+ "record line " + lineCount + " : " + StringUtil.singleQuote(line), e);
// Record the problem on the sequence (and keep going) while under the configured limit;
// otherwise the problem is fatal for the record.
if (mMaxExceptionsPerRecord > 0
&& mCurrentSeq instanceof BioSequencePlus
&& (! ((BioSequencePlus) mCurrentSeq).hadParseExceptions()
|| ((BioSequencePlus) mCurrentSeq).getParseExceptions().size() < mMaxExceptionsPerRecord))
{
((BioSequencePlus) mCurrentSeq).addParseException(seqIOException);
GenBank.getLogger().warning(e.getMessage());
}
else
{
throw seqIOException;
}
}
}
// The rest of the record is assumed to be sequence
// Cleanup the sequence to remove spaces and numbers
// NOTE(review): this also runs when the loop ended on the end-of-record line (no ORIGIN)
// or at end of input - confirm GenBankSeqFilterReader doesn't then consume the next record.
Reader filterReader = new GenBankSeqFilterReader(inReader);
mCurrentSeq.setSequence(filterReader);
filterReader.close();
}
catch (Exception e)
{
// Note that this also re-wraps the SeqIOException thrown for an individual line above
throw new SeqIOException("Problem parsing GenBank record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
}
if (! locusLineFound)
{
throw new SeqIOException("No LOCUS line detected in the GenBank record!");
}
return mCurrentSeq;
}
//---------------------------------------------------------------------------
/**
 Formats each sequence in the collection as a GenBank record.
 @param inSeqs the sequences to write
 @return the concatenated GenBank records, in iteration order
 @throws SeqIOException if a sequence can't be written
 */
public String write(Collection inSeqs)
throws SeqIOException
{
   StringWriter recordBuffer = new StringWriter();

   for (T currentSeq : inSeqs)
   {
      write(currentSeq, recordBuffer);
   }

   String records = recordBuffer.toString();
   return records;
}
//---------------------------------------------------------------------------
/**
 Formats the sequence as a GenBank record.
 @param inSeq the sequence to write
 @return the GenBank record as a String
 @throws SeqIOException if the sequence can't be written
 */
public String write(T inSeq)
throws SeqIOException
{
   StringWriter recordBuffer = new StringWriter();
   write(inSeq, recordBuffer);

   String record = recordBuffer.toString();
   return record;
}
//---------------------------------------------------------------------------
/**
 Writes the sequence as a GenBank record to the stream and flushes it. The stream is not closed.
 @param inSeq the sequence to write
 @param inStream the destination stream
 @throws SeqIOException if the sequence can't be written or the flush fails
 */
public void write(T inSeq, OutputStream inStream)
throws SeqIOException
{
   Writer streamWriter = new OutputStreamWriter(inStream);
   write(inSeq, streamWriter);

   // Push anything still held by the OutputStreamWriter through to the stream
   try
   {
      streamWriter.flush();
   }
   catch (Exception flushException)
   {
      throw new SeqIOException(flushException);
   }
}
//---------------------------------------------------------------------------
/**
 Writes the sequence to the Writer as a GenBank record: the LOCUS, DEFINITION, ACCESSION,
 and VERSION lines, then (for BioSequencePlus objects) the DBLINK, REFERENCE, and FEATURES
 sections, then the ORIGIN sequence block and the end-of-record line.
 The Writer is flushed but not closed.
 @param inSeq the sequence to write
 @param inWriter the destination Writer
 @throws SeqIOException if any problem occurs while writing
 */
public void write(T inSeq, Writer inWriter)
throws SeqIOException
{
   Reader seqReader = null;
   BufferedWriter writer = null;
   try
   {
      try
      {
         // Reuse the caller's buffering if there is any. The check must be made on inWriter;
         // the local variable is still null at this point.
         if (inWriter instanceof BufferedWriter)
         {
            writer = (BufferedWriter) inWriter;
         }
         else
         {
            writer = new BufferedWriter(inWriter, 8196);
         }

         // Write the LOCUS line
         writeLocus(inSeq, writer);
         // Write the DEFINITION line(s)
         writeDefinition(inSeq, writer);
         // Write the ACCESSION line
         writeAccession(inSeq, writer);
         // Write the VERSION line
         writeVersion(inSeq, writer);

         // TODO: SOURCE

         if (inSeq instanceof BioSequencePlus)
         {
            BioSequencePlus seqPlus = (BioSequencePlus) inSeq;
            if (CollectionUtil.hasValues(seqPlus.getDbXrefs()))
            {
               writeDBLinks(seqPlus.getDbXrefs(), writer);
            }
            if (CollectionUtil.hasValues(seqPlus.getReferences()))
            {
               writeReferences(seqPlus.getReferences(), writer);
            }
            // Write features
            if (CollectionUtil.hasValues(seqPlus.getFeatures()))
            {
               writer.write(GenBankKeyword.FEATURES + " Location/Qualifiers\n");
               for (SeqFeature seqFeature : seqPlus.getFeatures())
               {
                  writeFeature(seqFeature, writer);
               }
            }
         }

         // Write the sequence lines: 60 residues per line in groups of 10,
         // each line prefixed with the position of its first residue
         writer.write(GenBankKeyword.ORIGIN + "\n");
         seqReader = inSeq.getSequenceReader();
         final int residuesPerLine = 60;
         char[] buffer = new char[residuesPerLine];
         int residueNum = 1;
         boolean endOfSeq = false;
         while (! endOfSeq)
         {
            // A single read() may return fewer chars than requested without being at the end
            // of the sequence, so keep reading until the line is full or the reader is exhausted.
            int numCharsRead = 0;
            while (numCharsRead < residuesPerLine)
            {
               int count = seqReader.read(buffer, numCharsRead, residuesPerLine - numCharsRead);
               if (-1 == count)
               {
                  endOfSeq = true;
                  break;
               }
               numCharsRead += count;
            }

            if (numCharsRead > 0)
            {
               writer.write(String.format("%9d", residueNum));
               for (int i = 0; i < numCharsRead; i += 10)
               {
                  writer.write(" ");
                  writer.write(buffer, i, Math.min(10, numCharsRead - i));
               }
               writer.write("\n");
               residueNum += numCharsRead;
            }
         }

         // Write end of record line
         writer.write("//\n");
      }
      finally
      {
         if (seqReader != null)
         {
            seqReader.close();
         }
         if (writer != null)
         {
            writer.flush();
         }
      }
   }
   catch (SeqIOException e)
   {
      // Already the right type; don't wrap it a second time
      throw e;
   }
   catch (Exception e)
   {
      throw new SeqIOException(e);
   }
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
// Clears the per-record parsing state so that nothing carries over from a previous record.
private void initRecordParsing()
{
   // The record being built
   mCurrentSeq = null;

   // The position within the record's header
   mCurrentKeyword = null;
   mCurrentSubkeyword = null;

   // The objects under construction
   mCurrentReference = null;
   mCurrentFeature = null;
   mCurrentFeatureQualifier = null;
}
//---------------------------------------------------------------------------
// A keyword starts in column one and has a maximum of 10 characters.
// Returns the keyword at the start of the line, or null if the line is null, empty, or
// doesn't start with a letter (i.e. it is a continuation or subkeyword line).
// Throws InvalidGenBankKeywordException if the leading text isn't a recognized keyword.
private GenBankKeyword getLineKeyword(String inLine)
{
   GenBankKeyword keyword = null;
   // Guard against empty lines (e.g. blank lines ahead of the LOCUS line), which
   // would otherwise fail with a StringIndexOutOfBoundsException on charAt(0).
   if (inLine != null
       && inLine.length() > 0
       && Character.isLetter(inLine.charAt(0)))
   {
      String keywordString = (inLine.length() > 11 ? inLine.substring(0, 11) : inLine).trim();
      if (StringUtil.isSet(keywordString))
      {
         keyword = GenBankKeyword.valueOf(keywordString);
         if (null == keyword)
         {
            throw new InvalidGenBankKeywordException(StringUtil.singleQuote(keywordString) + " is not a recognized GenBank keyword!");
         }
      }
   }
   return keyword;
}
//---------------------------------------------------------------------------
// A subkeyword starts in column three and has a maximum of 8 characters.
// Returns the subkeyword on the line, or null if the line doesn't start with two whitespace
// characters or has no text in the subkeyword columns. Throws InvalidGenBankSubkeywordException
// if the text isn't a recognized subkeyword or isn't allowed under the current keyword.
private GenBankSubkeyword getLineSubkeyword(String inLine)
{
   if (! Character.isWhitespace(inLine.charAt(0))
       || ! Character.isWhitespace(inLine.charAt(1)))
   {
      return null;
   }

   String candidate = inLine;
   if (inLine.length() > 11)
   {
      candidate = inLine.substring(2, 11);
   }
   candidate = candidate.trim();

   if (! StringUtil.isSet(candidate))
   {
      return null;
   }

   GenBankSubkeyword result = GenBankSubkeyword.valueOf(candidate);
   if (null == result)
   {
      throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(candidate) + " is not a recognized GenBank subkeyword!");
   }

   if (! mCurrentKeyword.allowsSubkeyword(result))
   {
      throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(candidate) + " is not a recognized GenBank subkeyword of " + mCurrentKeyword + "!");
   }

   return result;
}
//---------------------------------------------------------------------------
// Called when a new keyword line is encountered, to complete the processing of the field that
// just ended. For DEFINITION the description is tidied up. Otherwise, if a feature qualifier
// is still pending, it is unquoted and cleared, and values from the 'source' feature
// (mol_type, taxon db_xref, clone / sub_clone) are copied onto the sequence object.
private void finishPreviousKeyword()
{
if (GenBankKeyword.DEFINITION.equals(mCurrentKeyword))
{
finishDefinition();
}
else if (mCurrentFeatureQualifier != null)
{
// The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES
if (mCurrentFeatureQualifier.getValue().startsWith("\""))
{
mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue()));
}
mCurrentFeatureQualifier = null;
// Only BioSequencePlus objects can hold features, so only they can have a source feature
if (mCurrentSeq instanceof BioSequencePlus)
{
List sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source);
if (CollectionUtil.hasValues(sourceFeatures))
{
// Only the first source feature is consulted
SeqFeature source = sourceFeatures.get(0);
List molTypeQualifiers = source.getQualifiers(GenBankFeatureQualifierName.mol_type.name());
if (CollectionUtil.hasValues(molTypeQualifiers))
{
MolType molType = MolType.valueOf(molTypeQualifiers.get(0).getValue());
if (molType != null)
{
((BioSequencePlus) mCurrentSeq).setMolType(molType);
}
}
// /db_xref="taxon:9606"
List dbXrefQualifiers = source.getQualifiers(GenBankFeatureQualifierName.db_xref.name());
if (CollectionUtil.hasValues(dbXrefQualifiers))
{
for (FeatureQualifier qualifier : dbXrefQualifiers)
{
// NOTE(review): a value of just "taxon" with no colon would fail on pieces[1]
String[] pieces = qualifier.getValue().split(":");
if (pieces[0].equals("taxon"))
{
((BioSequencePlus) mCurrentSeq).setNCBITaxon(NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1])));
break;
}
}
}
// The sub_clone qualifier is only used if a clone qualifier is present
List cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
if (CollectionUtil.hasValues(cloneQualifiers))
{
Clone clone = new Clone(cloneQualifiers.get(0).getValue());
List subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
if (CollectionUtil.hasValues(subcloneQualifiers))
{
clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
}
((BioSequencePlus) mCurrentSeq).setClone(clone);
}
}
}
}
}
//---------------------------------------------------------------------------
// Routes a non-blank record line: a line starting with a keyword begins a new field, a line
// within FEATURES goes to the feature parser, and anything else is a (sub)field of the current keyword.
private void parseLine(String inLine)
throws Exception
{
   GenBankKeyword lineKeyword = getLineKeyword(inLine);

   if (lineKeyword != null)
   {
      // Start of a new keyword field. Wrap up the previous one first.
      finishPreviousKeyword();
      mCurrentKeyword = lineKeyword;
      mCurrentSubkeyword = null;
      parseField(inLine);
      return;
   }

   if (GenBankKeyword.FEATURES.equals(mCurrentKeyword))
   {
      // The feature table uses feature keys instead of subkeywords
      parseFeatures(inLine);
      return;
   }

   // Either the start of a new subfield or the continuation of the existing (sub)field
   GenBankSubkeyword lineSubkeyword = getLineSubkeyword(inLine);
   if (lineSubkeyword != null)
   {
      mCurrentSubkeyword = lineSubkeyword;
   }

   parseField(inLine);
}
//---------------------------------------------------------------------------
// Hands the line to the parser for the current keyword. Lines belonging to keywords without
// a parser here (e.g. NID, PROJECT, SEGMENT, BASE COUNT) are skipped.
private void parseField(String inLine)
throws Exception
{
   final GenBankKeyword currentKeyword = mCurrentKeyword;

   if (currentKeyword.equals(GenBankKeyword.LOCUS))
   {
      parseLocus(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.DEFINITION))
   {
      parseDefinition(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.VERSION))
   {
      parseVersion(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.KEYWORDS))
   {
      parseKeywords(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.SOURCE))
   {
      parseSource(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.REFERENCE))
   {
      parseReference(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.COMMENT))
   {
      parseComment(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.DBLINK))
   {
      parseDBLink(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.FEATURES))
   {
      parseFeatures(inLine);
      return;
   }

   if (currentKeyword.equals(GenBankKeyword.CONTIG))
   {
      parseContig(inLine);
   }

   // NID is skipped
   // PROJECT is skipped
   // SEGMENT is skipped
   // BASE COUNT is skipped
}
//---------------------------------------------------------------------------
// Parse the LOCUS keyword line
// Ex:
// LOCUS R88064 460 bp mRNA linear EST 16-AUG-1995
// LOCUS pDR000029812 7616 bp circular
// LOCUS vDR\365 8070 bp DNA circular 21-MAR-2011
//
// Although it isn't always followed exactly, the detailed format of the LOCUS line is as follows:
//
// Positions Contents
// --------- --------
// 01-05 'LOCUS'
// 06-12 spaces
// 13-28 Locus name
// 29-29 space
// 30-40 Length of sequence, right-justified
// 41-41 space
// 42-43 bp
// 44-44 space
// 45-47 spaces, ss- (single-stranded), ds- (double-stranded), or
// ms- (mixed-stranded)
// 48-53 NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA),
// mRNA (messenger RNA), uRNA (small nuclear RNA).
// Left justified.
// 54-55 space
// 56-63 'linear' followed by two spaces, or 'circular'
// 64-64 space
// 65-67 The division code
// 68-68 space
// 69-79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
//
// Extracts the locus name and sequence length from the LOCUS line and, for BioSequencePlus
// objects, the topology, division code, and revision date.
// Throws SeqIOException if the line doesn't match the LOCUS pattern.
private void parseLocus(String inLine)
{
   Matcher m = sLocusPattern.matcher(inLine);
   if (! m.matches())
   {
      throw new SeqIOException("The " + GenBankKeyword.LOCUS + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!");
   }

   // Group 1 is the (optional) locus name
   if (StringUtil.isSet(m.group(1)))
   {
      mCurrentSeq.setID(m.group(1));
   }

   // Group 2 is the sequence length
   mSeqLengthFromLocusLine = Integer.parseInt(m.group(2).trim());

   // TODO: Group 3 is the detailed sequence type

   if (mCurrentSeq instanceof BioSequencePlus)
   {
      BioSequencePlus seqPlus = (BioSequencePlus) mCurrentSeq;

      // Group 4 is the topology ('linear' or 'circular')
      if (StringUtil.isSet(m.group(4)))
      {
         seqPlus.setSeqTopology(SeqTopology.valueOf(m.group(4)));
      }

      // Group 5 is the three-letter division code
      if (StringUtil.isSet(m.group(5)))
      {
         seqPlus.setSeqRepositoryDivision(NCBIGenBankDivision.valueOf(m.group(5)));
      }

      // Group 6 is the date
      if (StringUtil.isSet(m.group(6)))
      {
         try
         {
            seqPlus.setRevisionDate(mDateFormat.parse(m.group(6)));
         }
         catch (ParseException e)
         {
            // An unparseable date isn't fatal for the record. Report it via the class
            // logger (with the cause) instead of printing directly to System.err.
            LOGGER.log(Level.WARNING, "Couldn't parse the date " + StringUtil.singleQuote(m.group(6))
                                      + " on the " + GenBankKeyword.LOCUS + " line: " + e.getMessage(), e);
         }
      }
   }
}
//---------------------------------------------------------------------------
// Returns the sequence id with any version suffix (the first '.' and everything after it)
// removed, or an empty string if the sequence has no id.
private String getAccession(T inSeq)
{
   String id = inSeq.getID();
   if (! StringUtil.isSet(id))
   {
      return "";
   }

   int dotIndex = id.indexOf(".");
   return (dotIndex > 0 ? id.substring(0, dotIndex) : id);
}
//---------------------------------------------------------------------------
// Writes the LOCUS line: the keyword, accession, length, and unit ('aa' for proteins, otherwise
// 'bp'), followed for BioSequencePlus objects by the molecule type, topology, division code,
// and upper-cased revision date.
private void writeLocus(T inSeq, Writer inWriter)
throws IOException
{
   String lengthUnit = inSeq.getType().equals(BioSequenceType.PROTEIN) ? "aa" : "bp";
   String basicFields = String.format("%5s %-16.16s %11d %2.2s",
                                      GenBankKeyword.LOCUS,
                                      getAccession(inSeq),
                                      inSeq.length(),
                                      lengthUnit);
   inWriter.write(basicFields);

   if (inSeq instanceof BioSequencePlus)
   {
      BioSequencePlus seqPlus = (BioSequencePlus) inSeq;

      // Collapse the detailed mol type down to the value used on the LOCUS line
      String locusMolType = "";
      MolType molType = seqPlus.getMolType();
      if (molType != null)
      {
         boolean isDNA = molType.equals(MolType.genomic_DNA)
                         || molType.equals(MolType.unassigned_DNA)
                         || molType.equals(MolType.other_DNA);
         if (isDNA)
         {
            locusMolType = "DNA";
         }
         else
         {
            boolean isGeneralRNA = molType.equals(MolType.genomic_RNA)
                                   || molType.equals(MolType.transcribed_RNA)
                                   || molType.equals(MolType.unassigned_RNA)
                                   || molType.equals(MolType.other_RNA);
            if (isGeneralRNA)
            {
               locusMolType = "RNA";
            }
            else if (molType.equals(MolType.mRNA))
            {
               locusMolType = "mRNA";
            }
            else if (molType.equals(MolType.tRNA))
            {
               locusMolType = "tRNA";
            }
         }
      }

      // TODO: 'ss-', 'ds-', or 'ms-' prefix for the mol type isn't parsed or output
      String extendedFields = String.format(" %-6.6s %-8.8s %-3.3s %s",
            locusMolType,
            seqPlus.getSeqTopology() != null ? seqPlus.getSeqTopology() : "",
            seqPlus.getSeqRepositoryDivision() != null ? seqPlus.getSeqRepositoryDivision().getCode() : "",
            seqPlus.getRevisionDate() != null ? mDateFormat.format(seqPlus.getRevisionDate()).toUpperCase() : "");
      inWriter.write(extendedFields);
   }

   inWriter.write("\n");
}
//---------------------------------------------------------------------------
// Parse the DEFINITION keyword line
// Ex:
// DEFINITION ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone
// IMAGE:165908 5', mRNA sequence.
//
// Takes the text from column 13 onward and either starts the sequence description with it or
// appends it (separated by a space) to the description gathered from earlier DEFINITION lines.
private void parseDefinition(String inLine)
{
   String text = inLine.substring(12).trim();

   String existingDescription = mCurrentSeq.getDescription();
   String description = (null == existingDescription ? text : existingDescription + " " + text);

   mCurrentSeq.setDescription(description);
}
//---------------------------------------------------------------------------
// Remove the trailing period
// Strips a single trailing period from the sequence description, if there is one.
private void finishDefinition()
{
   String description = mCurrentSeq.getDescription();
   if (null == description
       || ! description.endsWith("."))
   {
      return;
   }

   int lastCharIndex = description.length() - 1;
   mCurrentSeq.setDescription(description.substring(0, lastCharIndex));
}
//---------------------------------------------------------------------------
// Writes the DEFINITION line(s) if the sequence has a description. A trailing period is added
// if missing, the text is wrapped at 67 characters, and only the first line carries the keyword.
private void writeDefinition(T inSeq, Writer inWriter)
throws IOException
{
   if (! StringUtil.isSet(inSeq.getDescription()))
   {
      return;
   }

   String text = inSeq.getDescription();
   if (! text.endsWith("."))
   {
      text = text + ".";
   }

   boolean firstLine = true;
   for (String wrappedLine : StringUtil.lines(StringUtil.wrap(text, 67)))
   {
      Object label = (firstLine ? GenBankKeyword.DEFINITION : "");
      inWriter.write(String.format("%-10s %s\n", label, wrappedLine));
      firstLine = false;
   }
}
//---------------------------------------------------------------------------
// Writes the ACCESSION line (the id without its version suffix) if the sequence has an id.
private void writeAccession(T inSeq, Writer inWriter)
throws IOException
{
   if (! StringUtil.isSet(inSeq.getID()))
   {
      return;
   }

   String accessionLine = String.format("%-9.9s %s\n",
                                        GenBankKeyword.ACCESSION,
                                        getAccession(inSeq));
   inWriter.write(accessionLine);
}
//---------------------------------------------------------------------------
// Writes the VERSION line (the full sequence id) if the sequence has an id.
private void writeVersion(T inSeq, Writer inWriter)
throws IOException
{
   if (! StringUtil.isSet(inSeq.getID()))
   {
      return;
   }

   String versionLine = String.format("%-9.9s %s\n",
                                      GenBankKeyword.VERSION,
                                      inSeq.getID());
   inWriter.write(versionLine);
}
//---------------------------------------------------------------------------
// Parse the VERSION keyword line
// Ex:
// VERSION AF181452.1 GI:6017929
// ^^^^^^^^^^ ^^^^^^^^^^
// Compound NCBI GI
// Accession Identifier
// Number
//
// Splits the VERSION field on whitespace: the first token becomes the sequence id and the
// second token, if present, is stored as the NCBI GI attribute.
private void parseVersion(String inLine)
{
   String versionField = inLine.substring(12).trim();
   String[] tokens = versionField.split("\\s+");

   mCurrentSeq.setID(tokens[0]);

   if (tokens.length > 1)
   {
      mCurrentSeq.setAttribute(NCBI_GI_ATTR, tokens[1]);
   }
}
//---------------------------------------------------------------------------
// Parse the KEYWORDS keyword line
// Ex:
// KEYWORDS EST.
//
// Drops the trailing period from the KEYWORDS field and, for BioSequencePlus objects,
// adds the comma-separated values as keywords.
private void parseKeywords(String inLine)
{
   String keywordsField = inLine.substring(12).trim();
   if (keywordsField.endsWith("."))
   {
      int lastCharIndex = keywordsField.length() - 1;
      keywordsField = keywordsField.substring(0, lastCharIndex);
   }

   if (! (mCurrentSeq instanceof BioSequencePlus))
   {
      return;
   }

   String[] keywords = keywordsField.split(",\\s+");
   ((BioSequencePlus) mCurrentSeq).addKeywords(keywords);
}
//---------------------------------------------------------------------------
// Parse the SOURCE keyword line
// Ex:
// SOURCE Homo sapiens (human)
// ORGANISM Homo sapiens
// Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
// Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
// Catarrhini; Hominidae; Homo.
//
// The SOURCE field consists of two parts. The first part is found after
// the SOURCE keyword and contains free-format information including an
// abbreviated form of the organism name followed by a molecule type;
// multiple lines are allowed, but the last line must end with a period.
// The second part consists of information found after the ORGANISM
// subkeyword. The formal scientific name for the source organism (genus
// and species, where appropriate) is found on the same line as ORGANISM.
// The records following the ORGANISM line list the taxonomic
// classification levels, separated by semicolons and ending with a
// period.
//
// Only the ORGANISM subfield is used: the scientific name on the ORGANISM line itself is
// looked up by name and set as the taxon of BioSequencePlus objects. The SOURCE line and
// the taxonomic classification lines are not stored.
private void parseSource(String inLine)
{
   if (! GenBankSubkeyword.ORGANISM.equals(mCurrentSubkeyword))
   {
      return;
   }

   String scientificName = inLine.substring(12).trim();

   // For now, just keep the first line with the scientific name
   boolean isOrganismLine = inLine.trim().startsWith(GenBankSubkeyword.ORGANISM.name());
   if (isOrganismLine
       && mCurrentSeq instanceof BioSequencePlus)
   {
      NCBITaxon taxon = NCBITaxon.getByName(scientificName);
      ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon);
   }
}
//---------------------------------------------------------------------------
// Parse the REFERENCE keyword line
// Ex:
// REFERENCE 1 (bases 1 to 342)
// AUTHORS Giachino,C., Padovan,E. and Lanzavecchia,A.
// TITLE kappa+lambda+ dual receptor B cells are present in the human
// peripheral repertoire
// JOURNAL J. Exp. Med. 181 (3), 1245-1250 (1995)
// PUBMED 7869042
//
// Publications by the authors of the sequence that discuss the data reported in
// the record. References are automatically sorted within the record based on date
// of publication, showing the oldest references first.
//
// Some sequences have not been reported in papers and show a status of "unpublished"
// or "in press". When an accession number and/or sequence data has appeared in print,
// sequence authors should send the complete citation of the article to [email protected]
// and the GenBank staff will revise the record.
//
// Various classes of publication can be present in the References field, including
// journal article, book chapter, book, thesis/monograph, proceedings chapter, proceedings
// from a meeting, and patent.
//
// The last citation in the REFERENCE field usually contains information about the
// submitter of the sequence, rather than a literature citation. It is therefore
// called the "submitter block" and shows the words "Direct Submission" instead of
// an article title. Additional information is provided below, under the header Direct
// Submission. Some older records do not contain a submitter block.
// Parses one line of a REFERENCE field. The REFERENCE line itself starts a new citation
// (attached to BioSequencePlus objects); every other line is added to the current citation
// according to the current subkeyword (AUTHORS, TITLE, JOURNAL, PUBMED, or REMARK).
// Throws ParseException if the date of a patent citation can't be parsed.
private void parseReference(String inLine)
throws ParseException
{
if (inLine.startsWith(GenBankKeyword.REFERENCE.name()))
{
// REFERENCE 2 (bases 1 to 200000)
mCurrentReference = new SeqCitation();
if (mCurrentSeq instanceof BioSequencePlus)
{
((BioSequencePlus) mCurrentSeq).addReference(mCurrentReference);
}
// Set the reference seq location if a '(bases x to y)' range is present
Matcher m = sReferenceLocationPattern.matcher(inLine);
if (m.find())
{
mCurrentReference.setSeqLocation(new SeqLocation(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))));
}
}
else
{
String field = inLine.substring(12).trim();
if (GenBankSubkeyword.AUTHORS.equals(mCurrentSubkeyword))
{
// An author list that wraps onto the next line can leave a dangling separator.
// Remove trailing comma if present.
if (field.endsWith(","))
{
field = field.substring(0, field.length() - 1);
}
else if (field.endsWith(" and"))
{
field = field.substring(0, field.length() - 4);
}
// A lone period means that there are no authors on this line
if (! field.equals("."))
{
// Authors are separated by ', ' or ' and '
String[] authorStrings = field.split("(,\\s+|\\s+and\\s+)");
for (String authorString : authorStrings)
{
mCurrentReference.addAuthor(new Author(authorString));
}
}
}
else if (GenBankSubkeyword.TITLE.equals(mCurrentSubkeyword))
{
// Titles can span lines. Join the pieces with a space.
String title = field;
if (mCurrentReference.getTitle() != null)
{
title = mCurrentReference.getTitle() + " " + title;
}
mCurrentReference.setTitle(title);
}
else if (GenBankSubkeyword.JOURNAL.equals(mCurrentSubkeyword))
{
mCurrentReference.appendRawContent(field);
if (null == mCurrentReference.getType()
|| mCurrentReference.getType().equals(CitationType.general)) // General is the initial default type
{
// Check whether this JOURNAL line marks the citation as a patent
Matcher m = sReferencePatentPattern.matcher(field);
if (m.matches())
{
mCurrentReference.setType(CitationType.patent);
PatentData patentData = new PatentData()
.setTitle(mCurrentReference.getTitle())
.setInventors(mCurrentReference.getAuthors())
.setPublicationNum(m.group(1))
.setSeqIdNum(Integer.parseInt(m.group(2)))
.setPublicationDate(mDateFormat.parse(m.group(3)));
mCurrentReference.setPatentData(patentData);
}
}
else if (mCurrentReference.getType().equals(CitationType.patent))
{
// JOURNAL lines following the 'Patent:' line are treated as ';'-separated applicants
String[] applicants = field.split(";");
for (String applicant : applicants)
{
mCurrentReference.getPatentData().addApplicants(applicant.trim());
}
}
}
else if (GenBankSubkeyword.PUBMED.equals(mCurrentSubkeyword))
{
mCurrentReference.setPubMedId(field);
}
else if (GenBankSubkeyword.REMARK.equals(mCurrentSubkeyword))
{
// Remarks can span lines. Join the pieces with a space.
String remark = field;
if (mCurrentReference.getRemark() != null)
{
remark = mCurrentReference.getRemark() + " " + remark;
}
mCurrentReference.setRemark(remark);
}
}
}
//---------------------------------------------------------------------------
// Parse the COMMENT keyword line
// Ex:
// COMMENT Contact: Wilson RK
// Washington University School of Medicine
// 4444 Forest Park Parkway, Box 8501, St. Louis, MO 63108
// Tel: 314 286 1800
// Fax: 314 286 1810
// Email: [email protected]
// Insert Size: 1482
// High quality sequence stops: 353 Source: IMAGE Consortium, LLNL
// This clone is available royalty-free through LLNL ; contact the
// IMAGE Consortium ([email protected]) for further information.
// Insert Length: 1482 Std Error: 0.00
// Seq primer: M13RP1
// High quality sequence stop: 353.
//
// Stores the COMMENT text in the sequence's comment attribute. Text from successive
// COMMENT lines is joined with newlines.
private void parseComment(String inLine)
{
   String commentText = inLine.substring(12).trim();

   if (null == mCurrentSeq.getAttribute(COMMENT_ATTR))
   {
      mCurrentSeq.setAttribute(COMMENT_ATTR, commentText);
      return;
   }

   mCurrentSeq.setAttribute(COMMENT_ATTR, mCurrentSeq.getAttribute(COMMENT_ATTR) + "\n" + commentText);
}
//---------------------------------------------------------------------------
// Parse the DBLINK keyword line
// Ex:
// DBLINK BioProject:PRJNA174162,PRJNA999998,PRJNA999999
// BioSample: SAMN01795900
//
// "This line contains cross-references to other underlying resources that
// support the existence of a GenBank sequence record...
// A DBLINK cross-reference consists of two data fields delimited by a colon.
// The first field provides the cross-reference type ("BioProject"), while the
// second contains the actual cross-reference identifier ("PRJNA177352").
// The second field can consist of multiple comma-separated identifiers,
// if a sequence record has multiple DBLINK cross-references of a given type."
//
// Parses a DBLINK line into DbXref objects on BioSequencePlus sequences (other sequence types
// are ignored). A line of the form 'type:id1,id2' adds one cross-reference per id. A line
// without a colon is a continuation of the previous line's ids and reuses the database of the
// most recently added cross-reference.
// Throws SeqIOException if a continuation line appears before any cross-reference was added.
private void parseDBLink(String inLine)
{
   if (! (mCurrentSeq instanceof BioSequencePlus))
   {
      return;
   }

   BioSequencePlus sequencePlus = (BioSequencePlus) mCurrentSeq;
   String field = inLine.substring(12).trim();

   String db;
   String idList;
   // Split on the first colon only, so that the ids themselves are left intact
   int colonIndex = field.indexOf(':');
   if (colonIndex >= 0)
   {
      db = field.substring(0, colonIndex).trim();
      idList = field.substring(colonIndex + 1);
   }
   else
   {
      // Continuation of previous db identifiers. The whole field is the id list.
      if (! CollectionUtil.hasValues(sequencePlus.getDbXrefs()))
      {
         throw new SeqIOException("The " + GenBankKeyword.DBLINK + " line " + StringUtil.singleQuote(inLine)
                                  + " doesn't specify a cross-reference type!");
      }

      db = sequencePlus.getDbXrefs().get(sequencePlus.getDbXrefs().size() - 1).getDB();
      idList = field;
   }

   for (String value : idList.split(","))
   {
      String id = value.trim();
      // Skip the empty pieces left by a dangling comma or a type with no ids on its line
      if (id.length() > 0)
      {
         sequencePlus.addDbXref(new DbXref(db, id));
      }
   }
}
//---------------------------------------------------------------------------
// Writes the DBLINK section. The cross-references are grouped by database (in order of first
// appearance) into 'db:id1,id2' entries, each wrapped at 67 characters. Only the first line
// of the section carries the DBLINK keyword, and every line is newline-terminated.
private void writeDBLinks(List inDBXrefs, Writer inWriter)
throws IOException
{
   // Gather the ids for each database
   Map<String, StringBuilderPlus> xRefMap = new OrderedMap<>(4);
   for (DbXref xref : inDBXrefs)
   {
      StringBuilderPlus line = xRefMap.get(xref.getDB());
      if (null == line)
      {
         line = new StringBuilderPlus(xref.getDB() + ":" + xref.getId());
         xRefMap.put(xref.getDB(), line);
      }
      else
      {
         line.delimitedAppend(xref.getId());
      }
   }

   int count = 0;
   for (StringBuilderPlus dbEntry : xRefMap.values())
   {
      String[] lines = StringUtil.lines(StringUtil.wrap(dbEntry.toString(), 67));
      for (String line : lines)
      {
         count++;
         // Each line needs its own terminator or it runs into the line that follows
         inWriter.write(String.format("%-12.12s%s\n", (1 == count ? GenBankKeyword.DBLINK : ""), line));
      }
   }
}
//---------------------------------------------------------------------------
// Parse the CONTIG keyword line
// Ex:
// CONTIG join(D86993.1:7160..39752,D87004.2:803..13993)
//
/**
 * Accumulates the CONTIG value on the current sequence. The text after the
 * 12-column keyword area is trimmed and stored under CONTIG_ATTR; on
 * subsequent lines it is appended directly (no separator) to what is
 * already stored.
 *
 * @param inLine the raw CONTIG keyword line or one of its continuation lines
 */
private void parseContig(String inLine)
{
   String fragment = inLine.substring(12).trim();

   if (null == mCurrentSeq.getAttribute(CONTIG_ATTR))
   {
      // First CONTIG line for this record
      mCurrentSeq.setAttribute(CONTIG_ATTR, fragment);
      return;
   }

   // Continuation line: extend the stored value
   mCurrentSeq.setAttribute(CONTIG_ATTR, mCurrentSeq.getAttribute(CONTIG_ATTR) + fragment);
}
//---------------------------------------------------------------------------
/**
 * Parses one line of the feature table.
 * <ul>
 *   <li>The FEATURES header line is ignored.</li>
 *   <li>A line with text in columns 5-20 starts a new feature; the location
 *       is read from column 21 onward.</li>
 *   <li>Otherwise the content from column 21 onward is either a new
 *       qualifier (if it matches sFeatureQualifierPattern), a continuation
 *       of the current qualifier's value, or a continuation of the current
 *       feature's location.</li>
 * </ul>
 * The previous qualifier's value is unquoted whenever a new qualifier or a
 * new feature begins.
 *
 * @param inLine the raw record line
 * @throws SeqIOException if the feature key or qualifier name is not
 *         recognized, or if a qualifier appears before any feature key
 */
private void parseFeatures(String inLine)
{
   if (inLine.startsWith(GenBankKeyword.FEATURES.name()))
   {
      // Section header line; nothing to parse
      return;
   }

   // Is there a feature key on this line?
   String featureKeyString = inLine.substring(5, 20).trim();
   if (StringUtil.isSet(featureKeyString))
   {
      GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString);
      if (null == featureKey)
      {
         throw new SeqIOException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!");
      }

      String locationString = inLine.substring(21).trim();
      mCurrentFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString));
      if (mCurrentSeq instanceof BioSequencePlus)
      {
         ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentFeature);
      }

      // The previous feature's last qualifier is now complete.
      unquotePendingFeatureQualifier();
      mCurrentFeatureQualifier = null;
      return;
   }

   String content = inLine.substring(21).trim();
   Matcher m = sFeatureQualifierPattern.matcher(content);
   if (m.matches())
   {
      // New qualifier; the previous one (if any) is now complete.
      unquotePendingFeatureQualifier();

      GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1));
      if (null == qualifierName)
      {
         throw new SeqIOException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
      }

      if (null == mCurrentFeature)
      {
         // Report malformed input explicitly instead of failing with a NullPointerException.
         throw new SeqIOException("Qualifier " + StringUtil.singleQuote(m.group(1))
                                  + " was found before any feature key!");
      }

      mCurrentFeatureQualifier = new GenBankFeatureQualifier(qualifierName);
      mCurrentFeature.addQualifier(mCurrentFeatureQualifier);

      String value = m.group(2);
      if (value != null)
      {
         mCurrentFeatureQualifier.appendToValue(value);
      }
   }
   else if (mCurrentFeatureQualifier != null)
   {
      // Continuation of a previous qualifier
      mCurrentFeatureQualifier.appendToValue(content);
   }
   else if (mCurrentFeature != null)
   {
      // Continuation of a feature location
      mCurrentFeature.getLocation().append(content);
   }
}

//---------------------------------------------------------------------------
// Strips the surrounding quotes from the value of the qualifier currently
// being assembled, if it has a value that starts with a double quote.
// A qualifier given without "=value" never has a value appended, so the
// value is null-checked before it is inspected.
private void unquotePendingFeatureQualifier()
{
   if (null == mCurrentFeatureQualifier)
   {
      return;
   }

   String pendingValue = mCurrentFeatureQualifier.getValue();
   if (pendingValue != null
       && pendingValue.startsWith("\""))
   {
      mCurrentFeatureQualifier.setValue(StringUtil.unquote(pendingValue));
   }
}
//---------------------------------------------------------------------------
/**
 * Writes one REFERENCE block per citation, numbered from 1 in list order.
 * <p>
 * Each block consists of the REFERENCE line (with "(bases X to Y)" when the
 * citation has a sequence location) followed by AUTHORS, TITLE, JOURNAL,
 * PUBMED and REMARK sub-lines. Every line has a 12-column label area so
 * that content starts at the column the parsers in this class read from
 * (substring(12)). Sub-field text is wrapped at 67 characters; continuation
 * lines have a blank label area. AUTHORS, TITLE, PUBMED and REMARK are
 * omitted when the citation has no value for them.
 *
 * @param inSeqCitations the citations to write
 * @param inWriter       the destination writer
 * @throws IOException if the writer fails
 */
private void writeReferences(List<SeqCitation> inSeqCitations, Writer inWriter)
   throws IOException
{
   int count = 0;
   for (SeqCitation citation : inSeqCitations)
   {
      count++;

      // Ex: REFERENCE   1  (bases 1 to 5028)
      String basesText = "";
      if (citation.getSeqLocation() != null)
      {
         basesText = "(bases " + citation.getSeqLocation().getStart()
                     + " to " + citation.getSeqLocation().getEnd() + ")";
      }
      inWriter.write(String.format("%-12.12s%-3d%s\n",
                                   GenBankKeyword.REFERENCE,
                                   count,
                                   basesText));

      // Authors
      // Ex: Smith,J., Jones,K. and Brown,L.
      if (CollectionUtil.hasValues(citation.getAuthors()))
      {
         int numAuthors = citation.getAuthors().size();
         StringBuilderPlus authors = new StringBuilderPlus().setDelimiter(", ");
         for (int i = 0; i < numAuthors; i++)
         {
            Author author = citation.getAuthors().get(i);
            String authorText = author.getLastName() + "," + author.getFirstInitial() + ".";
            if (numAuthors > 1
                && i == numAuthors - 1)
            {
               // The final author is joined with "and" instead of the delimiter
               authors.append(" and " + authorText);
            }
            else
            {
               authors.delimitedAppend(authorText);
            }
         }

         writeReferenceSubfield("  " + GenBankSubkeyword.AUTHORS.name(), authors.toString(), inWriter);
      }

      // Title
      if (StringUtil.isSet(citation.getTitle()))
      {
         writeReferenceSubfield("  " + GenBankSubkeyword.TITLE.name(), citation.getTitle(), inWriter);
      }

      // Journal
      // Ex: JOURNAL   J. Exp. Med. 188 (11), 2151-2162 (1998)
      // NOTE(review): citation.toString() is used as the journal text whenever
      // it is set, so the structured fallback below only runs when toString()
      // is unset - confirm this is the intended source.
      StringBuilderPlus journal = new StringBuilderPlus();
      if (StringUtil.isSet(citation.toString()))
      {
         journal.append(citation.toString());
      }
      else
      {
         journal.append(citation.getJournal())
                .append(".")
                .append(citation.getVolume() != null ? " " + citation.getVolume() : "")
                .append(citation.getIssue() != null ? " (" + citation.getIssue() + ")" : "")
                .append(citation.getPages() != null ? ", " + citation.getPages() : "")
                .append(citation.getYear() != null ? " (" + citation.getYear() + ")" : "");
      }
      writeReferenceSubfield("  " + GenBankSubkeyword.JOURNAL.name(), journal.toString(), inWriter);

      // Pubmed id (label is indented one column further than the other sub-keywords)
      if (StringUtil.isSet(citation.getPubMedId()))
      {
         inWriter.write(String.format("%-12.12s%s\n",
                                      "   " + GenBankSubkeyword.PUBMED.name(),
                                      citation.getPubMedId()));
      }

      // Remark
      if (StringUtil.isSet(citation.getRemark()))
      {
         writeReferenceSubfield("  " + GenBankSubkeyword.REMARK.name(), citation.getRemark(), inWriter);
      }
   }
}

//---------------------------------------------------------------------------
// Wraps the content at 67 characters and writes it behind a 12-column label
// area. The label (including its leading indent) appears on the first line
// only; continuation lines get a blank label area.
private void writeReferenceSubfield(String inLabel, String inContent, Writer inWriter)
   throws IOException
{
   String[] lines = StringUtil.lines(StringUtil.wrap(inContent, 67));
   for (int i = 0; i < lines.length; i++)
   {
      inWriter.write(String.format("%-12.12s%s\n",
                                   0 == i ? inLabel : "",
                                   lines[i]));
   }
}
//---------------------------------------------------------------------------
/**
 * Writes one feature table entry: the feature key with its location,
 * followed by one "/name" or "/name="value"" entry per qualifier.
 * <p>
 * The layout mirrors what parseFeatures() reads: the key occupies columns
 * 5-20 and the location or qualifier text starts at column 21. Location and
 * qualifier text is wrapped at 58 characters; continuation lines leave the
 * key area blank.
 *
 * @param inSeqFeature the feature to write
 * @param inWriter     the destination writer
 * @throws IOException if the writer fails
 */
private void writeFeature(SeqFeature inSeqFeature, Writer inWriter)
   throws IOException
{
   // Key line(s): 5 blanks + 15-column key + 1 blank = 21 columns before the location
   String[] lines = StringUtil.lines(StringUtil.wrap(inSeqFeature.getLocation().toString(), 58));
   for (int i = 0; i < lines.length; i++)
   {
      inWriter.write(String.format("     %-15.15s %s\n",
                                   0 == i ? inSeqFeature.name() : "",
                                   lines[i]));
   }

   if (CollectionUtil.hasValues(inSeqFeature.getQualifiers()))
   {
      for (FeatureQualifier qualifier : inSeqFeature.getQualifiers())
      {
         String qualifierString = "/" + qualifier.name();
         if (StringUtil.isSet(qualifier.getValue()))
         {
            qualifierString += "=\"" + qualifier.getValue() + "\"";
         }

         // Qualifier lines: 21 blank columns so the text lines up with the location
         lines = StringUtil.lines(StringUtil.wrap(qualifierString, 58));
         for (int i = 0; i < lines.length; i++)
         {
            inWriter.write(String.format("%-21s%s\n", "", lines[i]));
         }
      }
   }
}
//###########################################################################
// INNER CLASS
//###########################################################################
// Reader used for the sequence portion of a record. Its read() passes
// through only those characters from innerRead() that are not whitespace,
// not digits and not '/'.
class GenBankSeqFilterReader extends LettersOnlyReader
{
   //---------------------------------------------------------------------------
   public GenBankSeqFilterReader(Reader inReader)
   {
      super(inReader);
   }

   //---------------------------------------------------------------------------
   /**
    * Returns the next character that is not whitespace, a digit or '/'.
    *
    * @return the next retained character, or the negative value reported by
    *         innerRead() once it stops returning characters
    * @throws IOException if the underlying read fails
    */
   @Override
   public int read()
      throws IOException
   {
      while (true)
      {
         int nextChar = innerRead();
         if (nextChar < 0)
         {
            // Negative value from innerRead(); hand it back unchanged
            return nextChar;
         }

         boolean skipped = Character.isWhitespace(nextChar)
                           || Character.isDigit(nextChar)
                           || '/' == nextChar;
         if (! skipped)
         {
            return nextChar;
         }
      }
   }
}
}