com.hfg.bio.seq.format.GenBank Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.DbXref;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.Clone;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.bio.seq.SeqTopology;
import com.hfg.bio.seq.format.feature.FeatureQualifier;
import com.hfg.bio.seq.format.feature.SeqFeature;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.format.feature.genbank.*;
import com.hfg.bio.seq.format.feature.qualifier.MolType;
import com.hfg.bio.seq.format.genbank.GenBankKeyword;
import com.hfg.bio.seq.format.genbank.GenBankSubkeyword;
import com.hfg.bio.seq.format.genbank.InvalidGenBankKeywordException;
import com.hfg.bio.seq.format.genbank.InvalidGenBankSubkeywordException;
import com.hfg.bio.taxonomy.ncbi.NCBIGenBankDivision;
import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
import com.hfg.citation.Author;
import com.hfg.citation.CitationType;
import com.hfg.citation.Journal;
import com.hfg.citation.PatentData;
import com.hfg.datetime.DateUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.OrderedMap;
import com.hfg.util.io.LettersOnlyReader;
//------------------------------------------------------------------------------
/**
GenBank sequence format.
See ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt
See http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class GenBank extends ReadableSeqFormatBase implements WritableSeqFormat
{
// Variables used during parsing. They hold the state of the record that is
// currently being read (see initRecordParsing() and readRecord()).
private T mCurrentSeq;
private GenBankKeyword mCurrentKeyword;
private GenBankSubkeyword mCurrentSubkeyword;
private GenBankFeature mCurrentFeature;
private GenBankFeatureQualifier mCurrentFeatureQualifier;
private SeqCitation mCurrentReference;
// Sequence length as declared on the LOCUS line (set by parseLocus()).
private Integer mSeqLengthFromLocusLine;
// GenBank dates use English month abbreviations (e.g. 16-AUG-1995), so the
// locale is pinned rather than inherited from the JVM default.
private SimpleDateFormat mDateFormat = new SimpleDateFormat("dd-MMM-yyyy", java.util.Locale.ENGLISH);
// Number of per-line parse exceptions tolerated per record (see setMaxExceptionsPerRecord()).
private int mMaxExceptionsPerRecord = 0;
// Earlier, stricter versions of the LOCUS pattern kept for reference:
// private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)\\s+(?:[\\-\\w]+)?(?:\\s+(\\w+))?\\s+(\\w{3})\\s+(\\S{11})");
// private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?");
// Groups: 1 = locus name, 2 = length, 3 = molecule type, 4 = topology, 5 = division code, 6 = date
private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s+(\\S+)?\\s+(?:\\S+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA|cRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?");
// Groups: 1 = qualifier name, 2 = optional qualifier value
private static final Pattern sFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?");
// Groups: 1 = first base, 2 = last base
private static final Pattern sReferenceLocationPattern = Pattern.compile("\\(bases (\\d+) to (\\d+)\\)");
// Groups: 1 = publication number, 2 = sequence id number, 3 = publication date, 4 = optional applicants
private static final Pattern sReferencePatentPattern = Pattern.compile("Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?");
// Groups: 1 = submission date, 2 = submission year, 3 = contact information
private static final Pattern sReferenceDirectSubmissionPattern = Pattern.compile("Submitted \\((\\d{2}-\\w{3}-(\\d{4}))\\)\\s+(.*)");
private static final Pattern sPatentLocationPattern = Pattern.compile(".+, \\w{2}");
private static final Pattern sPatentParensLocationPattern = Pattern.compile(".+ \\(\\w{2}\\)");
// Examples:
// Thesis
// Thesis (1996) Utrecht University, The Netherlands
// Groups: 1 = optional year, 2 = optional institution
private static final Pattern sReferenceThesisPattern = Pattern.compile("Thesis(?: \\((\\d{4})\\)\\s+(.*))?");
// Examples:
// Proc. Natl. Acad. Sci. U.S.A. 82 (3), 844-848 (1985)
// Front Immunol 9, 1079 (2018)
// Nat Commun (2018) In press
// Dev. Comp. Immunol. 25 (5-6), 387-401
// J. Exp. Zool. 295B (1), 45-58 (2003)
// PLoS ONE 8 (8), E70650 (2013)
// Mol. Phylogenet. Evol. 94 (Pt B), 577-590 (2016)
// Groups: 1 = journal name, 2 = volume, 3 = issue, 4 = pages or article number, 5 = year
private static final Pattern sReferenceJournalPattern = Pattern.compile("(.+?)(?:\\s+(\\S+)(?:\\s+\\(([^\\)]+)\\))?,\\s+(\\d+(?:\\-\\d+)?|E\\d+))?(?:\\s+\\((\\d{4})\\))?(?: In press)?", Pattern.CASE_INSENSITIVE);
// Shared formatter; only used through DateUtil.threadsafeParse(). English locale for the same reason as mDateFormat.
private static final SimpleDateFormat sDateFormat = new SimpleDateFormat("dd-MMM-yyyy", java.util.Locale.ENGLISH);
// Names of the sequence attributes populated from the record
public static final String COMMENT_ATTR = "Comment";
public static final String NCBI_GI_ATTR = "NCBI GI";
public static final String CONTIG_ATTR = "Contig";
private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName());
static
{
   LOGGER.setLevel(Level.WARNING);
   LOGGER.setUseParentHandlers(true);
}
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Creates a GenBank format object.
 * @param inSeqFactory the sequence factory passed to the superclass constructor
 *                     (presumably the one later returned by getBioSequenceFactory()
 *                     and used by readRecord() to create sequence objects - confirm
 *                     against ReadableSeqFormatBase)
 */
public GenBank(BioSequenceFactory inSeqFactory)
{
super(inSeqFactory);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Returns the logger shared by all instances of this class so that callers can
 adjust its level or handlers.
 * @return the class-wide java.util.logging Logger
 */
public static Logger getLogger()
{
return LOGGER;
}
//---------------------------------------------------------------------------
/**
 Tests whether the specified line is the GenBank end-of-record marker, i.e. a
 line that starts with two slashes and contains nothing else but surrounding
 whitespace.
 * @param inLine the line to test (must not be null)
 * @return true if the line terminates a record
 */
public boolean isEndOfRecord(String inLine)
{
   // Cheap prefix test first so that trim() is only paid for on candidate lines.
   if (! inLine.startsWith("//"))
   {
      return false;
   }

   return inLine.trim().length() == 2;
}
//---------------------------------------------------------------------------
/**
 Always returns false for the GenBank format.
 NOTE(review): presumably this tells the reading framework whether the record
 delimiter marks both the end of one record and the start of the next - confirm
 against ReadableSeqFormatBase.
 * @return false
 */
public boolean hasJanusDelimiter()
{
return false;
}
//---------------------------------------------------------------------------
/**
 Specifies the maximum number of Exceptions to tolerate per record. Defaults to zero
 (the first Exception aborts parsing of the record).
 <p>
 This mechanism only works with sequence objects that implement the BioSequencePlus
 interface. While the number of Exceptions recorded for a record is below the specified
 maximum, each new Exception is logged as a warning and attached to the sequence instead
 of being thrown; the recorded Exceptions can be retrieved via the getParseExceptions()
 method on the BioSequencePlus sequence object. Once the maximum has been reached, the
 next Exception is thrown.
 </p>
 * @param inValue the maximum number of Exceptions to tolerate per record
 * @return this format object to facilitate method chaining.
 */
public GenBank setMaxExceptionsPerRecord(int inValue)
{
mMaxExceptionsPerRecord = inValue;
return this;
}
//---------------------------------------------------------------------------
/**
 Reads a single GenBank record from the specified reader.
 <p>
 Lines preceding the LOCUS line are skipped. Every non-blank line from the LOCUS line
 onward is handed to parseLine() until either the end-of-record marker or the ORIGIN
 keyword is reached. After an ORIGIN line the reader itself is passed to the sequence
 object via setSequence(). The reader is closed before returning.
 </p>
 * @param inReader the reader positioned at the start of a record
 * @return the populated sequence object created by the sequence factory
 * @throws SeqIOException if a line cannot be parsed (and the failure is not tolerated via
 *         setMaxExceptionsPerRecord()) or if no LOCUS line is found
 */
public T readRecord(BufferedReader inReader)
throws SeqIOException
{
// Reset the per-record parsing state
initRecordParsing();
int lineCount = 0;
// Give up if no LOCUS line shows up within this many lines
int maxPreLocusLines = 50;
boolean locusLineFound = false;
boolean originLineFound = false;
mCurrentSeq = getBioSequenceFactory().createSeqObj();
try
{
String line;
while ((line = inReader.readLine()) != null)
{
lineCount++;
try
{
if (!locusLineFound)
{
if (lineCount > maxPreLocusLines)
{
throw new SeqFormatException("No GenBank " + GenBankKeyword.LOCUS
+ " line found within " + maxPreLocusLines
+ " lines of the start!");
}
try
{
// NOTE(review): every pre-LOCUS line, including an empty one, is passed to
// getLineKeyword(), which reads the line's first character.
GenBankKeyword keyword = getLineKeyword(line);
if (GenBankKeyword.LOCUS.equals(keyword))
{
locusLineFound = true;
}
else
{
// Some other keyword (or none) before the LOCUS line. Skip it.
continue;
}
}
catch (InvalidGenBankKeywordException e)
{
// Ignore
continue;
}
}
else if (isEndOfRecord(line))
{
break;
}
if (locusLineFound
&& StringUtil.isSet(line))
{
parseLine(line);
if (GenBankKeyword.ORIGIN.equals(mCurrentKeyword))
{
// Everything after the ORIGIN line is sequence data (handled below)
originLineFound = true;
break;
}
}
}
catch(Exception e)
{
SeqIOException seqIOException = new SeqIOException("Problem parsing "
+ (StringUtil.isSet(mCurrentSeq.getID()) ? mCurrentSeq.getID() + " " : "")
+ "record line " + lineCount + " : " + StringUtil.singleQuote(line), e);
// Tolerate the problem only if tolerance was requested, the sequence object can
// record exceptions, and the per-record limit has not been reached yet.
if (mMaxExceptionsPerRecord > 0
&& mCurrentSeq instanceof BioSequencePlus
&& (! ((BioSequencePlus) mCurrentSeq).hadParseExceptions()
|| ((BioSequencePlus) mCurrentSeq).getParseExceptions().size() < mMaxExceptionsPerRecord))
{
((BioSequencePlus) mCurrentSeq).addParseException(seqIOException);
GenBank.getLogger().warning(e.getMessage());
}
else
{
// NOTE(review): this is thrown inside the outer try, so it appears to be
// caught by the outer catch (Exception) below and wrapped a second time.
throw seqIOException;
}
}
}
if (! locusLineFound)
{
throw new SeqFormatException("No GenBank LOCUS line found!");
}
if (originLineFound)
{
// The rest of the record is assumed to be sequence
// Cleanup the sequence to remove spaces and numbers
// Reader filterReader = new GenBankSeqFilterReader(inReader);
// mCurrentSeq.setSequence(filterReader);
// filterReader.close();
mCurrentSeq.setSequence(inReader);
}
// NOTE(review): if the record ends without an ORIGIN line, finishPreviousKeyword()
// is not invoked here for the last field that was being parsed - confirm intended.
inReader.close();
}
catch (Exception e)
{
throw new SeqIOException("Problem parsing GenBank record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
}
// NOTE(review): looks unreachable - the same condition already throws inside the try block above.
if (! locusLineFound)
{
throw new SeqFormatException("No LOCUS line detected in the GenBank record!");
}
return mCurrentSeq;
}
//---------------------------------------------------------------------------
/**
 Formats all of the specified sequences as GenBank records and returns the
 concatenated result.
 * @param inSeqs the sequences to write, in iteration order
 * @return the GenBank-formatted records as a single String
 * @throws SeqIOException if any sequence cannot be written
 */
public String write(Collection inSeqs)
   throws SeqIOException
{
   StringWriter buffer = new StringWriter();

   for (T currentSeq : inSeqs)
   {
      write(currentSeq, buffer);
   }

   return buffer.toString();
}
//---------------------------------------------------------------------------
/**
 Formats the specified sequence as a GenBank record.
 * @param inSeq the sequence to write
 * @return the GenBank-formatted record
 * @throws SeqIOException if the sequence cannot be written
 */
public String write(T inSeq)
   throws SeqIOException
{
   StringWriter buffer = new StringWriter();

   write(inSeq, buffer);

   return buffer.toString();
}
//---------------------------------------------------------------------------
/**
 Writes the specified sequence as a GenBank record to the specified stream.
 The stream is flushed but left open.
 * @param inSeq    the sequence to write
 * @param inStream the destination stream
 * @throws SeqIOException if the sequence cannot be written or the flush fails
 */
public void write(T inSeq, OutputStream inStream)
   throws SeqIOException
{
   // Bridge the byte stream to the Writer-based implementation
   Writer streamWriter = new OutputStreamWriter(inStream);

   write(inSeq, streamWriter);

   try
   {
      // Push anything still held by the bridge out to the caller's stream
      streamWriter.flush();
   }
   catch (Exception flushProblem)
   {
      throw new SeqIOException(flushProblem);
   }
}
//---------------------------------------------------------------------------
/**
 Writes the specified sequence as a GenBank record to the specified Writer.
 <p>
 The LOCUS, DEFINITION, ACCESSION, and VERSION lines are written first. For
 BioSequencePlus sequences the database cross-references, references, and features
 follow. The record ends with the ORIGIN block (60 residues per line in groups of 10,
 each line prefixed with the 1-based position of its first residue) and the
 end-of-record marker. The Writer is flushed but not closed.
 </p>
 * @param inSeq    the sequence to write
 * @param inWriter the destination Writer
 * @throws SeqIOException if the record cannot be written
 */
public void write(T inSeq, Writer inWriter)
   throws SeqIOException
{
   Reader seqReader = null;
   BufferedWriter writer = null;
   try
   {
      try
      {
         // Reuse the caller's writer when it is already buffered. (The test has to be
         // made on inWriter; the local is still null at this point.)
         if (inWriter instanceof BufferedWriter)
         {
            writer = (BufferedWriter) inWriter;
         }
         else
         {
            writer = new BufferedWriter(inWriter, 8196);
         }

         // Write the LOCUS line
         writeLocus(inSeq, writer);

         // Write the DEFINITION line(s)
         writeDefinition(inSeq, writer);

         // Write the ACCESSION line
         writeAccession(inSeq, writer);

         // Write the VERSION line
         writeVersion(inSeq, writer);

         // TODO: SOURCE

         if (inSeq instanceof BioSequencePlus)
         {
            BioSequencePlus seqPlus = (BioSequencePlus) inSeq;

            if (CollectionUtil.hasValues(seqPlus.getDbXrefs()))
            {
               writeDBLinks(seqPlus.getDbXrefs(), writer);
            }

            if (CollectionUtil.hasValues(seqPlus.getReferences()))
            {
               writeReferences(seqPlus.getReferences(), writer);
            }

            // Write features
            if (CollectionUtil.hasValues(seqPlus.getFeatures()))
            {
               // In the GenBank feature table header the "Location/Qualifiers"
               // heading starts in column 22.
               writer.write(String.format("%-21s%s\n", GenBankKeyword.FEATURES, "Location/Qualifiers"));

               for (SeqFeature seqFeature : seqPlus.getFeatures())
               {
                  writeFeature(seqFeature, writer);
               }
            }
         }

         // Write the sequence lines
         writer.write(GenBankKeyword.ORIGIN + "\n");

         seqReader = inSeq.getSequenceReader();

         final int residuesPerLine = 60;
         final int residuesPerGroup = 10;
         char[] buffer = new char[residuesPerLine];
         int residueNum = 1;

         while (true)
         {
            // A Reader may return fewer characters than requested, so keep reading
            // until the line buffer is full or the end of the sequence is reached.
            int numCharsRead = 0;
            while (numCharsRead < residuesPerLine)
            {
               int count = seqReader.read(buffer, numCharsRead, residuesPerLine - numCharsRead);
               if (-1 == count)
               {
                  break;
               }

               numCharsRead += count;
            }

            if (0 == numCharsRead)
            {
               // Nothing left to write
               break;
            }

            writer.write(String.format("%9d", residueNum));

            for (int i = 0; i < numCharsRead; i += residuesPerGroup)
            {
               writer.write(" ");
               writer.write(buffer, i, Math.min(residuesPerGroup, numCharsRead - i));
            }

            writer.write("\n");

            residueNum += numCharsRead;
         }

         // Write end of record line
         writer.write("//\n");
      }
      finally
      {
         if (seqReader != null)
         {
            seqReader.close();
         }

         if (writer != null)
         {
            writer.flush();
         }
      }
   }
   catch (SeqIOException e)
   {
      // Already the right type. Don't wrap it again.
      throw e;
   }
   catch (Exception e)
   {
      throw new SeqIOException(e);
   }
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Clears all per-record parsing state so that nothing from a previously parsed
 record can leak into the next one.
 */
private void initRecordParsing()
{
   mCurrentSeq = null;
   mCurrentKeyword = null;
   mCurrentSubkeyword = null;
   mCurrentFeature = null;
   mCurrentFeatureQualifier = null;
   mCurrentReference = null;
   // Also forget the previous record's declared length. Otherwise it would survive
   // into a record whose LOCUS line failed to parse but was tolerated.
   mSeqLengthFromLocusLine = null;
}
//---------------------------------------------------------------------------
/**
 Extracts the GenBank keyword from the start of the specified line.
 A keyword starts in column one and has a maximum of 10 characters.
 * @param inLine a line from a GenBank record
 * @return the keyword, or null if the line is null, empty, or does not start with a letter
 * @throws InvalidGenBankKeywordException if the line starts with text that isn't a recognized keyword
 */
private GenBankKeyword getLineKeyword(String inLine)
{
   // Blank lines can precede the LOCUS line. They carry no keyword and must not be
   // indexed into.
   if (null == inLine
       || inLine.isEmpty()
       || ! Character.isLetter(inLine.charAt(0)))
   {
      return null;
   }

   GenBankKeyword keyword = null;

   String keywordString = (inLine.length() > 11 ? inLine.substring(0, 11) : inLine).trim();
   if (StringUtil.isSet(keywordString))
   {
      keyword = GenBankKeyword.valueOf(keywordString);
      if (null == keyword)
      {
         throw new InvalidGenBankKeywordException(StringUtil.singleQuote(keywordString) + " is not a recognized GenBank keyword!");
      }
   }

   return keyword;
}
//---------------------------------------------------------------------------
/**
 Extracts the GenBank subkeyword from the specified line.
 A subkeyword starts in column three and has a maximum of 8 characters.
 * @param inLine a line from a GenBank record that did not start with a keyword
 * @return the subkeyword, or null if the line is too short, isn't indented by at least
 *         two whitespace characters, or has no text in the subkeyword columns
 * @throws InvalidGenBankSubkeywordException if the text isn't a recognized subkeyword or
 *         isn't allowed under the current keyword
 */
private GenBankSubkeyword getLineSubkeyword(String inLine)
{
   // A line shorter than two characters can't hold the two-column indent.
   if (null == inLine
       || inLine.length() < 2
       || ! Character.isWhitespace(inLine.charAt(0))
       || ! Character.isWhitespace(inLine.charAt(1)))
   {
      return null;
   }

   GenBankSubkeyword subkeyword = null;

   String subkeywordString = (inLine.length() > 11 ? inLine.substring(2, 11) : inLine).trim();
   if (StringUtil.isSet(subkeywordString))
   {
      subkeyword = GenBankSubkeyword.valueOf(subkeywordString);
      if (null == subkeyword)
      {
         throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword!");
      }
      else if (! mCurrentKeyword.allowsSubkeyword(subkeyword))
      {
         throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword of " + mCurrentKeyword + "!");
      }
   }

   return subkeyword;
}
//---------------------------------------------------------------------------
/**
 Performs the post-processing for the field that was being parsed before a new
 keyword was encountered.
 <ul>
 <li>DEFINITION: delegates to finishDefinition().</li>
 <li>REFERENCE: delegates to finishReference().</li>
 <li>Otherwise, if a feature qualifier is still pending: unquotes its value, clears it,
 and (for BioSequencePlus sequences) copies the mol_type, taxon db_xref, and
 clone / sub_clone qualifiers of the first source feature onto the sequence.</li>
 </ul>
 * @throws ParseException if finishing a reference fails
 */
private void finishPreviousKeyword()
   throws ParseException
{
   if (GenBankKeyword.DEFINITION.equals(mCurrentKeyword))
   {
      finishDefinition();
      return;
   }

   if (GenBankKeyword.REFERENCE.equals(mCurrentKeyword))
   {
      finishReference();
      return;
   }

   if (null == mCurrentFeatureQualifier)
   {
      return;
   }

   // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES.
   // A qualifier without a value has nothing to unquote.
   String pendingValue = mCurrentFeatureQualifier.getValue();
   if (pendingValue != null
       && pendingValue.startsWith("\""))
   {
      mCurrentFeatureQualifier.setValue(StringUtil.unquote(pendingValue));
   }

   mCurrentFeatureQualifier = null;

   if (! (mCurrentSeq instanceof BioSequencePlus))
   {
      return;
   }

   List sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source);
   if (! CollectionUtil.hasValues(sourceFeatures))
   {
      return;
   }

   SeqFeature source = sourceFeatures.get(0);

   // /mol_type="genomic DNA"
   List molTypeQualifiers = source.getQualifiers(GenBankFeatureQualifierName.mol_type.name());
   if (CollectionUtil.hasValues(molTypeQualifiers))
   {
      MolType molType = MolType.valueOf(molTypeQualifiers.get(0).getValue());
      if (molType != null)
      {
         ((BioSequencePlus) mCurrentSeq).setMolType(molType);
      }
   }

   // /db_xref="taxon:9606"
   List dbXrefQualifiers = source.getQualifiers(GenBankFeatureQualifierName.db_xref.name());
   if (CollectionUtil.hasValues(dbXrefQualifiers))
   {
      for (FeatureQualifier qualifier : dbXrefQualifiers)
      {
         String xrefValue = qualifier.getValue();
         if (null == xrefValue)
         {
            continue;
         }

         // Skip cross-references that lack the "<database>:<id>" shape instead of
         // indexing past the end of the array.
         String[] pieces = xrefValue.split(":");
         if (pieces.length > 1
             && pieces[0].equals("taxon"))
         {
            ((BioSequencePlus) mCurrentSeq).setNCBITaxon(NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1])));
            break;
         }
      }
   }

   // /clone="..." and /sub_clone="..."
   List cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
   if (CollectionUtil.hasValues(cloneQualifiers))
   {
      Clone clone = new Clone(cloneQualifiers.get(0).getValue());

      List subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
      if (CollectionUtil.hasValues(subcloneQualifiers))
      {
         clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
      }

      ((BioSequencePlus) mCurrentSeq).setClone(clone);
   }
}
//---------------------------------------------------------------------------
/**
 Dispatches one non-blank record line.
 A line that starts with a keyword closes out the previous field and opens a new one.
 Any other line continues the current field: lines inside the FEATURES table go to
 parseFeatures(); all others may switch the current subkeyword before being passed
 to parseField().
 * @param inLine the record line to parse
 * @throws Exception if the line cannot be parsed
 */
private void parseLine(String inLine)
   throws Exception
{
   GenBankKeyword lineKeyword = getLineKeyword(inLine);
   if (lineKeyword != null)
   {
      // Start of a new keyword field: wrap up the old one first
      finishPreviousKeyword();

      mCurrentKeyword = lineKeyword;
      mCurrentSubkeyword = null;

      parseField(inLine);
      return;
   }

   // From here on the line continues an existing field

   if (GenBankKeyword.FEATURES.equals(mCurrentKeyword))
   {
      // The feature table uses feature keys rather than subkeywords
      parseFeatures(inLine);
      return;
   }

   GenBankSubkeyword lineSubkeyword = getLineSubkeyword(inLine);
   if (lineSubkeyword != null)
   {
      // Start of a new subfield (otherwise the current subfield continues)
      mCurrentSubkeyword = lineSubkeyword;
   }

   parseField(inLine);
}
//---------------------------------------------------------------------------
/**
 Routes the line to the parser for the current keyword. Lines belonging to keywords
 without a parser (e.g. NID, PROJECT, SEGMENT, BASE COUNT) are silently skipped.
 * @param inLine the record line to parse
 * @throws Exception if the keyword-specific parser fails
 */
private void parseField(String inLine)
   throws Exception
{
   final GenBankKeyword activeKeyword = mCurrentKeyword;

   if (activeKeyword.equals(GenBankKeyword.LOCUS))
   {
      parseLocus(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.DEFINITION))
   {
      parseDefinition(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.VERSION))
   {
      parseVersion(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.KEYWORDS))
   {
      parseKeywords(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.SOURCE))
   {
      parseSource(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.REFERENCE))
   {
      parseReference(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.COMMENT))
   {
      parseComment(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.DBLINK))
   {
      parseDBLink(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.FEATURES))
   {
      parseFeatures(inLine);
      return;
   }

   if (activeKeyword.equals(GenBankKeyword.CONTIG))
   {
      parseContig(inLine);
   }

   // Everything else is intentionally ignored:
   //   NID, PROJECT, SEGMENT, BASE COUNT
}
//---------------------------------------------------------------------------
// Parse the LOCUS keyword line
// Ex:
// LOCUS R88064 460 bp mRNA linear EST 16-AUG-1995
// LOCUS pDR000029812 7616 bp circular
// LOCUS vDR\365 8070 bp DNA circular 21-MAR-2011
// Although it isn't always followed exactly, the detailed LOCUS line format is as follows:
//
// Positions Contents
// --------- --------
// 01-05 'LOCUS'
// 06-12 spaces
// 13-28 Locus name
// 29-29 space
// 30-40 Length of sequence, right-justified
// 41-41 space
// 42-43 bp
// 44-44 space
// 45-47 spaces, ss- (single-stranded), ds- (double-stranded), or
// ms- (mixed-stranded)
// 48-53 NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA),
// mRNA (messenger RNA), uRNA (small nuclear RNA).
// Left justified.
// 54-55 space
// 56-63 'linear' followed by two spaces, or 'circular'
// 64-64 space
// 65-67 The division code
// 68-68 space
// 69-79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
//
/**
 Parses the LOCUS line: sets the sequence id (if a locus name is present), remembers
 the declared sequence length, and - for BioSequencePlus sequences - sets the
 topology, repository division, and revision date when they are present.
 * @param inLine the LOCUS line
 * @throws SeqFormatException if the line doesn't match the LOCUS pattern
 */
private void parseLocus(String inLine)
{
   Matcher m = sLocusPattern.matcher(inLine);
   if (! m.matches())
   {
      throw new SeqFormatException("The " + GenBankKeyword.LOCUS + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!");
   }

   String locusName = m.group(1);
   if (StringUtil.isSet(locusName))
   {
      mCurrentSeq.setID(locusName);
   }

   mSeqLengthFromLocusLine = Integer.parseInt(m.group(2).trim());

   // TODO: Group 3 is the detailed sequence type

   if (! (mCurrentSeq instanceof BioSequencePlus))
   {
      return;
   }

   BioSequencePlus seqPlus = (BioSequencePlus) mCurrentSeq;

   String topology = m.group(4);
   if (StringUtil.isSet(topology))
   {
      seqPlus.setSeqTopology(SeqTopology.valueOf(topology));
   }

   String divisionCode = m.group(5);
   if (StringUtil.isSet(divisionCode))
   {
      seqPlus.setSeqRepositoryDivision(NCBIGenBankDivision.valueOf(divisionCode));
   }

   String dateString = m.group(6);
   if (StringUtil.isSet(dateString))
   {
      try
      {
         seqPlus.setRevisionDate(mDateFormat.parse(dateString));
      }
      catch (ParseException e)
      {
         // An unreadable date isn't worth failing the record over, but it should be
         // reported through the class logger rather than printed to stderr.
         LOGGER.warning(e.getMessage());
      }
   }
}
//---------------------------------------------------------------------------
/**
 Derives the accession from the sequence id by dropping a version suffix.
 For example, an id of AF181452.1 yields AF181452.
 * @param inSeq the sequence whose id should be examined
 * @return the id up to (but not including) the first period that isn't the first
 *         character; the unchanged id if there is no such period; or an empty String
 *         if the id isn't set
 */
private String getAccession(T inSeq)
{
   String id = inSeq.getID();
   if (! StringUtil.isSet(id))
   {
      return "";
   }

   int dotIndex = id.indexOf(".");

   return (dotIndex > 0 ? id.substring(0, dotIndex) : id);
}
//---------------------------------------------------------------------------
/**
 Writes the LOCUS line using the fixed-column GenBank layout:
 <pre>
   01-12  'LOCUS' and padding
   13-28  locus name
   30-40  sequence length, right-justified
   42-43  'bp' or 'aa'
   48-53  molecule type
   56-63  topology
   65-67  division code
   69-79  date (dd-MMM-yyyy)
 </pre>
 The molecule type, topology, division, and date are only written for
 BioSequencePlus sequences.
 * @param inSeq    the sequence being written
 * @param inWriter the destination Writer
 * @throws IOException if the write fails
 */
private void writeLocus(T inSeq, Writer inWriter)
   throws IOException
{
   inWriter.write(String.format("%-12s%-16.16s %11d %2.2s",
                                GenBankKeyword.LOCUS,
                                getAccession(inSeq),
                                inSeq.length(),
                                inSeq.getType().equals(BioSequenceType.PROTEIN) ? "aa" : "bp"));

   if (inSeq instanceof BioSequencePlus)
   {
      BioSequencePlus bioSequencePlus = (BioSequencePlus) inSeq;

      // Map the detailed mol type onto the coarser LOCUS line vocabulary.
      // Types without a mapping leave the column blank.
      String molTypeString = "";
      MolType molType = bioSequencePlus.getMolType();
      if (molType != null)
      {
         if (molType.equals(MolType.genomic_DNA)
             || molType.equals(MolType.unassigned_DNA)
             || molType.equals(MolType.other_DNA))
         {
            molTypeString = "DNA";
         }
         else if (molType.equals(MolType.genomic_RNA)
                  || molType.equals(MolType.transcribed_RNA)
                  || molType.equals(MolType.unassigned_RNA)
                  || molType.equals(MolType.other_RNA))
         {
            molTypeString = "RNA";
         }
         else if (molType.equals(MolType.mRNA))
         {
            molTypeString = "mRNA";
         }
         else if (molType.equals(MolType.tRNA))
         {
            molTypeString = "tRNA";
         }
      }

      // TODO: 'ss-', 'ds-', or 'ms-' prefix for the mol type isn't parsed or output
      // Column 44 is a separator and columns 45-47 are left blank for the strand prefix.
      inWriter.write(String.format("    %-6.6s  %-8.8s %-3.3s %s",
                                   molTypeString,
                                   bioSequencePlus.getSeqTopology() != null ? bioSequencePlus.getSeqTopology() : "",
                                   bioSequencePlus.getSeqRepositoryDivision() != null ? bioSequencePlus.getSeqRepositoryDivision().getCode() : "",
                                   bioSequencePlus.getRevisionDate() != null ? mDateFormat.format(bioSequencePlus.getRevisionDate()).toUpperCase() : ""
                                   ));
   }

   inWriter.write("\n");
}
//---------------------------------------------------------------------------
// Parse the DEFINITION keyword line
// Ex:
// DEFINITION ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone
// IMAGE:165908 5', mRNA sequence.
//
/**
 Parses one DEFINITION line. The text after the 12-column keyword area becomes the
 sequence description; the text of continuation lines is appended, separated by a
 single space.
 * @param inLine a DEFINITION line or one of its continuation lines
 */
private void parseDefinition(String inLine)
{
   String lineText = inLine.substring(12).trim();

   boolean firstLine = (null == mCurrentSeq.getDescription());

   mCurrentSeq.setDescription(firstLine ? lineText
                                        : mCurrentSeq.getDescription() + " " + lineText);
}
//---------------------------------------------------------------------------
/**
 Removes the trailing period from the sequence description once the whole
 DEFINITION field has been read. Does nothing if there is no description or it
 doesn't end with a period.
 */
private void finishDefinition()
{
   String description = mCurrentSeq.getDescription();
   if (null == description
       || ! description.endsWith("."))
   {
      return;
   }

   int lastIndex = description.length() - 1;
   mCurrentSeq.setDescription(description.substring(0, lastIndex));
}
//---------------------------------------------------------------------------
/**
 Writes the DEFINITION line(s). The description is terminated with a period (if it
 doesn't already end with one) and wrapped at 67 characters. The text of every line
 starts in column 13, which is where parseDefinition() expects to find it.
 Nothing is written if the sequence has no description.
 * @param inSeq    the sequence being written
 * @param inWriter the destination Writer
 * @throws IOException if the write fails
 */
private void writeDefinition(T inSeq, Writer inWriter)
   throws IOException
{
   if (StringUtil.isSet(inSeq.getDescription()))
   {
      String description = inSeq.getDescription();
      if (! description.endsWith("."))
      {
         description += ".";
      }

      String[] lines = StringUtil.lines(StringUtil.wrap(description, 67));
      for (int i = 0; i < lines.length; i++)
      {
         // The keyword area is 12 columns wide. Only the first line carries the keyword.
         inWriter.write(String.format("%-12s%s\n",
                                      0 == i ? GenBankKeyword.DEFINITION : "",
                                      lines[i]));
      }
   }
}
//---------------------------------------------------------------------------
/**
 Writes the ACCESSION line with the accession starting in column 13.
 Nothing is written if the sequence has no id.
 * @param inSeq    the sequence being written
 * @param inWriter the destination Writer
 * @throws IOException if the write fails
 */
private void writeAccession(T inSeq, Writer inWriter)
   throws IOException
{
   if (StringUtil.isSet(inSeq.getID()))
   {
      // The keyword area is 12 columns wide
      inWriter.write(String.format("%-12s%s\n",
                                   GenBankKeyword.ACCESSION,
                                   getAccession(inSeq)));
   }
}
//---------------------------------------------------------------------------
/**
 Writes the VERSION line with the full sequence id starting in column 13, which is
 where parseVersion() expects to find it. Nothing is written if the sequence has no id.
 * @param inSeq    the sequence being written
 * @param inWriter the destination Writer
 * @throws IOException if the write fails
 */
private void writeVersion(T inSeq, Writer inWriter)
   throws IOException
{
   if (StringUtil.isSet(inSeq.getID()))
   {
      // The keyword area is 12 columns wide
      inWriter.write(String.format("%-12s%s\n",
                                   GenBankKeyword.VERSION,
                                   inSeq.getID()));
   }
}
//---------------------------------------------------------------------------
// Parse the VERSION keyword line
// Ex:
// VERSION AF181452.1 GI:6017929
// ^^^^^^^^^^ ^^^^^^^^^^
// Compound NCBI GI
// Accession Identifier
// Number
//
/**
 Parses the VERSION line. The first whitespace-delimited token after the keyword area
 becomes the sequence id; a second token, if present, is stored unchanged under the
 NCBI_GI_ATTR attribute.
 * @param inLine the VERSION line
 */
private void parseVersion(String inLine)
{
   String content = inLine.substring(12).trim();
   String[] tokens = content.split("\\s+");

   mCurrentSeq.setID(tokens[0]);

   boolean hasGi = tokens.length > 1;
   if (hasGi)
   {
      mCurrentSeq.setAttribute(NCBI_GI_ATTR, tokens[1]);
   }
}
//---------------------------------------------------------------------------
// Parse the KEYWORDS keyword line
// Ex:
// KEYWORDS EST.
//
/**
 Parses a KEYWORDS line and adds its keywords to BioSequencePlus sequences.
 GenBank separates keywords with semicolons (a keyword may itself contain commas)
 and ends the field with a period. A field consisting of only a period means that
 the record has no keywords.
 * @param inLine a KEYWORDS line or one of its continuation lines
 */
private void parseKeywords(String inLine)
{
   String field = inLine.substring(12).trim();
   if (field.endsWith("."))
   {
      field = field.substring(0, field.length() - 1).trim();
   }

   // "KEYWORDS    ." is the norm for records without keywords. Don't register an
   // empty keyword for it.
   if (field.isEmpty())
   {
      return;
   }

   if (mCurrentSeq instanceof BioSequencePlus)
   {
      ((BioSequencePlus) mCurrentSeq).addKeywords(field.split(";\\s*"));
   }
}
//---------------------------------------------------------------------------
// Parse the SOURCE keyword line
// Ex:
// SOURCE Homo sapiens (human)
// ORGANISM Homo sapiens
// Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
// Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
// Catarrhini; Hominidae; Homo.
//
// The SOURCE field consists of two parts. The first part is found after
// the SOURCE keyword and contains free-format information including an
// abbreviated form of the organism name followed by a molecule type;
// multiple lines are allowed, but the last line must end with a period.
// The second part consists of information found after the ORGANISM
// subkeyword. The formal scientific name for the source organism (genus
// and species, where appropriate) is found on the same line as ORGANISM.
// The records following the ORGANISM line list the taxonomic
// classification levels, separated by semicolons and ending with a
// period.
//
/**
 Parses a line of the SOURCE field. Only the ORGANISM line itself is used: its
 scientific name is looked up as an NCBI taxon and, for BioSequencePlus sequences,
 set on the sequence. If the name maps to several taxons, the one that sorts last
 is chosen. All other SOURCE lines are ignored.
 * @param inLine a SOURCE line, an ORGANISM line, or a continuation line
 */
private void parseSource(String inLine)
{
   if (! GenBankSubkeyword.ORGANISM.equals(mCurrentSubkeyword))
   {
      return;
   }

   String organismName = inLine.substring(12).trim();

   // For now, just keep the first line with the scientific name
   boolean isOrganismLine = inLine.trim().startsWith(GenBankSubkeyword.ORGANISM.name());
   if (! isOrganismLine
       || ! (mCurrentSeq instanceof BioSequencePlus))
   {
      return;
   }

   Set taxons = NCBITaxon.getByName(organismName);
   if (! CollectionUtil.hasValues(taxons))
   {
      return;
   }

   NCBITaxon taxon;
   if (taxons.size() <= 1)
   {
      taxon = taxons.iterator().next();
   }
   else
   {
      // TODO: Refine with a better way to choose
      // Choose the one with highest id
      List sortedTaxons = new ArrayList<>(taxons);
      Collections.sort(sortedTaxons);
      taxon = sortedTaxons.get(sortedTaxons.size() - 1);
   }

   ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon);
}
//---------------------------------------------------------------------------
// Parse the REFERENCE keyword line
// Ex:
// REFERENCE 1 (bases 1 to 342)
// AUTHORS Giachino,C., Padovan,E. and Lanzavecchia,A.
// TITLE kappa+lambda+ dual receptor B cells are present in the human
// peripheral repertoire
// JOURNAL J. Exp. Med. 181 (3), 1245-1250 (1995)
// PUBMED 7869042
//
// Publications by the authors of the sequence that discuss the data reported in
// the record. References are automatically sorted within the record based on date
// of publication, showing the oldest references first.
//
// Some sequences have not been reported in papers and show a status of "unpublished"
// or "in press". When an accession number and/or sequence data has appeared in print,
// sequence authors should send the complete citation of the article to [email protected]
// and the GenBank staff will revise the record.
//
// Various classes of publication can be present in the References field, including
// journal article, book chapter, book, thesis/monograph, proceedings chapter, proceedings
// from a meeting, and patent.
//
// The last citation in the REFERENCE field usually contains information about the
// submitter of the sequence, rather than a literature citation. It is therefore
// called the "submitter block" and shows the words "Direct Submission" instead of
// an article title. Additional information is provided below, under the header Direct
// Submission. Some older records do not contain a submitter block.
/**
 Parses one line of a REFERENCE field.
 A REFERENCE line starts a new citation (which is added to BioSequencePlus sequences)
 and records the base range if one is given. Any other line is applied to the current
 citation according to the current subkeyword: AUTHORS adds authors, TITLE and REMARK
 accumulate text across continuation lines, JOURNAL accumulates the raw journal text
 (refined later by finishReference()), and PUBMED sets the PubMed id.
 * @param inLine a REFERENCE line, a subkeyword line, or a continuation line
 * @throws ParseException declared for callers; see finishReference() for the date parsing
 */
private void parseReference(String inLine)
   throws ParseException
{
   if (inLine.startsWith(GenBankKeyword.REFERENCE.name()))
   {
      // Ex: REFERENCE 2 (bases 1 to 200000)
      mCurrentReference = new SeqCitation();

      if (mCurrentSeq instanceof BioSequencePlus)
      {
         ((BioSequencePlus) mCurrentSeq).addReference(mCurrentReference);
      }

      Matcher locationMatcher = sReferenceLocationPattern.matcher(inLine);
      if (locationMatcher.find())
      {
         int firstBase = Integer.parseInt(locationMatcher.group(1));
         int lastBase  = Integer.parseInt(locationMatcher.group(2));
         mCurrentReference.setSeqLocation(new SeqLocation(firstBase, lastBase));
      }

      return;
   }

   String field = inLine.substring(12).trim();

   if (GenBankSubkeyword.AUTHORS.equals(mCurrentSubkeyword))
   {
      // A wrapped author list can leave a dangling separator at the end of the line
      if (field.endsWith(","))
      {
         field = field.substring(0, field.length() - 1);
      }
      else if (field.endsWith(" and"))
      {
         field = field.substring(0, field.length() - 4);
      }

      if (! field.equals("."))
      {
         for (String authorString : field.split("(,\\s+|\\s+and\\s+)"))
         {
            mCurrentReference.addAuthor(new Author(authorString));
         }
      }
   }
   else if (GenBankSubkeyword.TITLE.equals(mCurrentSubkeyword))
   {
      String previousTitle = mCurrentReference.getTitle();
      mCurrentReference.setTitle(previousTitle != null ? previousTitle + " " + field : field);
   }
   else if (GenBankSubkeyword.JOURNAL.equals(mCurrentSubkeyword))
   {
      mCurrentReference.appendRawContent(field);

      // Until the reference is finished, the journal title holds the accumulated raw text
      String journalText = field;
      if (mCurrentReference.getJournal() != null)
      {
         journalText = mCurrentReference.getJournal().getTitle() + " " + journalText;
      }

      mCurrentReference.setJournal(new Journal(journalText));
   }
   else if (GenBankSubkeyword.PUBMED.equals(mCurrentSubkeyword))
   {
      mCurrentReference.setPubMedId(field);
   }
   else if (GenBankSubkeyword.REMARK.equals(mCurrentSubkeyword))
   {
      String previousRemark = mCurrentReference.getRemark();
      mCurrentReference.setRemark(previousRemark != null ? previousRemark + " " + field : field);
   }
}
//---------------------------------------------------------------------------
/**
 Refines the current citation based on the accumulated JOURNAL text once the whole
 REFERENCE field has been read. The text is tried, in order, as:
 <ol>
 <li>a patent ("Patent: ..."),</li>
 <li>a direct submission ("Submitted (dd-MMM-yyyy) ..."),</li>
 <li>a thesis ("Thesis ..."),</li>
 <li>a database-only publication ("Published Only in Database ..."),</li>
 <li>a journal citation (name, volume, issue, pages or article number, year).</li>
 </ol>
 Citations that already have a type other than journal, or that have no JOURNAL
 text at all, are left untouched.
 * @throws ParseException if a patent publication date or submission date can't be parsed
 */
private void finishReference()
   throws ParseException
{
   CitationType existingType = mCurrentReference.getType();
   if (existingType != null
       && ! existingType.equals(CitationType.journal))
   {
      return;
   }

   // A reference without a JOURNAL subfield has nothing to refine
   if (null == mCurrentReference.getJournal()
       || null == mCurrentReference.getJournal().getTitle())
   {
      return;
   }

   String journalText = mCurrentReference.getJournal().getTitle();

   // "Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?"
   Matcher m = sReferencePatentPattern.matcher(journalText);
   if (m.matches())
   {
      // It's not really a journal. It's a patent.
      mCurrentReference.setType(CitationType.patent);

      PatentData patentData = new PatentData()
            .setTitle(mCurrentReference.getTitle())
            .setInventors(mCurrentReference.getAuthors())
            .setPublicationNum(m.group(1))
            .setSeqIdNum(Integer.parseInt(m.group(2)))
            .setPublicationDate(mDateFormat.parse(m.group(3)));

      if (m.group(4) != null)
      {
         patentData.setApplicants(parsePatentApplicants(m.group(4)));
      }

      mCurrentReference.setPatentData(patentData);
      mCurrentReference.setJournal(null);
      return;
   }

   // If it's a direct submission of sequences, extract the submission date and contact info
   m = sReferenceDirectSubmissionPattern.matcher(journalText);
   if (m.matches())
   {
      mCurrentReference.setSubmissionDate(DateUtil.threadsafeParse(m.group(1), sDateFormat));
      mCurrentReference.setYear(Integer.parseInt(m.group(2)));
      mCurrentReference.setContactInfo(m.group(3));
      mCurrentReference.setJournal(null);
      return;
   }

   // Is it a thesis?
   m = sReferenceThesisPattern.matcher(journalText);
   if (m.matches())
   {
      mCurrentReference.setType(CitationType.thesis);

      String yearString = m.group(1);
      if (yearString != null)
      {
         mCurrentReference.setYear(Integer.parseInt(yearString));
      }

      mCurrentReference.setInstitution(m.group(2));
      mCurrentReference.setJournal(null);
      return;
   }

   if (journalText.startsWith("Published Only in Database"))
   {
      mCurrentReference.setType(CitationType.online_database);
      mCurrentReference.setJournal(null);
      return;
   }

   m = sReferenceJournalPattern.matcher(journalText);
   if (m.matches())
   {
      // OK, it's a journal. Fill out the rest of the journal-related fields.
      mCurrentReference.setType(CitationType.journal);
      mCurrentReference.setJournal(new Journal(m.group(1)));
      mCurrentReference.setVolume(m.group(2));
      mCurrentReference.setIssue(m.group(3));

      // Group 4 might be pages or an article number
      String pages = m.group(4);
      if (pages != null
          && pages.toUpperCase().startsWith("E"))
      {
         mCurrentReference.setArticleNumber(pages);
      }
      else
      {
         mCurrentReference.setPages(pages);
      }

      String year = m.group(5);
      if (year != null)
      {
         mCurrentReference.setYear(Integer.parseInt(year));
      }
   }
}
//---------------------------------------------------------------------------
// Because of a lack of format controls, it's nearly impossible to parse this
// content correctly. The best approach would be some sort of NLP.
// This method is protected instead of private to allow unit testing.
/**
 * Splits the free-text applicant portion of a patent reference into individual applicants.
 * <p>
 * If the text contains a semi-colon, it is split on semi-colons. When that yields two or
 * three non-empty pieces and the last one looks like a location (two characters long,
 * matches the patent location pattern, or is a single word), the pieces are treated as one
 * applicant and re-joined with ", ". Otherwise each piece is its own applicant.
 * <p>
 * Without a semi-colon the text is split on commas. A piece is merged into the previous
 * applicant when it is a single word, starts with "INC ", "INC. " or "LLC "
 * (case-insensitive), or is a parenthesized location following an applicant that is not
 * itself one.
 * <p>
 * Empty pieces are ignored in both modes.
 *
 * @param inApplicantString the applicant text; must not be null
 * @return the applicants in order of appearance; never null, possibly empty
 */
protected List<String> parsePatentApplicants(String inApplicantString)
{
   List<String> applicants = new ArrayList<>(3);

   // Values should be separated by semi-colons
   if (inApplicantString.contains(";"))
   {
      // Trim every piece and drop the empty ones (ex: from ";;" or "; ;")
      List<String> pieceList = new ArrayList<>(3);
      for (String rawPiece : inApplicantString.split(";"))
      {
         String piece = rawPiece.trim();
         if (piece.length() > 0)
         {
            pieceList.add(piece);
         }
      }

      String[] pieces = pieceList.toArray(new String[0]);

      // Ends with a location?
      boolean endsWithLocation = false;
      if (pieces.length > 1
          && pieces.length <= 3)
      {
         String lastPiece = pieces[pieces.length - 1];
         endsWithLocation = (2 == lastPiece.length() // Ends in a two letter country code?
                             || sPatentLocationPattern.matcher(lastPiece).matches() // ', \\w{2}'
                             || ! lastPiece.contains(" ")); // Last piece is a single word (probably a city name)
      }

      if (endsWithLocation)
      {
         applicants.add(StringUtil.join(pieces, ", "));
      }
      else
      {
         applicants.addAll(pieceList);
      }
   }
   else
   { // Sometimes the values are separated by commas
      for (String rawPiece : inApplicantString.split(","))
      {
         String piece = rawPiece.trim();
         if (0 == piece.length())
         {
            continue;
         }

         String ucPiece = piece.toUpperCase();

         // Is it a single word or a company suffix?
         boolean belongsToPrevious = false;
         if (applicants.size() > 0)
         {
            belongsToPrevious = ! piece.contains(" ")
                                || ucPiece.startsWith("INC ")
                                || ucPiece.startsWith("INC. ")
                                || ucPiece.startsWith("LLC ")
                                || (sPatentParensLocationPattern.matcher(ucPiece).matches()
                                    && ! sPatentParensLocationPattern.matcher(applicants.get(applicants.size() - 1)).matches());
         }

         if (belongsToPrevious)
         {
            // Add it to the previous piece
            int lastIndex = applicants.size() - 1;
            applicants.set(lastIndex, applicants.get(lastIndex) + ", " + piece);
         }
         else
         {
            applicants.add(piece);
         }
      }
   }

   return applicants;
}
//---------------------------------------------------------------------------
// Parse the COMMENT keyword line
// Ex:
// COMMENT     Contact: Wilson RK
//             Washington University School of Medicine
//             4444 Forest Park Parkway, Box 8501, St. Louis, MO 63108
//             Tel: 314 286 1800
//             Fax: 314 286 1810
//             Email: est@watson.wustl.edu
//             Insert Size: 1482
//             High quality sequence stops: 353 Source: IMAGE Consortium, LLNL
//             This clone is available royalty-free through LLNL ; contact the
//             IMAGE Consortium (info@image.llnl.gov) for further information.
//             Insert Length: 1482  Std Error: 0.00
//             Seq primer: M13RP1
//             High quality sequence stop: 353.
//
/**
 * Adds the text of one COMMENT line (keyword line or continuation) to the current
 * sequence's comment attribute. Successive lines are joined with a newline.
 * <p>
 * The text is taken from character index 12 onward and trimmed. A line that is too
 * short to reach that index contributes an empty string instead of raising a
 * StringIndexOutOfBoundsException, so a blank line inside a comment block is kept
 * as an empty line.
 *
 * @param inLine the raw line from the record
 */
private void parseComment(String inLine)
{
   // The first 12 characters hold the keyword (or blanks on a continuation line)
   String field = (inLine.length() > 12 ? inLine.substring(12).trim() : "");

   Object existingComment = mCurrentSeq.getAttribute(COMMENT_ATTR);
   if (existingComment != null)
   {
      mCurrentSeq.setAttribute(COMMENT_ATTR, existingComment + "\n" + field);
   }
   else
   {
      mCurrentSeq.setAttribute(COMMENT_ATTR, field);
   }
}
//---------------------------------------------------------------------------
// Parse the DBLINK keyword line
// Ex:
// DBLINK      BioProject:PRJNA174162,PRJNA999998,PRJNA999999
//             BioSample: SAMN01795900
//
// "This line contains cross-references to other underlying resources that
// support the existence of a GenBank sequence record...
// A DBLINK cross-reference consists of two data fields delimited by a colon.
// The first field provides the cross-reference type ("BioProject"), while the
// second contains the actual cross-reference identifier ("PRJNA177352").
// The second field can consist of multiple comma-separated identifiers,
// if a sequence record has multiple DBLINK cross-references of a given type."
//
/**
 * Parses one DBLINK line (keyword line or continuation) and adds a DbXref to the
 * current sequence for every identifier on it. Does nothing unless the current
 * sequence is a BioSequencePlus.
 * <p>
 * A line of the form {@code type:id[,id...]} starts a new cross-reference type.
 * Any other line is treated as a continuation carrying more identifiers for the
 * database of the most recently added cross-reference.
 * <p>
 * Lines with no content and empty identifiers (ex: from a trailing or doubled comma)
 * are ignored.
 *
 * @param inLine the raw line from the record
 * @throws SeqFormatException if a continuation line is found before any cross-reference
 */
private void parseDBLink(String inLine)
{
   if (! (mCurrentSeq instanceof BioSequencePlus))
   {
      return;
   }

   BioSequencePlus sequencePlus = (BioSequencePlus) mCurrentSeq;

   // The first 12 characters hold the keyword (or blanks on a continuation line)
   String field = (inLine.length() > 12 ? inLine.substring(12).trim() : "");
   if (0 == field.length())
   {
      return;
   }

   String db;
   String idList;

   String[] pieces = field.split(":");
   if (2 == pieces.length)
   {
      db = pieces[0].trim();
      idList = pieces[1];
   }
   else
   {
      // Continuation of previous db identifiers
      if (null == sequencePlus.getDbXrefs()
          || 0 == sequencePlus.getDbXrefs().size())
      {
         throw new SeqFormatException("DBLINK continuation " + StringUtil.singleQuote(field)
                                      + " found without a preceding cross-reference type!");
      }

      db = sequencePlus.getDbXrefs().get(sequencePlus.getDbXrefs().size() - 1).getDB();
      idList = field;
   }

   for (String value : idList.split(","))
   {
      String id = value.trim();
      if (id.length() > 0)
      {
         sequencePlus.addDbXref(new DbXref(db, id));
      }
   }
}
//---------------------------------------------------------------------------
/**
 * Writes the DBLINK section for the specified cross-references.
 * <p>
 * Identifiers are grouped by database in order of first appearance and written as
 * {@code db:id} followed by the remaining ids of that database appended via
 * StringBuilderPlus.delimitedAppend(). Each group is wrapped at 67 characters.
 * The first line written carries the DBLINK keyword in a 12 character column;
 * all following lines have that column blank. Every line is terminated with a newline
 * so that successive lines and the next section do not run together.
 *
 * @param inDBXrefs the cross-references to write
 * @param inWriter  the destination
 * @throws IOException if writing fails
 */
private void writeDBLinks(List<DbXref> inDBXrefs, Writer inWriter)
   throws IOException
{
   // Group the ids by db, preserving the order in which the dbs were first seen
   Map<String, StringBuilderPlus> xRefMap = new OrderedMap<>(4);
   for (DbXref xref : inDBXrefs)
   {
      StringBuilderPlus dbLine = xRefMap.get(xref.getDB());
      if (null == dbLine)
      {
         dbLine = new StringBuilderPlus(xref.getDB() + ":" + xref.getId());
         xRefMap.put(xref.getDB(), dbLine);
      }
      else
      {
         dbLine.delimitedAppend(xref.getId());
      }
   }

   int count = 0;
   for (String db : xRefMap.keySet())
   {
      String[] lines = StringUtil.lines(StringUtil.wrap(xRefMap.get(db).toString(), 67));
      for (String line : lines)
      {
         count++;
         // Only the very first line gets the keyword
         inWriter.write(String.format("%-12.12s%s\n", (1 == count ? GenBankKeyword.DBLINK : ""), line));
      }
   }
}
//---------------------------------------------------------------------------
// Parse the CONTIG keyword line
// Ex:
// CONTIG      join(D86993.1:7160..39752,D87004.2:803..13993)
//
/**
 * Adds the text of one CONTIG line (keyword line or continuation) to the current
 * sequence's contig attribute. The text is taken from character index 12 onward,
 * trimmed, and concatenated directly onto any existing value with no separator.
 *
 * @param inLine the raw line from the record
 */
private void parseContig(String inLine)
{
   String segment = inLine.substring(12).trim();

   Object contigSoFar = mCurrentSeq.getAttribute(CONTIG_ATTR);
   if (null == contigSoFar)
   {
      // First CONTIG line for this sequence
      mCurrentSeq.setAttribute(CONTIG_ATTR, segment);
      return;
   }

   mCurrentSeq.setAttribute(CONTIG_ATTR, contigSoFar + segment);
}
//---------------------------------------------------------------------------
/**
 * Parses one line of the feature table. The FEATURES header line itself is ignored.
 * <p>
 * Characters at index 5 to 19 hold an optional feature key; the content starts at
 * index 21. A line with a feature key starts a new feature whose location is the content.
 * A line without a key is, in order of precedence: a new qualifier (if the content
 * matches the qualifier pattern), a continuation of the current qualifier's value, or a
 * continuation of the current feature's location.
 * <p>
 * When a new feature or qualifier starts, the value of the previous qualifier is
 * unquoted if it begins with a double quote.
 * <p>
 * Lines too short to reach the key or content columns are treated as having an empty
 * key or empty content instead of raising a StringIndexOutOfBoundsException.
 *
 * @param inLine the raw line from the record
 * @throws SeqFormatException if the feature key or qualifier name is not recognized,
 *                            or a qualifier appears before any feature
 */
private void parseFeatures(String inLine)
{
   if (inLine.startsWith(GenBankKeyword.FEATURES.name()))
   {
      return;
   }

   int lineLength = inLine.length();
   String featureKeyString = (lineLength > 5 ? inLine.substring(5, Math.min(20, lineLength)).trim() : "");
   String content = (lineLength > 21 ? inLine.substring(21).trim() : "");

   // Is there a feature key on this line?
   if (StringUtil.isSet(featureKeyString))
   {
      GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString);
      if (null == featureKey)
      {
         throw new SeqFormatException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!");
      }

      mCurrentFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(content));
      if (mCurrentSeq instanceof BioSequencePlus)
      {
         ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentFeature);
      }

      // Unquote the previous qualifier if necessary
      unquoteCurrentFeatureQualifierValue();
      mCurrentFeatureQualifier = null;
      return;
   }

   Matcher m = sFeatureQualifierPattern.matcher(content);
   if (m.matches())
   {
      // New qualifier
      // Unquote the previous qualifier if necessary
      unquoteCurrentFeatureQualifierValue();

      GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1));
      if (null == qualifierName)
      {
         throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
      }

      if (null == mCurrentFeature)
      {
         throw new SeqFormatException("Qualifier " + StringUtil.singleQuote(m.group(1)) + " found before any feature key!");
      }

      mCurrentFeatureQualifier = new GenBankFeatureQualifier(qualifierName);
      mCurrentFeature.addQualifier(mCurrentFeatureQualifier);

      // Group 2 is absent for qualifiers that have no value
      String value = m.group(2);
      if (value != null)
      {
         mCurrentFeatureQualifier.appendToValue(value);
      }
   }
   else if (mCurrentFeatureQualifier != null)
   {
      // Continuation of a previous qualifier
      mCurrentFeatureQualifier.appendToValue(content);
   }
   else if (mCurrentFeature != null)
   {
      // Continuation of a feature location
      mCurrentFeature.getLocation().append(content);
   }
}

//---------------------------------------------------------------------------
// Strips the surrounding quotes from the current qualifier's value if it starts with a
// double quote. Does nothing if there is no current qualifier or it has no value.
private void unquoteCurrentFeatureQualifierValue()
{
   if (mCurrentFeatureQualifier != null)
   {
      String value = mCurrentFeatureQualifier.getValue();
      if (value != null
          && value.startsWith("\""))
      {
         mCurrentFeatureQualifier.setValue(StringUtil.unquote(value));
      }
   }
}
//---------------------------------------------------------------------------
/**
 * Writes a REFERENCE section for each of the specified citations.
 * <p>
 * Every line puts its keyword or subkeyword in a 12 character column so that the data
 * starts at index 12, which is where this class's line parsers read it from. Values are
 * wrapped at 67 characters and continuation lines leave the label column blank.
 * AUTHORS, TITLE, JOURNAL and REMARK are indented two spaces; PUBMED three.
 * <p>
 * Citations are numbered from 1 in list order. A null author list is treated as empty.
 * TITLE, PUBMED and REMARK are written only if set.
 *
 * @param inSeqCitations the citations to write
 * @param inWriter       the destination
 * @throws IOException if writing fails
 */
private void writeReferences(List<SeqCitation> inSeqCitations, Writer inWriter)
   throws IOException
{
   int count = 0;
   for (SeqCitation citation : inSeqCitations)
   {
      count++;

      // REFERENCE   1  (bases 1 to 5028)
      inWriter.write(String.format("%-12.12s%-3d%s\n",
                                   GenBankKeyword.REFERENCE,
                                   count,
                                   citation.getSeqLocation() != null ? "(bases " + citation.getSeqLocation().getStart() + " to " + citation.getSeqLocation().getEnd() + ")" : ""));

      // Authors
      int numAuthors = (citation.getAuthors() != null ? citation.getAuthors().size() : 0);
      StringBuilderPlus authors = new StringBuilderPlus().setDelimiter(", ");
      for (int i = 0; i < numAuthors; i++)
      {
         Author author = citation.getAuthors().get(i);
         String formattedAuthor = author.getLastName() + "," + author.getFirstInitial() + ".";
         if (numAuthors > 1
             && i == numAuthors - 1)
         {
            // The last of several authors is joined with " and " instead of the delimiter
            authors.append(" and " + formattedAuthor);
         }
         else
         {
            authors.delimitedAppend(formattedAuthor);
         }
      }

      writeReferenceSubfield("  " + GenBankSubkeyword.AUTHORS.name(), authors.toString(), inWriter);

      // Title
      if (StringUtil.isSet(citation.getTitle()))
      {
         writeReferenceSubfield("  " + GenBankSubkeyword.TITLE.name(), citation.getTitle(), inWriter);
      }

      // Journal
      // JOURNAL   J. Exp. Med. 188 (11), 2151-2162 (1998)
      StringBuilderPlus journal = new StringBuilderPlus();
      // NOTE(review): the citation's toString() is preferred when it is set; confirm that
      // it yields the JOURNAL text and that the fallback below is actually reachable.
      if (StringUtil.isSet(citation.toString()))
      {
         journal.append(citation.toString());
      }
      else
      {
         journal.append(citation.getJournal())
                .append(".")
                .append(citation.getVolume() != null ? " " + citation.getVolume() : "")
                .append(citation.getIssue() != null ? " (" + citation.getIssue() + ")" : "")
                .append(citation.getPages() != null ? ", " + citation.getPages() : "")
                .append(citation.getYear() != null ? " (" + citation.getYear() + ")" : "");
      }

      writeReferenceSubfield("  " + GenBankSubkeyword.JOURNAL.name(), journal.toString(), inWriter);

      // Pubmed id
      if (StringUtil.isSet(citation.getPubMedId()))
      {
         inWriter.write(String.format("%-12.12s%s\n",
                                      "   " + GenBankSubkeyword.PUBMED.name(),
                                      citation.getPubMedId()));
      }

      // Remark
      if (StringUtil.isSet(citation.getRemark()))
      {
         writeReferenceSubfield("  " + GenBankSubkeyword.REMARK.name(), citation.getRemark(), inWriter);
      }
   }
}

//---------------------------------------------------------------------------
// Writes a reference subfield wrapped at 67 characters. The (already indented) label
// occupies the 12 character label column of the first line only.
private void writeReferenceSubfield(String inIndentedLabel, String inValue, Writer inWriter)
   throws IOException
{
   String[] lines = StringUtil.lines(StringUtil.wrap(inValue, 67));
   for (int i = 0; i < lines.length; i++)
   {
      inWriter.write(String.format("%-12.12s%s\n",
                                   0 == i ? inIndentedLabel : "",
                                   lines[i]));
   }
}
//---------------------------------------------------------------------------
/**
 * Writes one feature table entry: the feature key with its location, followed by
 * one entry per qualifier.
 * <p>
 * The column layout mirrors what parseFeatures() reads: five blanks, the feature key in
 * a 15 character column (index 5 to 19), one blank, and the content from index 21.
 * Location and qualifier text is wrapped at 58 characters; continuation and qualifier
 * lines leave the first 21 characters blank.
 * <p>
 * A qualifier is written as {@code /name} or, if it has a value, {@code /name="value"}.
 *
 * @param inSeqFeature the feature to write
 * @param inWriter     the destination
 * @throws IOException if writing fails
 */
private void writeFeature(SeqFeature inSeqFeature, Writer inWriter)
   throws IOException
{
   String[] lines = StringUtil.lines(StringUtil.wrap(inSeqFeature.getLocation().toString(), 58));
   for (int i = 0; i < lines.length; i++)
   {
      // Only the first location line carries the feature key
      inWriter.write(String.format("%-5s%-15.15s %s\n",
                                   "",
                                   0 == i ? inSeqFeature.name() : "",
                                   lines[i]));
   }

   if (CollectionUtil.hasValues(inSeqFeature.getQualifiers()))
   {
      for (FeatureQualifier qualifier : inSeqFeature.getQualifiers())
      {
         String qualifierString = "/" + qualifier.name();
         if (StringUtil.isSet(qualifier.getValue()))
         {
            qualifierString += "=\"" + qualifier.getValue() + "\"";
         }

         lines = StringUtil.lines(StringUtil.wrap(qualifierString, 58));
         for (int i = 0; i < lines.length; i++)
         {
            inWriter.write(String.format("%-21s%s\n", "", lines[i]));
         }
      }
   }
}
//###########################################################################
// INNER CLASS
//###########################################################################
/**
 * Reader used for the sequence portion of a record. Its single-character read() passes
 * through only characters that are not whitespace, not digits, and not '/'.
 */
class GenBankSeqFilterReader extends LettersOnlyReader
{
   //---------------------------------------------------------------------------
   public GenBankSeqFilterReader(Reader inReader)
   {
      super(inReader);
   }

   //---------------------------------------------------------------------------
   /**
    * Returns the next value from innerRead() that is either negative or a character
    * which is not whitespace, not a digit, and not '/'.
    *
    * @return the next accepted character, or the negative value returned by innerRead()
    * @throws IOException if the underlying read fails
    */
   @Override
   public int read()
      throws IOException
   {
      while (true)
      {
         int candidate = innerRead();
         if (candidate < 0)
         {
            // Negative values are handed back untouched
            return candidate;
         }

         boolean skipped = Character.isWhitespace(candidate)
                           || Character.isDigit(candidate)
                           || '/' == candidate;
         if (! skipped)
         {
            return candidate;
         }
      }
   }
}
}