// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// com.hfg.bio.seq.format.GenBank Maven / Gradle / Ivy
//
// There is a newer version: 20240423
// Show newest version
package com.hfg.bio.seq.format;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.DbXref;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.Clone;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.bio.seq.SeqTopology;
import com.hfg.bio.seq.format.feature.FeatureQualifier;
import com.hfg.bio.seq.format.feature.SeqFeature;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.format.feature.genbank.*;
import com.hfg.bio.seq.format.feature.qualifier.MolType;
import com.hfg.bio.seq.format.genbank.GenBankKeyword;
import com.hfg.bio.seq.format.genbank.GenBankSubkeyword;
import com.hfg.bio.seq.format.genbank.InvalidGenBankKeywordException;
import com.hfg.bio.seq.format.genbank.InvalidGenBankSubkeywordException;
import com.hfg.bio.taxonomy.ncbi.NCBIGenBankDivision;
import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
import com.hfg.citation.Author;
import com.hfg.citation.CitationType;
import com.hfg.citation.Journal;
import com.hfg.citation.PatentData;
import com.hfg.datetime.DateUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.OrderedMap;
import com.hfg.util.io.LettersOnlyReader;

//------------------------------------------------------------------------------
/**
 GenBank sequence format.
 

See ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt

See http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html

@author J. Alex Taylor, hairyfatguy.com */ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class GenBank extends ReadableSeqFormatBase implements WritableSeqFormat { // Variables used during parsing private T mCurrentSeq; private GenBankKeyword mCurrentKeyword; private GenBankSubkeyword mCurrentSubkeyword; private GenBankFeature mCurrentFeature; private GenBankFeatureQualifier mCurrentFeatureQualifier; private SeqCitation mCurrentReference; private Integer mSeqLengthFromLocusLine; private SimpleDateFormat mDateFormat = new SimpleDateFormat("dd-MMM-yyyy"); private int mMaxExceptionsPerRecord = 0; // private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)\\s+(?:[\\-\\w]+)?(?:\\s+(\\w+))?\\s+(\\w{3})\\s+(\\S{11})"); // private static final Pattern sLocusPattern = 
Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?"); private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s+(\\S+)?\\s+(?:\\S+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA|cRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?"); private static final Pattern sFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?"); private static final Pattern sReferenceLocationPattern = Pattern.compile("\\(bases (\\d+) to (\\d+)\\)"); private static final Pattern sReferencePatentPattern = Pattern.compile("Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?"); private static final Pattern sReferenceDirectSubmissionPattern = Pattern.compile("Submitted \\((\\d{2}-\\w{3}-(\\d{4}))\\)\\s+(.*)"); private static final Pattern sPatentLocationPattern = Pattern.compile(".+, \\w{2}"); private static final Pattern sPatentParensLocationPattern = Pattern.compile(".+ \\(\\w{2}\\)"); // Examples: // Thesis // Thesis (1996) Utrecht University, The Netherlands private static final Pattern sReferenceThesisPattern = Pattern.compile("Thesis(?: \\((\\d{4})\\)\\s+(.*))?"); // Examples: // Proc. Natl. Acad. Sci. U.S.A. 82 (3), 844-848 (1985) // Front Immunol 9, 1079 (2018) // Nat Commun (2018) In press // Dev. Comp. Immunol. 25 (5-6), 387-401 // J. Exp. Zool. 295B (1), 45-58 (2003) // PLoS ONE 8 (8), E70650 (2013) // Mol. Phylogenet. Evol. 
94 (Pt B), 577-590 (2016) private static final Pattern sReferenceJournalPattern = Pattern.compile("(.+?)(?:\\s+(\\S+)(?:\\s+\\(([^\\)]+)\\))?,\\s+(\\d+(?:\\-\\d+)?|E\\d+))?(?:\\s+\\((\\d{4})\\))?(?: In press)?", Pattern.CASE_INSENSITIVE); private static final SimpleDateFormat sDateFormat = new SimpleDateFormat("dd-MMM-yyyy"); public static final String COMMENT_ATTR = "Comment"; public static final String NCBI_GI_ATTR = "NCBI GI"; public static final String CONTIG_ATTR = "Contig"; private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName()); static { LOGGER.setLevel(Level.WARNING); LOGGER.setUseParentHandlers(true); } //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- public GenBank(BioSequenceFactory inSeqFactory) { super(inSeqFactory); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public static Logger getLogger() { return LOGGER; } //--------------------------------------------------------------------------- public boolean isEndOfRecord(String inLine) { // Trying for something slightly more efficient than inLine.trim().equals("//") return inLine.startsWith("//") && 2 == inLine.trim().length(); } //--------------------------------------------------------------------------- public boolean hasJanusDelimiter() { return false; } //--------------------------------------------------------------------------- /** Specify the maximum number of Exceptions to tolerate per record. Defaults to zero. This mechanism will only work with sequences objects that implement the BioSequencePlus interface. 
If a record produces fewer than the specified maximum number of Exceptions, the Exceptions can be
    retrieved via the getParseExceptions() method on the BioSequencePlus sequence object.
    @param inValue the maximum number of Exceptions to tolerate per record
    @return this format object to facilitate method chaining.
    */
   public GenBank<T> setMaxExceptionsPerRecord(int inValue)
   {
      mMaxExceptionsPerRecord = inValue;
      return this;
   }

   //---------------------------------------------------------------------------
   /**
    Reads a single GenBank record.
    <p>
    Lines ahead of the LOCUS line are skipped (up to 50 of them). Header lines are then parsed
    until either the end-of-record line or the ORIGIN line is reached. After an ORIGIN line the
    reader itself is handed to the sequence object as the source of the sequence data.
    The reader is closed before returning normally.
    </p>
    @param inReader the reader positioned at the start of the record
    @return the populated sequence object
    @throws SeqIOException if the record cannot be parsed or no LOCUS line is found
    */
   public T readRecord(BufferedReader inReader)
      throws SeqIOException
   {
      initRecordParsing();

      int lineCount = 0;
      int maxPreLocusLines = 50;
      boolean locusLineFound = false;
      boolean originLineFound = false;

      mCurrentSeq = getBioSequenceFactory().createSeqObj();

      try
      {
         String line;
         while ((line = inReader.readLine()) != null)
         {
            lineCount++;

            try
            {
               if (! locusLineFound)
               {
                  if (lineCount > maxPreLocusLines)
                  {
                     throw new SeqFormatException("No GenBank " + GenBankKeyword.LOCUS + " line found within " + maxPreLocusLines + " lines of the start!");
                  }

                  try
                  {
                     GenBankKeyword keyword = getLineKeyword(line);
                     if (! GenBankKeyword.LOCUS.equals(keyword))
                     {
                        continue;
                     }

                     locusLineFound = true;
                  }
                  catch (InvalidGenBankKeywordException e)
                  {
                     // Anything unrecognizable ahead of the LOCUS line is ignored
                     continue;
                  }
               }
               else if (isEndOfRecord(line))
               {
                  break;
               }

               if (StringUtil.isSet(line))
               {
                  parseLine(line);

                  if (GenBankKeyword.ORIGIN.equals(mCurrentKeyword))
                  {
                     originLineFound = true;
                     break;
                  }
               }
            }
            catch (Exception e)
            {
               SeqIOException seqIOException = new SeqIOException("Problem parsing " + (StringUtil.isSet(mCurrentSeq.getID()) ? mCurrentSeq.getID() + " " : "") + "record line " + lineCount + " : " + StringUtil.singleQuote(line), e);

               // Record the problem on the sequence instead of failing if we are still under the tolerated limit
               if (mMaxExceptionsPerRecord > 0
                   && mCurrentSeq instanceof BioSequencePlus
                   && (! ((BioSequencePlus) mCurrentSeq).hadParseExceptions()
                       || ((BioSequencePlus) mCurrentSeq).getParseExceptions().size() < mMaxExceptionsPerRecord))
               {
                  ((BioSequencePlus) mCurrentSeq).addParseException(seqIOException);
                  GenBank.getLogger().warning(e.getMessage());
               }
               else
               {
                  throw seqIOException;
               }
            }
         }

         if (! locusLineFound)
         {
            throw new SeqFormatException("No GenBank LOCUS line found!");
         }

         if (originLineFound)
         {
            // The rest of the record is assumed to be sequence.
            // NOTE(review): the raw reader is passed along, so the sequence object is presumably
            // responsible for discarding the position numbers and spaces - confirm.
            mCurrentSeq.setSequence(inReader);
         }

         inReader.close();
      }
      catch (Exception e)
      {
         throw new SeqIOException("Problem parsing GenBank record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
      }

      return mCurrentSeq;
   }

   //---------------------------------------------------------------------------
   /**
    Writes the specified sequences as consecutive GenBank records.
    @param inSeqs the sequences to write
    @return the GenBank-formatted text
    @throws SeqIOException if a record cannot be written
    */
   public String write(Collection<T> inSeqs)
      throws SeqIOException
   {
      StringWriter writer = new StringWriter();

      for (T seq : inSeqs)
      {
         write(seq, writer);
      }

      return writer.toString();
   }

   //---------------------------------------------------------------------------
   /**
    Writes the specified sequence as a GenBank record.
    @param inSeq the sequence to write
    @return the GenBank-formatted text
    @throws SeqIOException if the record cannot be written
    */
   public String write(T inSeq)
      throws SeqIOException
   {
      StringWriter writer = new StringWriter();

      write(inSeq, writer);

      return writer.toString();
   }

   //---------------------------------------------------------------------------
   /**
    Writes the specified sequence as a GenBank record to a stream. The stream is flushed but not closed.
    @param inSeq the sequence to write
    @param inStream the destination stream
    @throws SeqIOException if the record cannot be written
    */
   public void write(T inSeq, OutputStream inStream)
      throws SeqIOException
   {
      Writer writer = new OutputStreamWriter(inStream);

      write(inSeq, writer);

      try
      {
         writer.flush();
      }
      catch (Exception e)
      {
         throw new SeqIOException(e);
      }
   }

   //---------------------------------------------------------------------------
   /**
    Writes the specified sequence as a GenBank record to a writer. The writer is flushed but not closed.
    @param inSeq the sequence to write
    @param inWriter the destination writer
    @throws SeqIOException if the record cannot be written
    */
   public void write(T inSeq, Writer inWriter)
      throws SeqIOException
   {
      Reader seqReader = null;
      BufferedWriter writer = null;

      try
      {
         try
         {
            // Reuse the caller's writer if it is already buffered
            if (inWriter instanceof BufferedWriter)
            {
               writer = (BufferedWriter) inWriter;
            }
            else
            {
               writer = new BufferedWriter(inWriter, 8192);
            }

            // Write the LOCUS line
            writeLocus(inSeq, writer);

            // Write the DEFINITION line(s)
            writeDefinition(inSeq, writer);

            // Write the ACCESSION line
            writeAccession(inSeq, writer);

            // Write the VERSION line
            writeVersion(inSeq, writer);

            // TODO: SOURCE

            if (inSeq instanceof BioSequencePlus)
            {
               BioSequencePlus seqPlus = (BioSequencePlus) inSeq;

               if (CollectionUtil.hasValues(seqPlus.getDbXrefs()))
               {
                  writeDBLinks(seqPlus.getDbXrefs(), writer);
               }

               if (CollectionUtil.hasValues(seqPlus.getReferences()))
               {
                  writeReferences(seqPlus.getReferences(), writer);
               }

               // Write features
               if (CollectionUtil.hasValues(seqPlus.getFeatures()))
               {
                  // The 'Location/Qualifiers' header starts in column 22 of the feature table header line
                  writer.write(GenBankKeyword.FEATURES + "             Location/Qualifiers\n");

                  for (SeqFeature seqFeature : seqPlus.getFeatures())
                  {
                     writeFeature(seqFeature, writer);
                  }
               }
            }

            // Write the sequence lines
            writer.write(GenBankKeyword.ORIGIN + "\n");

            seqReader = inSeq.getSequenceReader();
            writeOriginLines(seqReader, writer);

            // Write end of record line
            writer.write("//\n");
         }
         finally
         {
            if (seqReader != null)
            {
               seqReader.close();
            }

            if (writer != null)
            {
               writer.flush();
            }
         }
      }
      catch (SeqIOException e)
      {
         throw e;
      }
      catch (Exception e)
      {
         throw new SeqIOException(e);
      }
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Writes the sequence as lines of up to 60 residues in space-separated groups of 10,
   // each line prefixed with the right-justified position of its first residue.
   private void writeOriginLines(Reader inSeqReader, Writer inWriter)
      throws IOException
   {
      final int residuesPerLine  = 60;
      final int residuesPerGroup = 10;

      char[] buffer = new char[residuesPerLine];
      int residueNum = 1;
      int numRead;
      while ((numRead = readFully(inSeqReader, buffer)) > 0)
      {
         inWriter.write(String.format("%9d", residueNum));

         for (int i = 0; i < numRead; i += residuesPerGroup)
         {
            inWriter.write(" ");
            inWriter.write(buffer, i, Math.min(residuesPerGroup, numRead - i));
         }

         inWriter.write("\n");

         residueNum += numRead;
      }
   }

   //---------------------------------------------------------------------------
   // Fills the buffer from the reader, continuing after short reads, and returns the number
   // of characters stored. Fewer than inBuffer.length means the end of the reader was reached.
   private static int readFully(Reader inReader, char[] inBuffer)
      throws IOException
   {
      int total = 0;
      while (total < inBuffer.length)
      {
         int numRead = inReader.read(inBuffer, total, inBuffer.length - total);
         if (numRead < 0)
         {
            break;
         }

         total += numRead;
      }

      return total;
   }

   //---------------------------------------------------------------------------
   // Clears all per-record parsing state ahead of reading a new record.
   private void initRecordParsing()
   {
      mCurrentSeq               = null;
      mCurrentKeyword           = null;
      mCurrentSubkeyword        = null;
      mCurrentFeature           = null;
      mCurrentFeatureQualifier  = null;
      mCurrentReference         = null;
      mSeqLengthFromLocusLine   = null;
   }

   //---------------------------------------------------------------------------
   // A keyword starts in column one and has a maximum of 10 characters.
   // Returns null if the line is empty or doesn't start with a letter.
   // Throws InvalidGenBankKeywordException if the leading text isn't a known keyword.
   private GenBankKeyword getLineKeyword(String inLine)
   {
      // Blank lines (e.g. ahead of the LOCUS line) carry no keyword
      if (null == inLine
          || inLine.isEmpty()
          || ! Character.isLetter(inLine.charAt(0)))
      {
         return null;
      }

      GenBankKeyword keyword = null;

      String keywordString = (inLine.length() > 11 ? inLine.substring(0, 11) : inLine).trim();
      if (StringUtil.isSet(keywordString))
      {
         keyword = GenBankKeyword.valueOf(keywordString);
         if (null == keyword)
         {
            throw new InvalidGenBankKeywordException(StringUtil.singleQuote(keywordString) + " is not a recognized GenBank keyword!");
         }
      }

      return keyword;
   }

   //---------------------------------------------------------------------------
   // A subkeyword starts in column three and has a maximum of 8 characters.
   // Returns null if the line doesn't start with two whitespace characters or has no text
   // in the subkeyword columns. Throws InvalidGenBankSubkeywordException if the text isn't
   // a known subkeyword or isn't allowed under the current keyword.
   private GenBankSubkeyword getLineSubkeyword(String inLine)
   {
      if (inLine.length() < 2
          || ! Character.isWhitespace(inLine.charAt(0))
          || ! Character.isWhitespace(inLine.charAt(1)))
      {
         return null;
      }

      GenBankSubkeyword subkeyword = null;

      String subkeywordString = (inLine.length() > 11 ? inLine.substring(2, 11) : inLine).trim();
      if (StringUtil.isSet(subkeywordString))
      {
         subkeyword = GenBankSubkeyword.valueOf(subkeywordString);
         if (null == subkeyword)
         {
            throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword!");
         }
         else if (! mCurrentKeyword.allowsSubkeyword(subkeyword))
         {
            throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword of " + mCurrentKeyword + "!");
         }
      }

      return subkeyword;
   }

   //---------------------------------------------------------------------------
   // Called when a new keyword line is encountered so that the field that just ended
   // can be post-processed.
   private void finishPreviousKeyword()
      throws ParseException
   {
      if (GenBankKeyword.DEFINITION.equals(mCurrentKeyword))
      {
         finishDefinition();
      }
      else if (GenBankKeyword.REFERENCE.equals(mCurrentKeyword))
      {
         finishReference();
      }
      else if (mCurrentFeatureQualifier != null)
      {
         // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES.
         // A qualifier without a value (the value is optional in the qualifier pattern) is left alone.
         String lastQualifierValue = mCurrentFeatureQualifier.getValue();
         if (lastQualifierValue != null
             && lastQualifierValue.startsWith("\""))
         {
            mCurrentFeatureQualifier.setValue(StringUtil.unquote(lastQualifierValue));
         }

         mCurrentFeatureQualifier = null;

         if (mCurrentSeq instanceof BioSequencePlus)
         {
            applySourceFeatureToSeq((BioSequencePlus) mCurrentSeq);
         }
      }
   }

   //---------------------------------------------------------------------------
   // Copies the mol_type, taxon db_xref, and clone / sub_clone qualifiers of the first
   // 'source' feature (if there is one) onto the sequence object.
   private void applySourceFeatureToSeq(BioSequencePlus inSeq)
   {
      List<? extends SeqFeature> sourceFeatures = inSeq.getFeatures(GenBankFeatureKey.source);
      if (! CollectionUtil.hasValues(sourceFeatures))
      {
         return;
      }

      SeqFeature source = sourceFeatures.get(0);

      List<? extends FeatureQualifier> molTypeQualifiers = source.getQualifiers(GenBankFeatureQualifierName.mol_type.name());
      if (CollectionUtil.hasValues(molTypeQualifiers))
      {
         MolType molType = MolType.valueOf(molTypeQualifiers.get(0).getValue());
         if (molType != null)
         {
            inSeq.setMolType(molType);
         }
      }

      // Ex: /db_xref="taxon:9606"
      List<? extends FeatureQualifier> dbXrefQualifiers = source.getQualifiers(GenBankFeatureQualifierName.db_xref.name());
      if (CollectionUtil.hasValues(dbXrefQualifiers))
      {
         for (FeatureQualifier qualifier : dbXrefQualifiers)
         {
            String[] pieces = qualifier.getValue().split(":");
            // Require both the database name and the id so a malformed value can't index past the array
            if (pieces.length > 1
                && pieces[0].equals("taxon"))
            {
               inSeq.setNCBITaxon(NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1])));
               break;
            }
         }
      }

      List<? extends FeatureQualifier> cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
      if (CollectionUtil.hasValues(cloneQualifiers))
      {
         Clone clone = new Clone(cloneQualifiers.get(0).getValue());

         List<? extends FeatureQualifier> subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
         if (CollectionUtil.hasValues(subcloneQualifiers))
         {
            clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
         }

         inSeq.setClone(clone);
      }
   }

   //---------------------------------------------------------------------------
   // Routes a non-blank record line: a line starting with a keyword begins a new field,
   // anything else continues the current field (or the feature table).
   private void parseLine(String inLine)
      throws Exception
   {
      GenBankKeyword keyword = getLineKeyword(inLine);
      if (keyword != null)
      {
         // Found the start of a new keyword field
         finishPreviousKeyword();

         mCurrentKeyword = keyword;
         mCurrentSubkeyword = null;

         parseField(inLine);
      }
      else if (GenBankKeyword.FEATURES.equals(mCurrentKeyword))
      {
         // Features have a special set of feature keys
         parseFeatures(inLine);
      }
      else
      {
         // Either the start of a new subfield or the continuation of the existing one
         GenBankSubkeyword subkeyword = getLineSubkeyword(inLine);
         if (subkeyword != null)
         {
            mCurrentSubkeyword = subkeyword;
         }

         parseField(inLine);
      }
   }

   //---------------------------------------------------------------------------
   // Dispatches the line to the parser for the current keyword.
   // Keywords without a parser here are silently skipped.
   private void parseField(String inLine)
      throws Exception
   {
      if (GenBankKeyword.LOCUS.equals(mCurrentKeyword))
      {
         parseLocus(inLine);
      }
      else if (GenBankKeyword.DEFINITION.equals(mCurrentKeyword))
      {
         parseDefinition(inLine);
      }
      else if (GenBankKeyword.VERSION.equals(mCurrentKeyword))
      {
         parseVersion(inLine);
      }
      else if (GenBankKeyword.KEYWORDS.equals(mCurrentKeyword))
      {
         parseKeywords(inLine);
      }
      else if (GenBankKeyword.SOURCE.equals(mCurrentKeyword))
      {
         parseSource(inLine);
      }
      else if (GenBankKeyword.REFERENCE.equals(mCurrentKeyword))
      {
         parseReference(inLine);
      }
      else if (GenBankKeyword.COMMENT.equals(mCurrentKeyword))
      {
         parseComment(inLine);
      }
      else if (GenBankKeyword.DBLINK.equals(mCurrentKeyword))
      {
         parseDBLink(inLine);
      }
      else if (GenBankKeyword.FEATURES.equals(mCurrentKeyword))
      {
         parseFeatures(inLine);
      }
      else if (GenBankKeyword.CONTIG.equals(mCurrentKeyword))
      {
         parseContig(inLine);
      }

      // NID is skipped
      // PROJECT is skipped
      // SEGMENT is skipped
      // BASE COUNT is skipped
   }

   //---------------------------------------------------------------------------
   // Parse the LOCUS keyword line
   // Ex (spacing is illustrative):
   //   LOCUS       R88064                   460 bp    mRNA    linear   EST 16-AUG-1995
   //   LOCUS       pDR000029812            7616 bp            circular
   //   LOCUS       vDR\365                 8070 bp    DNA     circular     21-MAR-2011
   //
   // Although it isn't always followed exactly, the detailed format for the LOCUS line format is as follows:
   //
   // Positions  Contents
   // ---------  --------
   // 01-05      'LOCUS'
   // 06-12      spaces
   // 13-28      Locus name
   // 29-29      space
   // 30-40      Length of sequence, right-justified
   // 41-41      space
   // 42-43      bp
   // 44-44      space
   // 45-47      spaces, ss- (single-stranded), ds- (double-stranded), or
   //            ms- (mixed-stranded)
   // 48-53      NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA),
   //            mRNA (messenger RNA), uRNA (small nuclear RNA).
   //            Left justified.
   // 54-55      space
   // 56-63      'linear' followed by two spaces, or 'circular'
   // 64-64      space
   // 65-67      The division code
   // 68-68      space
   // 69-79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
   //
   private void parseLocus(String inLine)
   {
      // Trailing whitespace (e.g. from blank trailing columns) shouldn't cause the line to be rejected
      Matcher m = sLocusPattern.matcher(inLine.trim());
      if (! m.matches())
      {
         throw new SeqFormatException("The " + GenBankKeyword.LOCUS + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!");
      }

      if (StringUtil.isSet(m.group(1)))
      {
         mCurrentSeq.setID(m.group(1));
      }

      mSeqLengthFromLocusLine = Integer.parseInt(m.group(2).trim());

      // TODO: Group 3 is the detailed sequence type

      if (mCurrentSeq instanceof BioSequencePlus)
      {
         BioSequencePlus seqPlus = (BioSequencePlus) mCurrentSeq;

         String topology = m.group(4);
         if (StringUtil.isSet(topology))
         {
            seqPlus.setSeqTopology(SeqTopology.valueOf(topology));
         }

         String division = m.group(5);
         if (StringUtil.isSet(division))
         {
            seqPlus.setSeqRepositoryDivision(NCBIGenBankDivision.valueOf(division));
         }

         String revisionDate = m.group(6);
         if (StringUtil.isSet(revisionDate))
         {
            try
            {
               seqPlus.setRevisionDate(mDateFormat.parse(revisionDate));
            }
            catch (ParseException e)
            {
               // A bad date isn't worth failing the record over. Report it via the class logger.
               LOGGER.warning("Unparseable " + GenBankKeyword.LOCUS + " line date: " + e.getMessage());
            }
         }
      }
   }

   //---------------------------------------------------------------------------
   // Returns the sequence id with any version suffix (everything from the first '.') removed,
   // or an empty string if the sequence has no id.
   private String getAccession(T inSeq)
   {
      String acc = inSeq.getID();
      if (! StringUtil.isSet(acc))
      {
         return "";
      }

      int index = acc.indexOf(".");
      if (index > 0)
      {
         acc = acc.substring(0, index);
      }

      return acc;
   }

   //---------------------------------------------------------------------------
   // Writes the LOCUS line using the column layout documented above parseLocus().
   private void writeLocus(T inSeq, Writer inWriter)
      throws IOException
   {
      StringBuilder locusLine = new StringBuilder();

      // Columns 1-43: keyword, locus name, length, and units
      locusLine.append(String.format("%5s       %-16.16s %11d %2.2s",
                                     GenBankKeyword.LOCUS,
                                     getAccession(inSeq),
                                     inSeq.length(),
                                     BioSequenceType.PROTEIN.equals(inSeq.getType()) ? "aa" : "bp"));

      if (inSeq instanceof BioSequencePlus)
      {
         BioSequencePlus bioSequencePlus = (BioSequencePlus) inSeq;

         // TODO: 'ss-', 'ds-', or 'ms-' prefix for the mol type isn't parsed or output
         // Columns 44-79: blank strand columns, molecule type, topology, division, and date
         locusLine.append(String.format("    %-6.6s  %-8.8s %-3.3s %s",
                                        getLocusMolTypeString(bioSequencePlus.getMolType()),
                                        bioSequencePlus.getSeqTopology() != null ? bioSequencePlus.getSeqTopology() : "",
                                        bioSequencePlus.getSeqRepositoryDivision() != null ? bioSequencePlus.getSeqRepositoryDivision().getCode() : "",
                                        bioSequencePlus.getRevisionDate() != null ? mDateFormat.format(bioSequencePlus.getRevisionDate()).toUpperCase() : ""));
      }

      // Drop the padding left behind by empty trailing fields
      int end = locusLine.length();
      while (end > 0
             && ' ' == locusLine.charAt(end - 1))
      {
         end--;
      }
      locusLine.setLength(end);

      inWriter.write(locusLine.toString());
      inWriter.write("\n");
   }

   //---------------------------------------------------------------------------
   // Maps a mol_type value to the molecule type text of the LOCUS line.
   // Returns an empty string for null or for a mol type without a mapping.
   private static String getLocusMolTypeString(MolType inMolType)
   {
      String molTypeString = "";

      if (inMolType != null)
      {
         if (inMolType.equals(MolType.genomic_DNA)
             || inMolType.equals(MolType.unassigned_DNA)
             || inMolType.equals(MolType.other_DNA))
         {
            molTypeString = "DNA";
         }
         else if (inMolType.equals(MolType.genomic_RNA)
                  || inMolType.equals(MolType.transcribed_RNA)
                  || inMolType.equals(MolType.unassigned_RNA)
                  || inMolType.equals(MolType.other_RNA))
         {
            molTypeString = "RNA";
         }
         else if (inMolType.equals(MolType.mRNA))
         {
            molTypeString = "mRNA";
         }
         else if (inMolType.equals(MolType.tRNA))
         {
            molTypeString = "tRNA";
         }
      }

      return molTypeString;
   }

   //---------------------------------------------------------------------------
   // Parse the DEFINITION keyword line
   // Ex:
   //   DEFINITION  ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone
   //               IMAGE:165908 5', mRNA sequence.
// private void parseDefinition(String inLine) { String field = inLine.substring(12).trim(); if (null == mCurrentSeq.getDescription()) { mCurrentSeq.setDescription(field); } else { mCurrentSeq.setDescription(mCurrentSeq.getDescription() + " " + field); } } //--------------------------------------------------------------------------- // Remove the trailing period private void finishDefinition() { String definition = mCurrentSeq.getDescription(); if (definition != null && definition.endsWith(".")) { mCurrentSeq.setDescription(definition.substring(0, definition.length() - 1)); } } //--------------------------------------------------------------------------- private void writeDefinition(T inSeq, Writer inWriter) throws IOException { if (StringUtil.isSet(inSeq.getDescription())) { String description = inSeq.getDescription(); if (! description.endsWith(".")) { description += "."; } String[] lines = StringUtil.lines(StringUtil.wrap(description, 67)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format("%-10s %s\n", 0 == i ? 
GenBankKeyword.DEFINITION : "", lines[i])); } } } //--------------------------------------------------------------------------- private void writeAccession(T inSeq, Writer inWriter) throws IOException { if (StringUtil.isSet(inSeq.getID())) { inWriter.write(String.format("%-9.9s %s\n", GenBankKeyword.ACCESSION, getAccession(inSeq))); } } //--------------------------------------------------------------------------- private void writeVersion(T inSeq, Writer inWriter) throws IOException { if (StringUtil.isSet(inSeq.getID())) { inWriter.write(String.format("%-9.9s %s\n", GenBankKeyword.VERSION, inSeq.getID())); } } //--------------------------------------------------------------------------- // Parse the VERSION keyword line // Ex: // VERSION AF181452.1 GI:6017929 // ^^^^^^^^^^ ^^^^^^^^^^ // Compound NCBI GI // Accession Identifier // Number // private void parseVersion(String inLine) { String fields[] = inLine.substring(12).trim().split("\\s+"); mCurrentSeq.setID(fields[0]); if (fields.length > 1) { mCurrentSeq.setAttribute(NCBI_GI_ATTR, fields[1]); } } //--------------------------------------------------------------------------- // Parse the KEYWORDS keyword line // Ex: // KEYWORDS EST. // private void parseKeywords(String inLine) { String field = inLine.substring(12).trim(); if (field.endsWith(".")) { field = field.substring(0, field.length() - 1); } if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addKeywords(field.split(",\\s+")); } } //--------------------------------------------------------------------------- // Parse the SOURCE keyword line // Ex: // SOURCE Homo sapiens (human) // ORGANISM Homo sapiens // Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; // Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; // Catarrhini; Hominidae; Homo. // // The SOURCE field consists of two parts. 
The first part is found after // the SOURCE keyword and contains free-format information including an // abbreviated form of the organism name followed by a molecule type; // multiple lines are allowed, but the last line must end with a period. // The second part consists of information found after the ORGANISM // subkeyword. The formal scientific name for the source organism (genus // and species, where appropriate) is found on the same line as ORGANISM. // The records following the ORGANISM line list the taxonomic // classification levels, separated by semicolons and ending with a // period. // private void parseSource(String inLine) { if (GenBankSubkeyword.ORGANISM.equals(mCurrentSubkeyword)) { String field = inLine.substring(12).trim(); // For now, just keep the first line with the scientific name if (inLine.trim().startsWith(GenBankSubkeyword.ORGANISM.name()) && mCurrentSeq instanceof BioSequencePlus) { Set taxons = NCBITaxon.getByName(field); if (CollectionUtil.hasValues(taxons)) { NCBITaxon taxon; if (taxons.size() > 1) { // TODO: Refine with a better way to choose // Choose the one with highest id List sortedTaxons = new ArrayList<>(taxons); Collections.sort(sortedTaxons); taxon = sortedTaxons.get(sortedTaxons.size() - 1); } else { taxon = taxons.iterator().next(); } ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon); } } } } //--------------------------------------------------------------------------- // Parse the REFERENCE keyword line // Ex: // REFERENCE 1 (bases 1 to 342) // AUTHORS Giachino,C., Padovan,E. and Lanzavecchia,A. // TITLE kappa+lambda+ dual receptor B cells are present in the human // peripheral repertoire // JOURNAL J. Exp. Med. 181 (3), 1245-1250 (1995) // PUBMED 7869042 // // Publications by the authors of the sequence that discuss the data reported in // the record. References are automatically sorted within the record based on date // of publication, showing the oldest references first. 
// // Some sequences have not been reported in papers and show a status of "unpublished" // or "in press". When an accession number and/or sequence data has appeared in print, // sequence authors should send the complete citation of the article to [email protected] // and the GenBank staff will revise the record. // // Various classes of publication can be present in the References field, including // journal article, book chapter, book, thesis/monograph, proceedings chapter, proceedings // from a meeting, and patent. // // The last citation in the REFERENCE field usually contains information about the // submitter of the sequence, rather than a literature citation. It is therefore // called the "submitter block" and shows the words "Direct Submission" instead of // an article title. Additional information is provided below, under the header Direct // Submission. Some older records do not contain a submitter block. private void parseReference(String inLine) throws ParseException { if (inLine.startsWith(GenBankKeyword.REFERENCE.name())) { // REFERENCE 2 (bases 1 to 200000) mCurrentReference = new SeqCitation(); // TODO: Set the reference seq location if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addReference(mCurrentReference); } Matcher m = sReferenceLocationPattern.matcher(inLine); if (m.find()) { mCurrentReference.setSeqLocation(new SeqLocation(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)))); } } else { String field = inLine.substring(12).trim(); if (GenBankSubkeyword.AUTHORS.equals(mCurrentSubkeyword)) { // Remove trailing comma if present. if (field.endsWith(",")) { field = field.substring(0, field.length() - 1); } else if (field.endsWith(" and")) { field = field.substring(0, field.length() - 4); } if (! 
field.equals(".")) { String[] authorStrings = field.split("(,\\s+|\\s+and\\s+)"); for (String authorString : authorStrings) { mCurrentReference.addAuthor(new Author(authorString)); } } } else if (GenBankSubkeyword.TITLE.equals(mCurrentSubkeyword)) { String title = field; if (mCurrentReference.getTitle() != null) { title = mCurrentReference.getTitle() + " " + title; } mCurrentReference.setTitle(title); } else if (GenBankSubkeyword.JOURNAL.equals(mCurrentSubkeyword)) { mCurrentReference.appendRawContent(field); String journal = field; if (mCurrentReference.getJournal() != null) { journal = mCurrentReference.getJournal().getTitle() + " " + journal; } mCurrentReference.setJournal(new Journal(journal)); } else if (GenBankSubkeyword.PUBMED.equals(mCurrentSubkeyword)) { mCurrentReference.setPubMedId(field); } else if (GenBankSubkeyword.REMARK.equals(mCurrentSubkeyword)) { String remark = field; if (mCurrentReference.getRemark() != null) { remark = mCurrentReference.getRemark() + " " + remark; } mCurrentReference.setRemark(remark); } } } //--------------------------------------------------------------------------- private void finishReference() throws ParseException { // Refine the citation based on the JOURNAL content if (null == mCurrentReference.getType() || mCurrentReference.getType().equals(CitationType.journal)) { // "Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?" Matcher m = sReferencePatentPattern.matcher(mCurrentReference.getJournal().getTitle()); if (m.matches()) { // It's not really a journal. It's a patent. 
mCurrentReference.setType(CitationType.patent); PatentData patentData = new PatentData() .setTitle(mCurrentReference.getTitle()) .setInventors(mCurrentReference.getAuthors()) .setPublicationNum(m.group(1)) .setSeqIdNum(Integer.parseInt(m.group(2))) .setPublicationDate(mDateFormat.parse(m.group(3))); if (m.group(4) != null) { patentData.setApplicants(parsePatentApplicants(m.group(4))); } mCurrentReference.setPatentData(patentData); mCurrentReference.setJournal(null); } else { // If it's a direct submission of sequences, extract the submission date and contact info m = sReferenceDirectSubmissionPattern.matcher(mCurrentReference.getJournal().getTitle()); if (m.matches()) { mCurrentReference.setSubmissionDate(DateUtil.threadsafeParse(m.group(1), sDateFormat)); mCurrentReference.setYear(Integer.parseInt(m.group(2))); mCurrentReference.setContactInfo(m.group(3)); mCurrentReference.setJournal(null); } else { // Is it a thesis? m = sReferenceThesisPattern.matcher(mCurrentReference.getJournal().getTitle()); if (m.matches()) { mCurrentReference.setType(CitationType.thesis); String yearString = m.group(1); if (yearString != null) { mCurrentReference.setYear(Integer.parseInt(yearString)); } mCurrentReference.setInstitution(m.group(2)); mCurrentReference.setJournal(null); } else if (mCurrentReference.getJournal().getTitle().startsWith("Published Only in Database")) { mCurrentReference.setType(CitationType.online_database); mCurrentReference.setJournal(null); } else { m = sReferenceJournalPattern.matcher(mCurrentReference.getJournal().getTitle()); if (m.matches()) { // OK, it's a journal. Fill out the rest of the journal-related fields. 
mCurrentReference.setType(CitationType.journal); mCurrentReference.setJournal(new Journal(m.group(1))); mCurrentReference.setVolume(m.group(2)); mCurrentReference.setIssue(m.group(3)); // Group 4 might be pages or an article number String pages = m.group(4); if (pages != null && pages.toUpperCase().startsWith("E")) { mCurrentReference.setArticleNumber(pages); } else { mCurrentReference.setPages(pages); } String year = m.group(5); if (year != null) { mCurrentReference.setYear(Integer.parseInt(year)); } } } } } } } //--------------------------------------------------------------------------- // Because of a lack of format controls, it's nearly impossible to parse this // content correctly. The best approach would be some sort of NLP. // This method is protected instead of private to allow unit testing. protected List parsePatentApplicants(String inApplicantString) { List applicants = new ArrayList<>(3); // Values should be separated by semi-colons if (inApplicantString.contains(";")) { String[] pieces = inApplicantString.split(";"); for (int i = 0; i < pieces.length; i++) { pieces[i] = pieces[i].trim(); } // Ends with a location? if (pieces.length > 1 && pieces.length <= 3 && (2 == pieces[pieces.length - 1].length() // Ends in a two letter country code? || sPatentLocationPattern.matcher(pieces[pieces.length - 1]).matches() // ', \\w{2}' || ! pieces[pieces.length - 1].contains(" "))) // Last piece is a single word (probably a city name) { applicants.add(StringUtil.join(pieces,", ")); } else { for (String piece : pieces) { applicants.add(piece); } } } else { // Sometimes the values are separated by commas String[] pieces = inApplicantString.split(","); for (String piece : pieces) { piece = piece.trim(); if (piece.length() > 0) { String ucPiece = piece.toUpperCase(); // Is it a single word or a company suffix? if (applicants.size() > 0 && (! piece.contains(" ") || ucPiece.startsWith("INC ") || ucPiece.startsWith("INC. 
") || ucPiece.startsWith("LLC ") || (sPatentParensLocationPattern.matcher(ucPiece).matches() && applicants.size() > 0 && ! sPatentParensLocationPattern.matcher(applicants.get(applicants.size() - 1)).matches()))) { // Add it to the previous piece int lastIndex = applicants.size() - 1; applicants.set(lastIndex, applicants.get(lastIndex) + ", " + piece); } else { applicants.add(piece.trim()); } } } } return applicants; } //--------------------------------------------------------------------------- // Parse the COMMENT keyword line // Ex: // COMMENT Contact: Wilson RK // Washington University School of Medicine // 4444 Forest Park Parkway, Box 8501, St. Louis, MO 63108 // Tel: 314 286 1800 // Fax: 314 286 1810 // Email: [email protected] // Insert Size: 1482 // High quality sequence stops: 353 Source: IMAGE Consortium, LLNL // This clone is available royalty-free through LLNL ; contact the // IMAGE Consortium ([email protected]) for further information. // Insert Length: 1482 Std Error: 0.00 // Seq primer: M13RP1 // High quality sequence stop: 353. // private void parseComment(String inLine) { String field = inLine.substring(12).trim(); if (mCurrentSeq.getAttribute(COMMENT_ATTR) != null) { mCurrentSeq.setAttribute(COMMENT_ATTR, mCurrentSeq.getAttribute(COMMENT_ATTR) + "\n" + field); } else { mCurrentSeq.setAttribute(COMMENT_ATTR, field); } } //--------------------------------------------------------------------------- // Parse the DBLINK keyword line // Ex: // DBLINK BioProject:PRJNA174162,PRJNA999998,PRJNA999999 // BioSample: SAMN01795900 // // "This line contains cross-references to other underlying resources that // support the existence of a GenBank sequence record... // A DBLINK cross-reference consists of two data fields delimited by a colon. // The first field provides the cross-reference type ("BioProject"), while the // second contains the actual cross-reference identifier ("PRJNA177352"). 
// The second field can consist of multiple comma-separated identifiers, // if a sequence record has multiple DBLINK cross-references of a given type." // private void parseDBLink(String inLine) { if (mCurrentSeq instanceof BioSequencePlus) { BioSequencePlus sequencePlus = (BioSequencePlus) mCurrentSeq; String field = inLine.substring(12).trim(); String[] pieces = field.split(":"); if (2 == pieces.length) { String[] values = pieces[1].split(","); for (String value : values) { sequencePlus.addDbXref(new DbXref(pieces[0].trim(), value.trim())); } } else { // Continuation of previous db identifiers String db = sequencePlus.getDbXrefs().get(sequencePlus.getDbXrefs().size() - 1).getDB(); String[] values = field.split(","); for (String value : values) { sequencePlus.addDbXref(new DbXref(db, value.trim())); } } } } //--------------------------------------------------------------------------- private void writeDBLinks(List inDBXrefs, Writer inWriter) throws IOException { Map xRefMap = new OrderedMap<>(4); for (DbXref xref : inDBXrefs) { StringBuilderPlus line = xRefMap.get(xref.getDB()); if (null == line) { line = new StringBuilderPlus(xref.getDB() + ":" + xref.getId()); xRefMap.put(xref.getDB(), line); } else { line.delimitedAppend(xref.getId()); } } int count = 0; for (String db : xRefMap.keySet()) { String[] lines = StringUtil.lines(StringUtil.wrap(xRefMap.get(db).toString(), 67)); for (String line : lines) { count++; inWriter.write(String.format("%-12.12s%s", (1 == count ? 
GenBankKeyword.DBLINK : ""), line)); } } } //--------------------------------------------------------------------------- // Parse the CONTIG keyword line // Ex: // CONTIG join(D86993.1:7160..39752,D87004.2:803..13993) // private void parseContig(String inLine) { String field = inLine.substring(12).trim(); if (mCurrentSeq.getAttribute(CONTIG_ATTR) != null) { mCurrentSeq.setAttribute(CONTIG_ATTR, mCurrentSeq.getAttribute(CONTIG_ATTR) + field); } else { mCurrentSeq.setAttribute(CONTIG_ATTR, field); } } //--------------------------------------------------------------------------- private void parseFeatures(String inLine) { if (! inLine.startsWith(GenBankKeyword.FEATURES.name())) { // Is there a feature key on this line? String featureKeyString = inLine.substring(5, 20).trim(); if (StringUtil.isSet(featureKeyString)) { GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString); if (null == featureKey) { throw new SeqFormatException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!"); } String locationString = inLine.substring(21).trim(); mCurrentFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString)); if (mCurrentSeq instanceof BioSequencePlus) { ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentFeature); } // Unquote the previous qualifier if necessary if (mCurrentFeatureQualifier != null && mCurrentFeatureQualifier.getValue().startsWith("\"")) { mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue())); } mCurrentFeatureQualifier = null; } else { String content = inLine.substring(21).trim(); Matcher m = sFeatureQualifierPattern.matcher(content); if (m.matches()) { // New qualifier // Unquote the previous qualifier if necessary if (mCurrentFeatureQualifier != null && mCurrentFeatureQualifier.getValue().startsWith("\"")) { mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue())); } GenBankFeatureQualifierName qualifierName = 
GenBankFeatureQualifierName.valueOf(m.group(1)); if (null == qualifierName) { throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!"); } mCurrentFeatureQualifier = new GenBankFeatureQualifier(qualifierName); mCurrentFeature.addQualifier(mCurrentFeatureQualifier); String value = m.group(2); if (value != null) { mCurrentFeatureQualifier.appendToValue(value); } } else if (mCurrentFeatureQualifier != null) { // Continuation of a previous qualifier mCurrentFeatureQualifier.appendToValue(content); } else if (mCurrentFeature != null) { // Continuation of a feature location mCurrentFeature.getLocation().append(content); } } } } //--------------------------------------------------------------------------- private void writeReferences(List inSeqCitations, Writer inWriter) throws IOException { int count = 0; for (SeqCitation citation : inSeqCitations) { count++; inWriter.write(String.format("%s %-3d%s\n", GenBankKeyword.REFERENCE, count, citation.getSeqLocation() != null ? "(bases " + citation.getSeqLocation().getStart() + " to " + citation.getSeqLocation().getEnd() + ")" : "")); // Authors StringBuilderPlus authors = new StringBuilderPlus().setDelimiter(", "); for (int i = 0; i < citation.getAuthors().size(); i++) { Author author = citation.getAuthors().get(i); if (citation.getAuthors().size() > 1 && i == citation.getAuthors().size() - 1) { authors.append(" and " + author.getLastName() + "," + author.getFirstInitial() + "."); } else { authors.delimitedAppend(author.getLastName() + "," + author.getFirstInitial() + "."); } } String[] lines = StringUtil.lines(StringUtil.wrap(authors.toString(), 67)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format(" %-7.7s %s\n", 0 == i ? GenBankSubkeyword.AUTHORS.name() : "", lines[i])); } // Title lines = StringUtil.lines(StringUtil.wrap(citation.getTitle(), 67)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format(" %-5.5s %s\n", 0 == i ? 
GenBankSubkeyword.TITLE.name() : "", lines[i])); } // Journal // JOURNAL J. Exp. Med. 188 (11), 2151-2162 (1998) StringBuilderPlus journal = new StringBuilderPlus(); if (StringUtil.isSet(citation.toString())) { journal.append(citation.toString()); } else { journal.append(citation.getJournal()) .append(".") .append(citation.getVolume() != null ? " " + citation.getVolume() : "") .append(citation.getIssue() != null ? " (" + citation.getIssue() + ")" : "") .append(citation.getPages() != null ? ", " + citation.getPages() : "") .append(citation.getYear() != null ? " (" + citation.getYear() + ")" : ""); } lines = StringUtil.lines(StringUtil.wrap(journal.toString(), 67)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format(" %-7.7s %s\n", 0 == i ? GenBankSubkeyword.JOURNAL.name() : "", lines[i])); } // Pubmed id if (StringUtil.isSet(citation.getPubMedId())) { inWriter.write(String.format(" %-6.6s %s\n", GenBankSubkeyword.PUBMED.name(), citation.getPubMedId())); } // Remark if (StringUtil.isSet(citation.getRemark())) { lines = StringUtil.lines(StringUtil.wrap(citation.getRemark(), 67)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format(" %-6.6s %s\n", 0 == i ? GenBankSubkeyword.REMARK.name() : "", lines[i])); } } } } //--------------------------------------------------------------------------- private void writeFeature(SeqFeature inSeqFeature, Writer inWriter) throws IOException { String[] lines = StringUtil.lines(StringUtil.wrap(inSeqFeature.getLocation().toString(), 58)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format(" %-15.15s %s\n", 0 == i ? 
inSeqFeature.name() : "", lines[i])); } if (CollectionUtil.hasValues(inSeqFeature.getQualifiers())) { for (FeatureQualifier qualifier : inSeqFeature.getQualifiers()) { String qualifierString = "/" + qualifier.name(); if (StringUtil.isSet(qualifier.getValue())) { qualifierString += "=\"" + qualifier.getValue() + "\""; } lines = StringUtil.lines(StringUtil.wrap(qualifierString, 58)); for (int i = 0; i < lines.length; i++) { inWriter.write(String.format(" %s\n", lines[i])); } } } } //########################################################################### // INNER CLASS //########################################################################### class GenBankSeqFilterReader extends LettersOnlyReader { //--------------------------------------------------------------------------- public GenBankSeqFilterReader(Reader inReader) { super(inReader); } //--------------------------------------------------------------------------- @Override public int read() throws IOException { int returnChar; do { returnChar = innerRead(); } while (returnChar >= 0 && (Character.isWhitespace(returnChar) || Character.isDigit(returnChar) || returnChar == '/')); return returnChar; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy