org.biojava.nbio.alignment.io.StockholmFileParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-alignment Show documentation
The newest version!
/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
 * should be distributed with the code. If you do not have a copy, see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 * Created on August 13, 2010 Author: Mark Chapman
 */

package org.biojava.nbio.alignment.io;

import org.biojava.nbio.alignment.io.StockholmFileAnnotation.StockholmFileAnnotationReference;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Stockholm file parser.
 * <p>
 * For more information about the format refer to:
 * <ul>
 * <li>ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/userman.txt</li>
 * <li>ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/USERMAN</li>
 * <li>http://sonnhammer.sbc.su.se/Stockholm.html</li>
 * </ul>
 *
 * Pfam DESCRIPTION OF FIELDS
 *
 *    Compulsory fields:
 *    ------------------
 *
 *    AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.
 *    ID   Identification:             One word name for family.
 *    DE   Definition:                 Short description of family.
 *    AU   Author:                     Authors of the entry.
 *    SE   Source of seed:             The source suggesting the seed members belong to one family.
 *    GA   Gathering method:           Search threshold to build the full alignment.
 *    TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.
 *    NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.
 *    TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.
 *    SQ   Sequence:                   Number of sequences in alignment.
 *    //                               End of alignment.
 *
 *    Optional fields:
 *    ----------------
 *
 *    DC   Database Comment:           Comment about database reference.
 *    DR   Database Reference:         Reference to external database.
 *    RC   Reference Comment:          Comment about literature reference.
 *    RN   Reference Number:           Reference Number.
 *    RM   Reference Medline:          Eight digit medline UI number.
 *    RT   Reference Title:            Reference Title.
 *    RA   Reference Author:           Reference Author
 *    RL   Reference Location:         Journal location.
 *    PI   Previous identifier:        Record of all previous ID lines.
 *    KW   Keywords:                   Keywords.
 *    CC   Comment:                    Comments.
 *    NE   Pfam accession:             Indicates a nested domain.
 *    NL   Location:                   Location of nested domains - sequence ID, start and end of insert.
 *    WK   Wikipedia Reference:        Reference to wikipedia.
 *
 *    Obsolete fields:
 *    -----------
 *    AL   Alignment method of seed:   The method used to align the seed members.
 *    AM   Alignment Method:	    The order ls and fs hits are aligned to the model to build the full align.
 *
 * 
 *
 * @since 3.0.5
 * @author Amr ALHOSSARY
 * @author Marko Vaz
 *
 */
public class StockholmFileParser {

	/** Logger of this parser; used, among others, to warn about unknown annotation features. */
	private final static Logger logger = LoggerFactory.getLogger(StockholmFileParser.class);

	/** Value for the {@code max} argument of the parse methods: read as much as possible, without limits. */
	public static final int INFINITY = -1;
	/** Markup type tag of lines of the form {@code #=GF <feature> <Generic per-File annotation, free text>}. */
	private static final String GENERIC_PER_FILE_ANNOTATION = "GF";
	/**
	 * Markup type tag of lines of the form
	 * {@code #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>}.
	 */
	private static final String GENERIC_PER_CONSENSUS_ANNOTATION = "GC";
	/** Markup type tag of lines of the form {@code #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>}. */
	private static final String GENERIC_PER_SEQUENCE_ANNOTATION = "GS";
	/**
	 * Markup type tag of lines of the form
	 * {@code #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>}.
	 */
	private static final String GENERIC_PER_RESIDUE_ANNOTATION = "GR";

	// #=GF <feature> tags (per-file annotations), dispatched by handleFileAnnotation.

	// COMPULSORY FIELDS
	/** Accession number in form PFxxxxx (Pfam) or RFxxxxx (Rfam). */
	private static final String GF_ACCESSION_NUMBER = "AC";
	/** One word name for family. */
	private static final String GF_IDENTIFICATION = "ID";
	/** Short description of family. */
	private static final String GF_DEFINITION = "DE";
	/** Authors of the entry. */
	private static final String GF_AUTHOR = "AU";
	/**
	 * Indicates the order that ls and fs matches are aligned to the model to give the full alignment. (OBSOLETE IN
	 * HMMER3)
	 */
	private static final String GF_ALIGNMENT_METHOD = "AM";
	/** Command line used to generate the model. */
	private static final String GF_BUILD_METHOD = "BM";
	/** Command line used to perform the search. */
	private static final String GF_SEARCH_METHOD = "SM";
	/** The source suggesting the seed members belong to one family. */
	private static final String GF_SOURCE_SEED = "SE";
	/** The source (prediction or publication) of the consensus RNA secondary structure used by Rfam. */
	private static final String GF_SOURCE_STRUCTURE = "SS";
	/** Search threshold to build the full alignment. */
	private static final String GF_GATHERING_THRESHOLD = "GA";
	/** Lowest sequence score (and domain score for Pfam) of match in the full alignment. */
	private static final String GF_TRUSTED_CUTOFF = "TC";
	/** Highest sequence score (and domain score for Pfam) of match not in full alignment. */
	private static final String GF_NOISE_CUTOFF = "NC";
	/**
	 * Type of family -- presently Family, Domain, Motif or Repeat for Pfam. -- a tree with roots Gene, Intron or
	 * Cis-reg for Rfam.
	 */
	private static final String GF_TYPE_FIELD = "TP";
	/** Number of sequences in alignment, and start of MSA. */
	private static final String GF_SEQUENCE = "SQ";

	// OPTIONAL FIELDS

	/** Comment about database reference. */
	private static final String GF_DB_COMMENT = "DC";
	/** Reference to external database. */
	private static final String GF_DB_REFERENCE = "DR";
	/** Comment about literature reference. */
	private static final String GF_REFERENCE_COMMENT = "RC";
	/** Reference Number; each occurrence starts a new literature reference entry. */
	private static final String GF_REFERENCE_NUMBER = "RN";
	/** Eight digit medline UI number. */
	private static final String GF_REFERENCE_MEDLINE = "RM";
	/** Reference Title. */
	private static final String GF_REFERENCE_TITLE = "RT";
	/** Reference Author. */
	private static final String GF_REFERENCE_AUTHOR = "RA";
	/** Journal Location. (The identifier is a misspelling of "LOCATION"; kept because other code refers to it.) */
	private static final String GF_REFERENCE_LOCALTION = "RL";
	/** Record of all previous ID lines. */
	private static final String GF_PREVIOUS_IDS = "PI";
	/** Keywords. */
	private static final String GF_KEYWORDS = "KW";
	/** Comments. */
	private static final String GF_COMMENT = "CC";
	/** Pfam accession; indicates a nested domain. */
	private static final String GF_PFAM_ACCESSION = "NE";
	/** Location of nested domains - sequence ID, start and end of insert. */
	private static final String GF_LOCATION = "NL";
	/** Wikipedia page. */
	private static final String GF_WIKIPEDIA_LINK = "WK";
	/** Clan accession. */
	private static final String GF_CLAN = "CL";
	/** Used for listing Clan membership. */
	private static final String GF_MEMBERSHIP = "MB";

	// FOR EMBEDDING TREES

	/** A tree in New Hampshire eXtended format. */
	private static final String GF_NEW_HAMPSHIRE = "NH";
	/** A unique identifier for the next tree. */
	private static final String GF_TREE_ID = "TN";

	// OTHER

	/**
	 * A method used to set the bit score threshold based on the ratio of expected false positives to true positives.
	 * Floating point number between 0 and 1.
	 */
	private static final String GF_FALSE_DISCOVERY_RATE = "FR";

	// #=GS <seqname> <feature> tags (per-sequence annotations), dispatched by handleSequenceAnnotation.

	/** Per-sequence accession number. */
	private static final String GS_ACCESSION_NUMBER = "AC";
	/** Per-sequence description. */
	private static final String GS_DESCRIPTION = "DE";
	/** Per-sequence database reference. */
	private static final String GS_DATABASE_REFERENCE = "DR";
	/** Per-sequence organism (species). */
	private static final String GS_ORGANISM_SPECIES = "OS";
	/** Per-sequence organism classification. */
	private static final String GS_ORGANISM_CLASSIFICATION = "OC";
	/** Per-sequence "look" annotation (presumably display hints such as colour -- TODO confirm against the format spec). */
	private static final String GS_LOOK = "LO";

	// #=GR <seqname> <feature> tags (per-residue annotations), dispatched by handleResidueAnnotation.
	// The bracketed text in each comment lists the characters allowed in the annotation value.

	/**
	 * Secondary structure.
	 * <ul>
	 * <li>For RNA {@code [.,;<>(){}[]AaBb...]}</li>
	 * <li>For protein {@code [HGIEBTSCX]}</li>
	 * </ul>
	 */
	private static final String GR_SECONDARY_STRUCTURE = "SS";
	/**
	 * Surface accessibility: {@code [0-9X]}
	 * (0=0%-10%; ...; 9=90%-100%)
	 */
	private static final String GR_SURFACE_ACCESSIBILITY = "SA";

	/** Trans-membrane: {@code [Mio]} */
	private static final String GR_TRANS_MEMBRANE = "TM";
	/**
	 * Posterior probability: {@code [0-9*]}
	 * (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)
	 */
	private static final String GR_POSTERIOR_PROBABILITY = "PP";
	/** Ligand binding: {@code [*]} */
	private static final String GR_LIGAND_BINDING = "LI";
	/** Active site: {@code [*]} */
	private static final String GR_ACTIVE_SITE = "AS";
	/** Active site, Pfam predicted: {@code [*]} */
	private static final String GR_AS_PFAM_PREDICTED = "pAS";
	/** Active site, from SwissProt: {@code [*]} */
	private static final String GR_AS_SWISSPROT = "sAS";
	/** Intron: {@code [0-2]} */
	private static final String GR_INTRON = "IN";

	// #=GC <feature> tags (per-column annotations), dispatched by handleConsensusAnnotation.

	/** Sequence consensus. (The identifier is a misspelling of "SEQUENCE"; kept because other code refers to it.) */
	private static final String GC_SEQUENSE_CONSENSUS = "seq_cons";
	/** Consensus secondary structure. */
	private static final String GC_SECONDARY_STRUCTURE = "SS_cons";
	/** Consensus surface accessibility. */
	private static final String GC_SURFACE_ACCESSIBILITY = "SA_cons";
	/** Consensus trans-membrane annotation. */
	private static final String GC_TRANS_MEMBRANE = "TM_cons";
	/** Consensus posterior probability. */
	private static final String GC_POSTERIOR_PROBABILITY = "PP_cons";
	/** Consensus ligand binding. */
	private static final String GC_LIGAND_BINDING = "LI_cons";
	/** Consensus active site. */
	private static final String GC_ACTIVE_SITE = "AS_cons";
	/** Consensus active site, Pfam predicted. */
	private static final String GC_AS_PFAM_PREDICTED = "pAS_cons";
	/** Consensus active site, from SwissProt. */
	private static final String GC_AS_SWISSPROT = "sAS_cons";
	/** Consensus intron annotation. */
	private static final String GC_INTRON = "IN_cons";
	/**
	 * Reference annotation. Often the consensus RNA or protein sequence is used as a reference. Any non-gap character
	 * (e.g. x's) can indicate consensus/conserved/match columns; .'s or -'s indicate insert columns; ~'s indicate
	 * unaligned insertions. Upper and lower case can be used to discriminate strong and weakly conserved residues
	 * respectively.
	 */
	private static final String GC_REFERENCE_ANNOTATION = "RF";
	/**
	 * Indicates which columns in an alignment should be masked, such that the emission probabilities for match states
	 * corresponding to those columns will be the background distribution.
	 */
	private static final String GC_MODEL_MASK = "MM";

	/**
	 * The structure currently being filled while parsing. A new instance is created when a "# STOCKHOLM" header line
	 * is read, and the field is set back to null once the structure has been handed to the caller.
	 */
	private StockholmStructure stockholmStructure;
	// private boolean endFile = false;

	// Remnants of a removed per-line status check, kept for reference:
	// private static final int STATUS_OUTSIDE_FILE = 0;
	// private static final int STATUS_INSIDE_FILE = 10;
	// private static final int STATUS_IN_SEQUENCE = 20;
	//
	// private int status=STATUS_OUTSIDE_FILE;

	/**
	 * Scanner over {@link #cashedInputStream}. It is created on first use and kept between calls, so that consecutive
	 * structures can be read from the same stream; it is discarded when a different stream is passed in.
	 */
	Scanner internalScanner = null;
	/**
	 * The input stream used by the last parse call ("cashed" is a misspelling of "cached"). It is compared by
	 * identity to decide whether {@link #internalScanner} can be reused.
	 */
	private InputStream cashedInputStream;

	/**
	 * Parses the first alignment of a Stockholm file and returns it as a {@link StockholmStructure}.
	 * <p>
	 * This method is meant for one-shot access to a file: the underlying stream is always closed before the method
	 * returns, even when parsing fails. Any subsequent call to {@link #parseNext(int)} will therefore throw an
	 * exception or behave unpredictably.
	 *
	 * @param filename
	 *            path of the file to read; it is opened through {@link InputStreamProvider#getInputStream(String)}
	 * @return the first Stockholm structure found in the file
	 * @throws IOException
	 *             when an exception occurred while opening/reading/closing the file
	 * @throws ParserException
	 *             if unexpected format is encountered
	 */
	public StockholmStructure parse(String filename) throws IOException {
		// try-with-resources releases the stream even if parsing throws (the original leaked it in that case)
		try (InputStream inStream = new InputStreamProvider().getInputStream(filename)) {
			return parse(inStream);
		}
	}

	/**
	 * Parses up to {@code max} alignments of a Stockholm file and returns them as {@link StockholmStructure} objects.
	 * <p>
	 * This method does not close the file after doing its job, to allow for further calls of
	 * {@link #parseNext(int)}.
	 *
	 * @see #parseNext(int)
	 *
	 * @param filename
	 *            file from where to read the content. see {@link InputStreamProvider} for more details.
	 * @param max
	 *            maximum number of structures to read, {@link #INFINITY} for all.
	 * @return a list of {@link StockholmStructure} containing the parsed structures.
	 * @throws IOException
	 *             when an exception occurred while opening/reading the file.
	 * @throws ParserException
	 *             if unexpected format is encountered
	 * @throws IllegalArgumentException
	 *             if {@code max} is smaller than {@link #INFINITY}
	 */
	public List<StockholmStructure> parse(String filename, int max) throws IOException {
		// Reject an invalid limit before touching the file, so that a bad argument cannot leave an open stream behind.
		if (max < INFINITY) {
			throw new IllegalArgumentException("max can't be -ve value " + max);
		}
		InputStream inStream = new InputStreamProvider().getInputStream(filename);
		// The stream is deliberately left open: it is remembered by the parser for parseNext(int).
		return parse(inStream, max);
	}

	/**
	 * Parses an {@link InputStream} and returns the first contained alignment as a {@link StockholmStructure} object.
	 * Used mainly for multiple files within the same input stream (e.g. when reading from Pfam flat files).
	 * <p>
	 * This method leaves the stream open for further calls of {@link #parseNext(int)}.
	 *
	 * @see #parseNext(int)
	 * @param inStream
	 *            the {@link InputStream} containing the file to read.
	 * @return a {@link StockholmStructure} object representing the first alignment in the stream.
	 * @throws IOException
	 *             in case an I/O exception occurred.
	 * @throws ParserException
	 *             if the stream contains no Stockholm structure, or an unexpected format is encountered
	 */
	public StockholmStructure parse(InputStream inStream) throws IOException {
		List<?> structures = parse(inStream, 1);
		if (structures.isEmpty()) {
			// report the real problem instead of an IndexOutOfBoundsException from get(0)
			throw new ParserException("No Stockholm structure found in the input stream");
		}
		return (StockholmStructure) structures.get(0);
	}

	/**
	 * Parses an {@link InputStream} and returns at most {@code max} structures contained in it.
	 * <p>
	 * This method leaves the stream open for further calls of {@link #parse(InputStream, int)} (same function) or
	 * {@link #parseNext(int)}. Passing the same stream instance again continues where the previous call stopped;
	 * passing a different stream starts reading that stream from its current position.
	 *
	 * @see #parseNext(int)
	 * @param inStream
	 *            the stream to parse
	 * @param max
	 *            maximum number of structures to try to parse, {@link #INFINITY} to try to obtain as much as possible.
	 * @return a {@link List} of {@link StockholmStructure} objects. If there are no more structures, an empty list is
	 *         returned.
	 * @throws IOException
	 *             in case an I/O Exception occurred.
	 * @throws IllegalArgumentException
	 *             if {@code max} is smaller than {@link #INFINITY}
	 */
	public List<StockholmStructure> parse(InputStream inStream, int max) throws IOException {
		if (max < INFINITY) {
			throw new IllegalArgumentException("max can't be -ve value " + max);
		}
		if (inStream != this.cashedInputStream) {
			// a different stream: drop the scanner that was bound to the previous one
			this.cashedInputStream = inStream;
			this.internalScanner = null;
		}
		if (internalScanner == null) {
			internalScanner = new Scanner(inStream);
		}

		List<StockholmStructure> structures = new ArrayList<>();
		int remaining = max;
		// INFINITY means "no limit": keep going until the scanner yields no further structure.
		// (The original condition "max != INFINITY && ..." skipped the loop entirely in that case.)
		while (max == INFINITY || remaining-- > 0) {
			StockholmStructure structure = parse(internalScanner);
			if (structure == null) {
				break; // nothing left to read
			}
			structures.add(structure);
		}
		return structures;
	}

	/**
	 * Tries to parse and return at most {@code max} further structures from the last used file or input stream.
	 * <p>
	 * One of {@link #parse(InputStream)}, {@link #parse(InputStream, int)} or {@link #parse(String, int)} must have
	 * been called before calling this function.
	 *
	 * @param max
	 *            maximum number of structures to try to parse, {@link #INFINITY} to try to obtain as much as possible.
	 * @return a {@link List} of the parsed {@link StockholmStructure} objects; empty if there are no more structures.
	 * @throws IOException
	 *             in case an I/O Exception occurred.
	 * @throws IllegalStateException
	 *             if no input stream has been parsed by this parser yet
	 */
	public List<StockholmStructure> parseNext(int max) throws IOException {
		if (this.cashedInputStream == null) {
			// fail with a clear message instead of a NullPointerException raised while creating the scanner
			throw new IllegalStateException(
					"No input stream to continue from; call one of the parse(...) methods first");
		}
		return parse(this.cashedInputStream, max);
	}

	/**
	 * Parses the next alignment from the given scanner and returns it as a {@link StockholmStructure}. This method
	 * returns just after reaching the end of structure delimiter line ("//"), leaving any remaining lines unconsumed.
	 * <p>
	 * Lines are handled as follows: blank lines are skipped; a line starting with "# STOCKHOLM" starts a new
	 * structure; lines starting with "#=G" are markup (GF, GC, GS or GR; other "#=G" types are ignored); every other
	 * line is treated as a sequence line.
	 *
	 * @param scanner
	 *            from where to read the file content; if {@code null}, the scanner of the last parsed stream is used
	 * @return the parsed structure, or {@code null} if no "# STOCKHOLM" header was read before the delimiter or the
	 *         end of the input (i.e. there are no more structures).
	 * @throws IOException
	 *             if the source underlying the scanner failed while being read
	 * @throws ParserException
	 *             if a header, markup or sequence line is malformed, or if content appears before the header
	 * @throws IllegalArgumentException
	 *             if no scanner is given and none is available from a previous call
	 * @throws RuntimeException
	 *             if the sequences of the parsed structure do not all have the same length
	 */
	StockholmStructure parse(Scanner scanner) throws IOException {
		if (scanner == null) {
			if (internalScanner == null) {
				throw new IllegalArgumentException("No Scanner defined");
			}
			scanner = internalScanner;
		}

		while (scanner.hasNextLine()) {
			String line = scanner.nextLine();
			if (line.trim().isEmpty()) {
				continue; // ignore empty lines
			}
			if (line.startsWith("#=G")) {
				parseMarkupLine(line);
			} else if (line.startsWith("# STOCKHOLM")) {
				// header line: "# STOCKHOLM <version>"
				String[] header = line.split("\\s+");
				if (header.length < 3) {
					throw new ParserException("Malformed Stockholm header line: [" + line + "]");
				}
				this.stockholmStructure = new StockholmStructure();
				this.stockholmStructure.getFileAnnotation().setFormat(header[1]);
				this.stockholmStructure.getFileAnnotation().setVersion(header[2]);
			} else if ("//".equals(line.trim())) {
				break; // end of this structure; following lines belong to the next call
			} else {
				// Most probably this line corresponds to a sequence. Something like:
				// O83071/192-246 MTCRAQLIAVPRASSLAEAIACAQKMRVSRVPVYERS
				requireHeader(line);
				handleSequenceLine(line);
			}
		}

		StockholmStructure structure = this.stockholmStructure;
		this.stockholmStructure = null;

		// Scanner swallows I/O errors of its source and just reports "no more lines"; surface them here so that a
		// truncated read is not mistaken for a complete structure.
		IOException readFailure = scanner.ioException();
		if (readFailure != null) {
			throw new IOException("Error parsing Stockholm file", readFailure);
		}

		if (structure != null) {
			// all aligned sequences must have the same number of columns
			int expectedLength = -1;
			Map<String, StringBuffer> sequences = structure.getSequences();
			for (StringBuffer sequence : sequences.values()) {
				if (expectedLength == -1) {
					expectedLength = sequence.length();
				} else if (expectedLength != sequence.length()) {
					throw new RuntimeException("Sequences have different lengths");
				}
			}
		}
		return structure;
	}

	/** Splits a "#=G?" markup line into its fields and passes them to the handler for its markup type. */
	private void parseMarkupLine(String line) {
		if (line.startsWith(GENERIC_PER_FILE_ANNOTATION, 2)) {
			// #=GF <feature> <free text>
			requireHeader(line);
			int featureEnd = fieldEnd(line, 5);
			handleFileAnnotation(line.substring(5, featureEnd), line.substring(featureEnd).trim());
		} else if (line.startsWith(GENERIC_PER_CONSENSUS_ANNOTATION, 2)) {
			// #=GC <feature> <1 char per column>
			requireHeader(line);
			int featureEnd = fieldEnd(line, 5);
			handleConsensusAnnotation(line.substring(5, featureEnd), line.substring(featureEnd).trim());
		} else if (line.startsWith(GENERIC_PER_SEQUENCE_ANNOTATION, 2)) {
			// #=GS <seqname> <feature> <free text>
			requireHeader(line);
			int nameEnd = fieldEnd(line, 5);
			int featureStart = skipWhitespace(line, nameEnd);
			int featureEnd = fieldEnd(line, featureStart);
			handleSequenceAnnotation(line.substring(5, nameEnd), line.substring(featureStart, featureEnd),
					line.substring(featureEnd).trim());
		} else if (line.startsWith(GENERIC_PER_RESIDUE_ANNOTATION, 2)) {
			// #=GR <seqname> <feature> <1 char per residue>
			requireHeader(line);
			int nameEnd = fieldEnd(line, 5);
			int featureStart = skipWhitespace(line, nameEnd);
			int featureEnd = fieldEnd(line, featureStart);
			handleResidueAnnotation(line.substring(5, nameEnd), line.substring(featureStart, featureEnd),
					line.substring(featureEnd).trim());
		}
		// any other "#=G" markup type is ignored
	}

	/** Fails with a {@link ParserException} if content is read before any "# STOCKHOLM" header line. */
	private void requireHeader(String line) {
		if (this.stockholmStructure == null) {
			throw new ParserException("Content found before the '# STOCKHOLM' header line:\n" + line);
		}
	}

	/** Returns the index of the space that ends the field starting at {@code from}, failing if there is none. */
	private static int fieldEnd(String line, int from) {
		int end = line.indexOf(' ', from);
		if (end < 0) {
			throw new ParserException("Malformed Stockholm markup line, a field is missing:\n" + line);
		}
		return end;
	}

	/** Returns the index of the first non-whitespace character at or after {@code from}, failing if there is none. */
	private static int skipWhitespace(String line, int from) {
		int index = from;
		while (index < line.length() && line.charAt(index) <= ' ') {
			index++;
		}
		if (index >= line.length()) {
			throw new ParserException("Malformed Stockholm markup line, a field is missing:\n" + line);
		}
		return index;
	}

	/**
	 * Appends the residues of one sequence line to the structure being parsed.
	 * <p>
	 * The line is expected to consist of a sequence name and the aligned sequence, separated by white space, e.g.:
	 * {@code COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA}
	 * <p>
	 * N.B.: This function can't tolerate sequences with intrinsic white space.
	 *
	 * @param line
	 *            the line to be parsed
	 * @throws ParserException
	 *             if the line does not split into exactly a name and a sequence
	 */
	private void handleSequenceLine(String line) {
		final String[] tokens = line.split("\\s+");
		if (tokens.length == 2) {
			final String sequenceName = tokens[0];
			final String residues = tokens[1];
			stockholmStructure.appendToSequence(sequenceName, residues);
			return;
		}
		throw new ParserException("Could not split sequence line into sequence name and sequence:\n" + line);
	}

	/**
	 * Stores one per-file annotation, i.e. the content of a line of the form
	 * {@code #=GF <feature> <Generic per-File annotation, free text>}, in the file annotation of the structure being
	 * parsed. Unknown features are only logged as a warning.
	 *
	 * @param featureName
	 *            the two-letter feature tag of the line
	 * @param value
	 *            the annotation text following the feature tag
	 */
	private void handleFileAnnotation(String featureName, String value) {
		switch (featureName) {
		case GF_ACCESSION_NUMBER:
			stockholmStructure.getFileAnnotation().setGFAccessionNumber(value);
			break;
		case GF_IDENTIFICATION:
			stockholmStructure.getFileAnnotation().setGFIdentification(value);
			break;
		case GF_DB_REFERENCE:
			stockholmStructure.getFileAnnotation().addDBReference(value);
			break;
		case GF_DEFINITION:
			stockholmStructure.getFileAnnotation().setGFDefinition(value);
			break;
		case GF_AUTHOR:
			stockholmStructure.getFileAnnotation().setGFAuthors(value);
			break;
		case GF_ALIGNMENT_METHOD:
			stockholmStructure.getFileAnnotation().setAlignmentMethod(value);
			break;
		case GF_BUILD_METHOD:
			stockholmStructure.getFileAnnotation().addGFBuildMethod(value);
			break;
		case GF_SEARCH_METHOD:
			stockholmStructure.getFileAnnotation().setGFSearchMethod(value);
			break;
		case GF_SOURCE_SEED:
			stockholmStructure.getFileAnnotation().setGFSourceSeed(value);
			break;
		case GF_SOURCE_STRUCTURE:
			stockholmStructure.getFileAnnotation().setGFSourceStructure(value);
			break;
		case GF_GATHERING_THRESHOLD:
			stockholmStructure.getFileAnnotation().setGFGatheringThreshs(value);
			break;
		case GF_TRUSTED_CUTOFF:
			stockholmStructure.getFileAnnotation().setGFTrustedCutoffs(value);
			break;
		case GF_NOISE_CUTOFF:
			stockholmStructure.getFileAnnotation().setGFNoiseCutoffs(value);
			break;
		case GF_TYPE_FIELD:
			stockholmStructure.getFileAnnotation().setGFTypeField(value);
			break;
		case GF_PREVIOUS_IDS:
			stockholmStructure.getFileAnnotation().setGFPreviousIDs(value);
			break;
		case GF_SEQUENCE:
			stockholmStructure.getFileAnnotation().setGFNumSequences(value);
			break;
		case GF_DB_COMMENT:
			stockholmStructure.getFileAnnotation().setGFDBComment(value);
			break;
		case GF_REFERENCE_COMMENT:
			stockholmStructure.getFileAnnotation().setGFRefComment(value);
			break;
		case GF_REFERENCE_NUMBER: {
			// a reference number line opens a new reference; the following RM/RT/RA/RL lines fill it in
			StockholmFileAnnotationReference newReference = new StockholmFileAnnotationReference();
			stockholmStructure.getFileAnnotation().getReferences().add(newReference);
			break;
		}
		case GF_REFERENCE_MEDLINE:
			stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefMedline(value);
			break;
		case GF_REFERENCE_TITLE:
			stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefTitle(value);
			break;
		case GF_REFERENCE_AUTHOR:
			stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefAuthor(value);
			break;
		case GF_REFERENCE_LOCALTION:
			stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefLocation(value);
			break;
		case GF_KEYWORDS:
			stockholmStructure.getFileAnnotation().setGFKeywords(value);
			break;
		case GF_COMMENT:
			stockholmStructure.getFileAnnotation().addToGFComment(value);
			break;
		case GF_PFAM_ACCESSION:
			stockholmStructure.getFileAnnotation().setGFPfamAccession(value);
			break;
		case GF_LOCATION:
			stockholmStructure.getFileAnnotation().setGFLocation(value);
			break;
		case GF_WIKIPEDIA_LINK:
			stockholmStructure.getFileAnnotation().setGFWikipediaLink(value);
			break;
		case GF_CLAN:
			stockholmStructure.getFileAnnotation().setGFClan(value);
			break;
		case GF_MEMBERSHIP:
			stockholmStructure.getFileAnnotation().setGFMembership(value);
			break;
		case GF_NEW_HAMPSHIRE:
			stockholmStructure.getFileAnnotation().addGFNewHampshire(value);
			break;
		case GF_TREE_ID:
			stockholmStructure.getFileAnnotation().addGFTreeID(value);
			break;
		case GF_FALSE_DISCOVERY_RATE:
			stockholmStructure.getFileAnnotation().addGFFalseDiscoveryRate(value);
			break;
		default:
			// unknown feature
			logger.warn("Unknown File Feature [{}].\nPlease contact the Biojava team.", featureName);
			break;
		}
	}

	/**
	 * Stores one per-column annotation in the consensus annotation of the structure being parsed. The input usually
	 * comes from a single line of the form
	 * {@code #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>}. Unknown features are only
	 * logged as a warning.
	 *
	 * @param featureName
	 *            the feature tag of the line
	 * @param value
	 *            the annotation text following the feature tag
	 */
	private void handleConsensusAnnotation(String featureName, String value) {
		switch (featureName) {
		case GC_SECONDARY_STRUCTURE:
			stockholmStructure.getConsAnnotation().setSecondaryStructure(value);
			break;
		case GC_SEQUENSE_CONSENSUS:
			stockholmStructure.getConsAnnotation().setSequenceConsensus(value);
			break;
		case GC_SURFACE_ACCESSIBILITY:
			stockholmStructure.getConsAnnotation().setSurfaceAccessibility(value);
			break;
		case GC_TRANS_MEMBRANE:
			stockholmStructure.getConsAnnotation().setTransMembrane(value);
			break;
		case GC_POSTERIOR_PROBABILITY:
			stockholmStructure.getConsAnnotation().setPosteriorProbability(value);
			break;
		case GC_LIGAND_BINDING:
			stockholmStructure.getConsAnnotation().setLigandBinding(value);
			break;
		case GC_ACTIVE_SITE:
			stockholmStructure.getConsAnnotation().setActiveSite(value);
			break;
		case GC_AS_PFAM_PREDICTED:
			stockholmStructure.getConsAnnotation().setAsPFamPredicted(value);
			break;
		case GC_AS_SWISSPROT:
			stockholmStructure.getConsAnnotation().setAsSwissProt(value);
			break;
		case GC_INTRON:
			stockholmStructure.getConsAnnotation().setIntron(value);
			break;
		case GC_REFERENCE_ANNOTATION:
			stockholmStructure.getConsAnnotation().setReferenceAnnotation(value);
			break;
		case GC_MODEL_MASK:
			stockholmStructure.getConsAnnotation().setModelMask(value);
			break;
		default:
			// unknown feature
			logger.warn("Unknown Consensus Feature [{}].\nPlease contact the Biojava team.", featureName);
			break;
		}
	}

	/**
	 * Stores one per-sequence annotation, i.e. the content of a line of the form
	 * {@code #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>}, in the structure being parsed.
	 * Unknown features are only logged as a warning.
	 *
	 * @param seqName
	 *            name of the sequence the annotation belongs to
	 * @param featureName
	 *            the feature tag of the line
	 * @param value
	 *            the annotation text following the feature tag
	 */
	private void handleSequenceAnnotation(String seqName, String featureName, String value) {
		switch (featureName) {
		case GS_ACCESSION_NUMBER:
			stockholmStructure.addGSAccessionNumber(seqName, value);
			break;
		case GS_DESCRIPTION:
			stockholmStructure.addGSDescription(seqName, value);
			break;
		case GS_DATABASE_REFERENCE:
			stockholmStructure.addGSdbReference(seqName, value);
			break;
		case GS_ORGANISM_SPECIES:
			stockholmStructure.addGSOrganismSpecies(seqName, value);
			break;
		case GS_ORGANISM_CLASSIFICATION:
			stockholmStructure.addGSOrganismClassification(seqName, value);
			break;
		case GS_LOOK:
			stockholmStructure.addGSLook(seqName, value);
			break;
		default:
			// unknown feature
			logger.warn("Unknown Sequence Feature [{}].\nPlease contact the Biojava team.", featureName);
			break;
		}
	}

	/**
	 * Stores one per-residue annotation, i.e. the content of a line of the form
	 * {@code #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>}, in the structure
	 * being parsed. Unknown features are only logged as a warning.
	 *
	 * @param seqName
	 *            name of the sequence the annotation belongs to
	 * @param featureName
	 *            the feature tag of the line
	 * @param value
	 *            the annotation text following the feature tag
	 */
	private void handleResidueAnnotation(String seqName, String featureName, String value) {
		switch (featureName) {
		case GR_SURFACE_ACCESSIBILITY:
			stockholmStructure.addSurfaceAccessibility(seqName, value);
			break;
		case GR_TRANS_MEMBRANE:
			stockholmStructure.addTransMembrane(seqName, value);
			break;
		case GR_POSTERIOR_PROBABILITY:
			stockholmStructure.addPosteriorProbability(seqName, value);
			break;
		case GR_LIGAND_BINDING:
			stockholmStructure.addLigandBinding(seqName, value);
			break;
		case GR_ACTIVE_SITE:
			stockholmStructure.addActiveSite(seqName, value);
			break;
		case GR_AS_PFAM_PREDICTED:
			stockholmStructure.addASPFamPredicted(seqName, value);
			break;
		case GR_AS_SWISSPROT:
			stockholmStructure.addASSwissProt(seqName, value);
			break;
		case GR_INTRON:
			stockholmStructure.addIntron(seqName, value);
			break;
		case GR_SECONDARY_STRUCTURE:
			stockholmStructure.addSecondaryStructure(seqName, value);
			break;
		default:
			// unknown feature
			logger.warn("Unknown Residue Feature [{}].\nPlease contact the Biojava team.", featureName);
			break;
		}
	}
}