org.biojava.nbio.structure.io.FastaStructureParser Maven / Gradle / Ivy
Show all versions of biojava-structure Show documentation
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure.io;
import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* Reads a protein sequence from a fasta file and attempts to match it to a
* 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
* the output, allowing structural alignments to be read from fasta files.
*
* Structures are loaded from an AtomCache. For this to work, the accession
* for each protein should be parsed from the fasta header line into a form
* understood by {@link AtomCache#getStructure(String)}.
*
* <p>
* Lowercase letters are sometimes used to specify unaligned residues.
* This information can be preserved by using a CasePreservingSequenceCreator,
* which allows the case of residues to be accessed through the
* {@link ProteinSequence#getUserCollection()} method.
*
* @author Spencer Bliven
*
*/
public class FastaStructureParser {
// Inputs: both are supplied at construction time and stored as given.
/** Source of the parsed sequences; process() calls reader.process() on it. */
private FastaReader reader;
/** Cache handed to the constructor. NOTE(review): presumably used to load each structure by accession -- confirm against the rest of process(). */
private AtomCache cache;
// Cached results of process(). The constructor sets all four to null, and
// process() only does its work while 'sequences' is still null.
/** Allocated by process() with one slot per parsed sequence. */
private String[] accessions;
/** The values of the map returned by reader.process(), copied into an array. */
private ProteinSequence[] sequences;
/** Allocated by process() with one slot per parsed sequence. */
private Structure[] structures;
/** Outer array has one slot per parsed sequence. NOTE(review): residues[i] presumably holds the residue numbers matched to sequences[i] -- confirm. */
private ResidueNumber[][] residues;
/**
 * Creates a parser that reads its fasta records from an input stream.
 *
 * The stream, header parser and sequence creator are wrapped in a new
 * {@link FastaReader}, which is then passed on to
 * {@link #FastaStructureParser(FastaReader, AtomCache)}.
 *
 * @param is stream handed to the new FastaReader
 * @param headerParser header parser handed to the new FastaReader
 * @param sequenceCreator sequence creator handed to the new FastaReader
 * @param cache cache stored on this parser
 */
public FastaStructureParser(
        InputStream is,
        SequenceHeaderParserInterface headerParser,
        SequenceCreatorInterface sequenceCreator,
        AtomCache cache) {
    this(new FastaReader(is, headerParser, sequenceCreator), cache);
}
/**
 * Creates a parser that reads its fasta records from a file.
 *
 * The file, header parser and sequence creator are wrapped in a new
 * {@link FastaReader}, which is then passed on to
 * {@link #FastaStructureParser(FastaReader, AtomCache)}.
 *
 * @param file file handed to the new FastaReader
 * @param headerParser header parser handed to the new FastaReader
 * @param sequenceCreator sequence creator handed to the new FastaReader
 * @param cache cache stored on this parser
 * @throws FileNotFoundException if raised while constructing the
 *         FastaReader for {@code file}
 */
public FastaStructureParser(
        File file,
        SequenceHeaderParserInterface headerParser,
        SequenceCreatorInterface sequenceCreator,
        AtomCache cache) throws FileNotFoundException {
    this(new FastaReader(file, headerParser, sequenceCreator), cache);
}
/**
 * Creates a parser around an already-configured reader.
 *
 * Nothing is parsed here. The result arrays start out as null, and
 * {@link #process()} fills them in.
 *
 * @param reader reader stored on this parser; process() calls its
 *        {@code process()} method
 * @param cache cache stored on this parser
 */
public FastaStructureParser(FastaReader reader, AtomCache cache) {
    // No results yet: process() treats a null 'sequences' as "not processed".
    this.residues = null;
    this.structures = null;
    this.sequences = null;
    this.accessions = null;

    // Keep the collaborators exactly as supplied.
    this.cache = cache;
    this.reader = reader;
}
/**
* Parses the fasta file and loads it into memory.
*
* Information can be subsequently accessed through
* {@link #getSequences()},
* {@link #getStructures()},
* {@link #getResidues()}, and
* {@link #getAccessions()}.
*
* @throws IOException
* @throws StructureException
*/
public void process() throws IOException, StructureException {
if(sequences == null) { // only process once, then return cached values
Map sequenceMap = reader.process();
sequences = sequenceMap.values().toArray(new ProteinSequence[0]);
accessions = new String[sequences.length];
structures = new Structure[sequences.length];
residues = new ResidueNumber[sequences.length][];
// Match each sequence to a series of PDB Residue numbers
for(int i=0;i