All downloads are free. Search and download functionality uses the official Maven repository.

org.biojava.nbio.structure.io.FastaStructureParser Maven / Gradle / Ivy

There is a newer version: 7.1.3
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Map;


/**
 * Reads a protein sequence from a fasta file and attempts to match it to a
 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
 * the output, allowing structural alignments to be read from fasta files.
 *
 * 

 * <p>
 * Structures are loaded from an AtomCache. For this to work, the accession
 * for each protein should be parsed from the fasta header line into a form
 * understood by {@link AtomCache#getStructure(String)}.
 *

Lowercase letters are sometimes used to specify unaligned residues. * This information can be preserved by using a CasePreservingSequenceCreator, * which allows the case of residues to be accessed through the * {@link ProteinSequence#getUserCollection()} method. * * @author Spencer Bliven * */ public class FastaStructureParser { // inputs private FastaReader reader; private AtomCache cache; // cache processed data private String[] accessions; private ProteinSequence[] sequences; private Structure[] structures; private ResidueNumber[][] residues; public FastaStructureParser(InputStream is, SequenceHeaderParserInterface headerParser, SequenceCreatorInterface sequenceCreator, AtomCache cache) { this(new FastaReader( is, headerParser, sequenceCreator),cache); } public FastaStructureParser(File file, SequenceHeaderParserInterface headerParser, SequenceCreatorInterface sequenceCreator, AtomCache cache) throws FileNotFoundException { this(new FastaReader( file, headerParser, sequenceCreator), cache); } public FastaStructureParser(FastaReader reader, AtomCache cache) { this.reader = reader; this.cache = cache; this.accessions = null; this.sequences = null; this.structures = null; this.residues = null; } /** * Parses the fasta file and loads it into memory. * * Information can be subsequently accessed through * {@link #getSequences()}, * {@link #getStructures()}, * {@link #getResidues()}, and * {@link #getAccessions()}. * * @throws IOException * @throws StructureException */ public void process() throws IOException, StructureException { if(sequences == null) { // only process once, then return cached values Map sequenceMap = reader.process(); sequences = sequenceMap.values().toArray(new ProteinSequence[0]); accessions = new String[sequences.length]; structures = new Structure[sequences.length]; residues = new ResidueNumber[sequences.length][]; // Match each sequence to a series of PDB Residue numbers for(int i=0;i





© 2015 - 2024 Weber Informatics LLC | Privacy Policy