
org.biojava.nbio.structure.io.FastaAFPChainConverter Maven / Gradle / Ivy
/**
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created on 2013-05-28
* Created by Douglas Myers-Turnbull
*
* @since 3.0.6
*/
package org.biojava.nbio.structure.io;
import org.biojava.nbio.core.alignment.template.AlignedSequence;
import org.biojava.nbio.core.alignment.template.SequencePair;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.align.model.AFPChain;
import org.biojava.nbio.structure.align.util.AlignmentTools;
import org.biojava.nbio.structure.align.xml.AFPChainXMLConverter;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.io.CasePreservingProteinSequenceCreator;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.biojava.nbio.core.util.SequenceTools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/**
* A collection of static utilities to convert between {@link AFPChain AFPChains} and {@link FastaSequence FastaSequences}.
*
* @author dmyersturnbull
* @see StructureSequenceMatcher
* @see FastaStructureParser
* @see SeqRes2AtomAligner
*/
public class FastaAFPChainConverter {
private final static Logger logger = LoggerFactory.getLogger(FastaAFPChainConverter.class);
public static AFPChain cpFastaToAfpChain(String first, String second, Structure structure, int cpSite) throws StructureException, CompoundNotFoundException {
ProteinSequence s1 = new ProteinSequence(first);
s1.setUserCollection(getAlignedUserCollection(first));
ProteinSequence s2 = new ProteinSequence(second);
s2.setUserCollection(getAlignedUserCollection(second));
return cpFastaToAfpChain(s1, s2, structure, cpSite);
}
/**
* Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site
* {@link cpSite} residues to the right.
*
* @param fastaFile A FASTA file containing exactly 2 sequences, the first unpermuted and the second permuted
* @param cpSite
* The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence
* are acceptable
* @throws IOException
* @throws StructureException
*/
public static AFPChain cpFastaToAfpChain(File fastaFile, Structure structure, int cpSite) throws IOException, StructureException {
InputStream inStream = new FileInputStream(fastaFile);
SequenceCreatorInterface creator = new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
SequenceHeaderParserInterface headerParser = new GenericFastaHeaderParser();
FastaReader fastaReader = new FastaReader(inStream, headerParser, creator);
LinkedHashMap sequences = fastaReader.process();
inStream.close();
Iterator iter = sequences.values().iterator();
ProteinSequence first = iter.next();
ProteinSequence second = iter.next();
return cpFastaToAfpChain(first, second, structure, cpSite);
}
/**
* Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site
* {@link cpSite} residues to the right.
*
* @param first The unpermuted sequence
* @param second The sequence permuted by cpSite
* @param cpSite
* The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence
* are acceptable
* @throws StructureException
*/
public static AFPChain cpFastaToAfpChain(ProteinSequence first, ProteinSequence second, Structure structure, int cpSite)
throws StructureException {
if (structure == null) {
throw new IllegalArgumentException("The structure is null");
}
if (first == null) {
throw new IllegalArgumentException("The sequence is null");
}
// we need to find the ungapped CP site
int gappedCpShift = 0;
int ungappedCpShift = 0;
while (ungappedCpShift < Math.abs(cpSite)) {
char c;
try {
if (cpSite <= 0) {
c = second.getSequenceAsString().charAt(gappedCpShift);
} else {
c = second.getSequenceAsString().charAt(first.getLength()-1 - gappedCpShift);
}
} catch (StringIndexOutOfBoundsException e) {
throw new IllegalArgumentException("CP site of " + cpSite + " is wrong");
}
if (c != '-') {
ungappedCpShift++;
}
gappedCpShift++;
}
Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure);
Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure); // can't use cloneCAArray because it doesn't set parent group.chain.structure
ProteinSequence antipermuted = null;
try {
antipermuted = new ProteinSequence(SequenceTools.permuteCyclic(second.getSequenceAsString(), gappedCpShift));
} catch (CompoundNotFoundException e) {
// this can't happen, the original sequence comes from a ProteinSequence
logger.error("Unexpected error while creating protein sequence: {}. This is most likely a bug.",e.getMessage() );
}
ResidueNumber[] residues = StructureSequenceMatcher.matchSequenceToStructure(first, structure);
ResidueNumber[] antipermutedResidues = StructureSequenceMatcher.matchSequenceToStructure(antipermuted, structure);
ResidueNumber[] nonpermutedResidues = new ResidueNumber[antipermutedResidues.length];
SequenceTools.permuteCyclic(antipermutedResidues, nonpermutedResidues, -gappedCpShift);
// nullify ResidueNumbers that have a lowercase sequence character
if (first.getUserCollection() != null) {
CasePreservingProteinSequenceCreator.setLowercaseToNull(first, residues);
}
if (second.getUserCollection() != null) {
CasePreservingProteinSequenceCreator.setLowercaseToNull(second, nonpermutedResidues);
}
// for (int i = 0; i < residues.length; i++) {
// if (residues[i] == null) {
// System.out.print("=");
// } else {
// System.out.print(sequence.getSequenceAsString().charAt(i));
// }
// }
// System.out.println();
// for (int i = 0; i < residues.length; i++) {
// if (nonpermutedResidues[i] == null) {
// System.out.print("=");
// } else {
// System.out.print(second.getSequenceAsString().charAt(i));
// }
// }
// System.out.println();
return buildAlignment(ca1, ca2, residues, nonpermutedResidues);
}
/**
* Reads the file {@code fastaFile}, expecting exactly two sequences which give a pairwise alignment. Uses this and two structures to create an AFPChain corresponding to the alignment. Uses a
* {@link CasePreservingProteinSequenceCreator} and assumes that a residue is aligned if and only if it is given by an uppercase letter.
*
* @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
* @throws IOException
* @throws StructureException
*/
public static AFPChain fastaFileToAfpChain(File fastaFile, Structure structure1, Structure structure2)
throws IOException, StructureException {
InputStream inStream = new FileInputStream(fastaFile);
SequenceCreatorInterface creator = new CasePreservingProteinSequenceCreator(
AminoAcidCompoundSet.getAminoAcidCompoundSet());
SequenceHeaderParserInterface headerParser = new GenericFastaHeaderParser();
FastaReader fastaReader = new FastaReader(
inStream, headerParser, creator);
LinkedHashMap sequences = fastaReader.process();
inStream.close();
return fastaToAfpChain(sequences, structure1, structure2);
}
/**
* Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The
* sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures.
* @throws StructureException
* @throws CompoundNotFoundException
*/
public static AFPChain fastaStringToAfpChain(String sequence1, String sequence2, Structure structure1,
Structure structure2) throws StructureException, CompoundNotFoundException {
ProteinSequence seq1 = new ProteinSequence(sequence1);
ProteinSequence seq2 = new ProteinSequence(sequence2);
return fastaToAfpChain(seq1, seq2, structure1, structure2);
}
/**
* Uses two sequences each with a corresponding structure to create an AFPChain corresponding to the alignment. Provided only for convenience since FastaReaders return such maps.
*
* @param sequences
* A Map containing exactly two entries from sequence names as Strings to gapped ProteinSequences; the name is ignored
* @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
* @throws StructureException
*/
public static AFPChain fastaToAfpChain(Map sequences, Structure structure1,
Structure structure2) throws StructureException {
if (sequences.size() != 2) {
throw new IllegalArgumentException("There must be exactly 2 sequences, but there were " + sequences.size());
}
if (structure1 == null || structure2 == null) {
throw new IllegalArgumentException("A structure is null");
}
List seqs = new ArrayList();
List names = new ArrayList(2);
for (Map.Entry entry : sequences.entrySet()) {
seqs.add(entry.getValue());
names.add(entry.getKey());
}
return fastaToAfpChain(seqs.get(0), seqs.get(1), structure1, structure2);
}
/**
* TODO Write comment
* @param sequence1
* @param sequence2
* @param structure1
* @param structure2
* @return
* @throws StructureException
* @throws CompoundNotFoundException
*/
public static AFPChain fastaToAfpChain(String sequence1, String sequence2, Structure structure1,
Structure structure2) throws StructureException, CompoundNotFoundException {
ProteinSequence s1 = new ProteinSequence(sequence1);
s1.setUserCollection(getAlignedUserCollection(sequence1));
ProteinSequence s2 = new ProteinSequence(sequence2);
s2.setUserCollection(getAlignedUserCollection(sequence2));
return fastaToAfpChain(s1, s2, structure1, structure2);
}
/**
* Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The
* sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures. Assumes that a residue is aligned if and only if it is given by
* an uppercase letter.
* @param sequence1 Must have {@link ProteinSequence#getUserCollection()} set to document upper- and lower-case as aligned and unaligned; see {@link #getAlignedUserCollection(String)}
* @throws StructureException
*/
public static AFPChain fastaToAfpChain(ProteinSequence sequence1, ProteinSequence sequence2, Structure structure1,
Structure structure2) throws StructureException {
if (structure1 == null || structure2 == null) {
throw new IllegalArgumentException("A structure is null");
}
if (sequence1 == null || sequence2 == null) {
throw new IllegalArgumentException("A sequence is null");
}
Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure1);
Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure2);
ResidueNumber[] residues1 = StructureSequenceMatcher.matchSequenceToStructure(sequence1, structure1);
ResidueNumber[] residues2 = StructureSequenceMatcher.matchSequenceToStructure(sequence2, structure2);
// nullify ResidueNumbers that have a lowercase sequence character
if (sequence1.getUserCollection() != null) {
CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence1, residues1);
}
if (sequence2.getUserCollection() != null) {
CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence2, residues2);
}
return buildAlignment(ca1, ca2, residues1, residues2);
}
/**
* Provided only for convenience.
*
* @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
* @throws StructureException
*/
public static AFPChain fastaToAfpChain(SequencePair, AminoAcidCompound> alignment,
Structure structure1, Structure structure2) throws StructureException {
List, AminoAcidCompound>> seqs = alignment.getAlignedSequences();
StringBuilder sb1 = new StringBuilder();
for (AminoAcidCompound a : seqs.get(0)) {
sb1.append(a.getBase());
}
try {
ProteinSequence seq1 = new ProteinSequence(sb1.toString());
StringBuilder sb2 = new StringBuilder();
for (AminoAcidCompound a : seqs.get(1)) {
sb1.append(a.getBase());
}
ProteinSequence seq2 = new ProteinSequence(sb2.toString());
LinkedHashMap map = new LinkedHashMap();
map.put(structure1.getName(), seq1);
map.put(structure2.getName(), seq2);
return fastaToAfpChain(map, structure1, structure2);
} catch (CompoundNotFoundException e) {
logger.error("Unexpected error while creating protein sequences: {}. This is most likely a bug.",e.getMessage());
return null;
}
}
/**
* Builds an {@link AFPChain} from already-matched arrays of atoms and residues.
*
* @param ca1
* An array of atoms in the first structure
* @param ca2
* An array of atoms in the second structure
* @param residues1
* An array of {@link ResidueNumber ResidueNumbers} in the first structure that are aligned. Only null ResidueNumbers are considered to be unaligned
* @param residues2
* An array of {@link ResidueNumber ResidueNumbers} in the second structure that are aligned. Only null ResidueNumbers are considered to be unaligned
* @throws StructureException
*/
private static AFPChain buildAlignment(Atom[] ca1, Atom[] ca2, ResidueNumber[] residues1, ResidueNumber[] residues2)
throws StructureException {
// remove any gap
// this includes the ones introduced by the nullifying above
List alignedResiduesList1 = new ArrayList();
List alignedResiduesList2 = new ArrayList();
for (int i = 0; i < residues1.length; i++) {
if (residues1[i] != null && residues2[i] != null) {
alignedResiduesList1.add(residues1[i]);
alignedResiduesList2.add(residues2[i]);
}
}
ResidueNumber[] alignedResidues1 = alignedResiduesList1.toArray(new ResidueNumber[alignedResiduesList1.size()]);
ResidueNumber[] alignedResidues2 = alignedResiduesList2.toArray(new ResidueNumber[alignedResiduesList2.size()]);
AFPChain afpChain = AlignmentTools.createAFPChain(ca1, ca2, alignedResidues1, alignedResidues2);
afpChain.setAlgorithmName("unknown");
AlignmentTools.updateSuperposition(afpChain, ca1, ca2);
afpChain.setBlockSize(new int[] {afpChain.getNrEQR()});
afpChain.setBlockRmsd(new double[] {afpChain.getTotalRmsdOpt()});
afpChain.setBlockGap(new int[] {afpChain.getGapLen()});
return afpChain;
}
/**
* Takes a protein sequence string with capital and lowercase letters and sets its {@link ProteinSequence#getUserCollection() user collection} to record which letters are uppercase (aligned) and which are lowercase (unaligned).
* @param sequence Make sure not to use {@link ProteinSequence#getSequenceAsString()} for this, as it won't preserve upper- and lower-case
*/
public static List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy