
org.biojava.nbio.structure.io.StructureSequenceMatcher Maven / Gradle / Ivy
/**
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created by Spencer Bliven
*
*/
package org.biojava.nbio.structure.io;
import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix;
import org.biojava.nbio.core.alignment.template.AlignedSequence;
import org.biojava.nbio.core.alignment.template.SequencePair;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStreamReader;
import java.util.*;
/**
* A utility class with methods for matching ProteinSequences with
* Structures.
* @author Spencer Bliven
*
*/
public class StructureSequenceMatcher {
private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class);
/**
* Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in
* {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty.
* The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not.
* @param sequence The input protein sequence
* @param wholeStructure The structure from which to take a substructure
* @return The resulting structure
* @throws StructureException
* @see {@link #matchSequenceToStructure(ProteinSequence, Structure)}
*/
public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) {
ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure);
Structure structure = wholeStructure.clone();
structure.setChains(new ArrayList<>());
// structure.getHetGroups().clear();
Chain currentChain = null;
for (ResidueNumber rn : rns) {
if (rn == null) continue;
Group group; // note that we don't clone
try {
group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn);
} catch (StructureException e) {
throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e);
}
Chain chain = new ChainImpl();
chain.setName(group.getChain().getName());
chain.setId(group.getChain().getId());
if (currentChain == null || !currentChain.getId().equals(chain.getId())) {
structure.addChain(chain);
chain.setEntityInfo(group.getChain().getEntityInfo());
chain.setStructure(structure);
chain.setId(group.getChain().getId());
chain.setName(group.getChain().getName());
currentChain = chain;
}
currentChain.addGroup(group);
}
return structure;
}
/**
* Generates a ProteinSequence corresponding to the sequence of struct,
* and maintains a mapping from the sequence back to the original groups.
*
* Chains are appended to one another. 'X' is used for heteroatoms.
*
* @param struct Input structure
* @param groupIndexPosition An empty map, which will be populated with
* (residue index in returned ProteinSequence) -> (Group within struct)
* @return A ProteinSequence with the full sequence of struct. Chains are
* concatenated in the same order as the input structures
*
* @see {@link SeqRes2AtomAligner#getFullAtomSequence(List, Map)}, which
* does the heavy lifting.
*
*/
public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map groupIndexPosition ) {
if( groupIndexPosition != null) {
groupIndexPosition.clear();
}
StringBuilder seqStr = new StringBuilder();
for(Chain chain : struct.getChains()) {
List groups = chain.getAtomGroups();
Map chainIndexPosition = new HashMap();
int prevLen = seqStr.length();
// get the sequence for this chain
String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false);
seqStr.append(chainSeq);
// fix up the position to include previous chains, and map the value back to a Group
for(Integer seqIndex : chainIndexPosition.keySet()) {
Integer groupIndex = chainIndexPosition.get(seqIndex);
groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex));
}
}
ProteinSequence s = null;
try {
s = new ProteinSequence(seqStr.toString());
} catch (CompoundNotFoundException e) {
// I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24
// we can log an error if it does, it would mean there's a bad bug somewhere
logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage());
}
return s;
}
/**
* Given a sequence and the corresponding Structure, get the ResidueNumber
* for each residue in the sequence.
*
* Smith-Waterman alignment is used to match the sequences. Residues
* in the sequence but not the structure or mismatched between sequence
* and structure will have a null atom, while
* residues in the structure but not the sequence are ignored with a warning.
* @param seq The protein sequence. Should match the sequence of struct very
* closely.
* @param struct The corresponding protein structure
* @return A list of ResidueNumbers of the same length as seq, containing
* either the corresponding residue or null.
*/
public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) {
//1. Create ProteinSequence for struct while remembering to which group each residue corresponds
Map atomIndexPosition = new HashMap();
ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition);
// TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which)
//2. Run Smith-Waterman to get the alignment
// Identity substitution matrix with +1 for match, -1 for mismatch
// TODO
SubstitutionMatrix matrix =
new SimpleSubstitutionMatrix(
AminoAcidCompoundSet.getAminoAcidCompoundSet(),
(short)1, (short)-1 );
matrix = new SimpleSubstitutionMatrix(
AminoAcidCompoundSet.getAminoAcidCompoundSet(),
new InputStreamReader(
SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")),
"blosum100");
SequencePair pair =
Alignments.getPairwiseAlignment(seq, structSeq,
PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix);
//System.out.print(pair.toString());
//3. Convert the alignment back to Atoms
AlignedSequence alignedSeq = pair.getQuery();
AlignedSequence alignedStruct = pair.getTarget();
assert(alignedSeq.getLength() == alignedStruct.getLength());
// System.out.println(pair.toString(80));
// System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(),
// alignedSeq.getLength()-alignedSeq.getNumGaps(),
// alignedStruct.getLength()-alignedStruct.getNumGaps());
ResidueNumber[] ca = new ResidueNumber[seq.getLength()];
for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed
//skip missing residues from sequence. These probably represent alignment errors
if(alignedSeq.isGap(pos)) {
int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;
assert(structIndex > 0);//should be defined since seq gap
Group g = atomIndexPosition.get(structIndex);
logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.",
g.getChainId(),
g.getResidueNumber().toString(),
g.getChain().getStructure().getPDBCode() );
continue;
}
if(! alignedStruct.isGap(pos) ) {
int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed
int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed
Group g = atomIndexPosition.get(structIndex);
assert(0<=seqIndex && seqIndex < ca.length);
ca[seqIndex] = g.getResidueNumber(); //remains null for gaps
}
}
return ca;
}
/**
* Removes all gaps ('-') from a protein sequence
* @param gapped
* @return
*/
public static ProteinSequence removeGaps(ProteinSequence gapped) {
final String[] gapStrings = {"-","."};
StringBuilder seq = new StringBuilder();
CompoundSet aaSet = gapped.getCompoundSet();
AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length];
for(int i=0;i T[][] removeGaps(final T[][] gapped) {
if(gapped == null ) return null;
if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length);
final int nProts = gapped.length;
final int protLen = gapped[0].length; // length of gapped proteins
// Verify that input is rectangular
for(int i=0;i