
package org.biojava.nbio.structure.io.cif;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.stream.IntStream;
import javax.vecmath.Matrix4d;
import org.biojava.nbio.structure.AminoAcid;
import org.biojava.nbio.structure.AminoAcidImpl;
import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.AtomImpl;
import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.ChainImpl;
import org.biojava.nbio.structure.DBRef;
import org.biojava.nbio.structure.Element;
import org.biojava.nbio.structure.EntityInfo;
import org.biojava.nbio.structure.EntityType;
import org.biojava.nbio.structure.Group;
import org.biojava.nbio.structure.GroupType;
import org.biojava.nbio.structure.HetatomImpl;
import org.biojava.nbio.structure.NucleotideImpl;
import org.biojava.nbio.structure.PDBCrystallographicInfo;
import org.biojava.nbio.structure.PDBHeader;
import org.biojava.nbio.structure.PdbId;
import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.SeqMisMatch;
import org.biojava.nbio.structure.SeqMisMatchImpl;
import org.biojava.nbio.structure.Site;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureImpl;
import org.biojava.nbio.structure.StructureTools;
import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
import org.biojava.nbio.structure.io.BondMaker;
import org.biojava.nbio.structure.io.ChargeAdder;
import org.biojava.nbio.structure.io.EntityFinder;
import org.biojava.nbio.structure.io.FileParsingParameters;
import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
import org.biojava.nbio.structure.xtal.CrystalCell;
import org.biojava.nbio.structure.xtal.SpaceGroup;
import org.biojava.nbio.structure.xtal.SymoplibParser;
import org.rcsb.cif.model.FloatColumn;
import org.rcsb.cif.model.IntColumn;
import org.rcsb.cif.model.StrColumn;
import org.rcsb.cif.model.ValueKind;
import org.rcsb.cif.schema.mm.AtomSite;
import org.rcsb.cif.schema.mm.AtomSites;
import org.rcsb.cif.schema.mm.AuditAuthor;
import org.rcsb.cif.schema.mm.Cell;
import org.rcsb.cif.schema.mm.ChemComp;
import org.rcsb.cif.schema.mm.ChemCompBond;
import org.rcsb.cif.schema.mm.DatabasePDBRemark;
import org.rcsb.cif.schema.mm.DatabasePDBRev;
import org.rcsb.cif.schema.mm.DatabasePDBRevRecord;
import org.rcsb.cif.schema.mm.Em3dReconstruction;
import org.rcsb.cif.schema.mm.Entity;
import org.rcsb.cif.schema.mm.EntityPoly;
import org.rcsb.cif.schema.mm.EntityPolySeq;
import org.rcsb.cif.schema.mm.EntitySrcGen;
import org.rcsb.cif.schema.mm.EntitySrcNat;
import org.rcsb.cif.schema.mm.Exptl;
import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory;
import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier;
import org.rcsb.cif.schema.mm.PdbxDatabaseStatus;
import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor;
import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn;
import org.rcsb.cif.schema.mm.PdbxMolecule;
import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures;
import org.rcsb.cif.schema.mm.PdbxNonpolyScheme;
import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink;
import org.rcsb.cif.schema.mm.PdbxReferenceEntityList;
import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink;
import org.rcsb.cif.schema.mm.PdbxStructAssembly;
import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen;
import org.rcsb.cif.schema.mm.PdbxStructModResidue;
import org.rcsb.cif.schema.mm.PdbxStructOperList;
import org.rcsb.cif.schema.mm.Refine;
import org.rcsb.cif.schema.mm.Struct;
import org.rcsb.cif.schema.mm.StructAsym;
import org.rcsb.cif.schema.mm.StructConf;
import org.rcsb.cif.schema.mm.StructConn;
import org.rcsb.cif.schema.mm.StructConnType;
import org.rcsb.cif.schema.mm.StructKeywords;
import org.rcsb.cif.schema.mm.StructNcsOper;
import org.rcsb.cif.schema.mm.StructRef;
import org.rcsb.cif.schema.mm.StructRefSeq;
import org.rcsb.cif.schema.mm.StructRefSeqDif;
import org.rcsb.cif.schema.mm.StructSheetRange;
import org.rcsb.cif.schema.mm.StructSite;
import org.rcsb.cif.schema.mm.StructSiteGen;
import org.rcsb.cif.schema.mm.Symmetry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and
* use it to build up a {@link Structure} object.
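* <p>
* The consumer follows a simple life-cycle: {@link #prepare()}, one {@code consumeXxx(...)} call per
* category present in the file, {@link #finish()}, and finally {@link #getContainer()} to obtain the
* parsed {@link Structure}. A minimal usage sketch (assuming the {@code CifStructureConverter} helper
* of this package drives the consumer; the file path is only an example):
* <pre>{@code
* Structure structure = CifStructureConverter.fromPath(Paths.get("/path/to/4hhb.cif"));
* }</pre>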
* @author Sebastian Bittrich
* @since 6.0.0
*/
public class CifStructureConsumerImpl implements CifStructureConsumer {
private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class);
private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("yyyy-MM-dd")
.toFormatter(Locale.US);
private Structure structure;
private Chain currentChain;
private Group currentGroup;
private List<List<Chain>> allModels;
private List<Chain> currentModel;
private PDBHeader pdbHeader;
private String currentNmrModelNumber;
private Em3dReconstruction em3dReconstruction;
private List<Chain> entityChains;
private Entity entity;
private EntityPoly entityPoly;
private EntitySrcGen entitySrcGen;
private EntitySrcNat entitySrcNat;
private PdbxEntitySrcSyn entitySrcSyn;
private List<Chain> seqResChains;
private PdbxStructAssembly structAssembly;
private PdbxStructAssemblyGen structAssemblyGen;
private StructAsym structAsym;
private StructConn structConn;
private StructNcsOper structNcsOper;
private PdbxStructOperList structOpers;
private StructRef structRef;
private StructRefSeqDif structRefSeqDif;
private StructSiteGen structSiteGen;
private Map<String, String> asymId2entityId;
private Map<String, String> asymId2authorId;
private Matrix4d parsedScaleMatrix;
private final FileParsingParameters params;
public CifStructureConsumerImpl(FileParsingParameters params) {
this.params = params;
}
@Override
public void prepare() {
this.structure = new StructureImpl();
this.pdbHeader = new PDBHeader();
structure.setPDBHeader(pdbHeader);
this.allModels = new ArrayList<>();
this.currentModel = new ArrayList<>();
this.seqResChains = new ArrayList<>();
this.asymId2entityId = new HashMap<>();
this.asymId2authorId = new HashMap<>();
this.entityChains = new ArrayList<>();
}
@Override
public void consumeAtomSite(AtomSite atomSite) {
if (params.isHeaderOnly()) {
return;
}
StrColumn labelAsymId = atomSite.getLabelAsymId();
StrColumn authAsymId = atomSite.getAuthAsymId();
StrColumn groupPDB = atomSite.getGroupPDB();
IntColumn authSeqId = atomSite.getAuthSeqId();
StrColumn labelCompId = atomSite.getLabelCompId();
IntColumn id = atomSite.getId();
StrColumn labelAtomId = atomSite.getLabelAtomId();
FloatColumn cartnX = atomSite.getCartnX();
FloatColumn cartnY = atomSite.getCartnY();
FloatColumn cartnZ = atomSite.getCartnZ();
FloatColumn occupancy = atomSite.getOccupancy();
FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv();
StrColumn labelAltId = atomSite.getLabelAltId();
StrColumn typeSymbol = atomSite.getTypeSymbol();
StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode();
IntColumn labelSeqId = atomSite.getLabelSeqId();
IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum();
for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) {
boolean startOfNewChain = false;
Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex));
boolean isHetAtmInFile = false;
if (!"ATOM".equals(groupPDB.get(atomIndex))) {
if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
oneLetterCode = null;
}
isHetAtmInFile = true;
}
String insCodeString = pdbxPDBInsCode.isDefined()? pdbxPDBInsCode.get(atomIndex) : null;
Character insCode = null;
if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) {
insCode = insCodeString.charAt(0);
}
// non polymer chains (ligands and small molecules) will have a label_seq_id set to '.'
long seqId = labelSeqId.get(atomIndex);
String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex);
if (currentNmrModelNumber == null) {
currentNmrModelNumber = nmrModelNumber;
}
if (!currentNmrModelNumber.equals(nmrModelNumber)) {
currentNmrModelNumber = nmrModelNumber;
if (currentChain != null) {
currentChain.addGroup(currentGroup);
currentGroup.trimToSize();
}
allModels.add(currentModel);
currentModel = new ArrayList<>();
currentChain = null;
currentGroup = null;
}
String asymId = labelAsymId.get(atomIndex);
String authId = authAsymId.isDefined()? authAsymId.get(atomIndex) : asymId;
if (currentChain == null) {
currentChain = new ChainImpl();
currentChain.setName(authId);
currentChain.setId(asymId);
currentModel.add(currentChain);
startOfNewChain = true;
}
if (!asymId.equals(currentChain.getId())) {
startOfNewChain = true;
currentChain.addGroup(currentGroup);
Optional<Chain> testChain = currentModel.stream()
.filter(chain -> chain.getId().equals(asymId))
.findFirst();
if (testChain.isPresent()) {
currentChain = testChain.get();
} else {
currentChain = new ChainImpl();
currentChain.setName(authId);
currentChain.setId(asymId);
}
if (!currentModel.contains(currentChain)) {
currentModel.add(currentChain);
}
}
int authSeqIdInt = authSeqId.isDefined()? authSeqId.get(atomIndex) : (int)seqId;
ResidueNumber residueNumber = new ResidueNumber(authId, authSeqIdInt, insCode);
String recordName = groupPDB.get(atomIndex);
String compId = labelCompId.get(atomIndex);
if (currentGroup == null) {
currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
currentGroup.setResidueNumber(residueNumber);
currentGroup.setPDBName(compId);
currentGroup.setHetAtomInFile(isHetAtmInFile);
}
Group altGroup = null;
String altLocation = labelAltId.isDefined()? labelAltId.get(atomIndex) : null;
if (startOfNewChain) {
currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
currentGroup.setResidueNumber(residueNumber);
currentGroup.setPDBName(compId);
currentGroup.setHetAtomInFile(isHetAtmInFile);
} else {
if (!residueNumber.equals(currentGroup.getResidueNumber())) {
currentChain.addGroup(currentGroup);
currentGroup.trimToSize();
currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
currentGroup.setPDBName(compId);
currentGroup.setResidueNumber(residueNumber);
currentGroup.setHetAtomInFile(isHetAtmInFile);
} else {
if (altLocation != null && !altLocation.isEmpty() && !".".equals(altLocation)) {
altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId);
if (altGroup.getChain() == null) {
altGroup.setChain(currentChain);
}
}
}
}
if (params.isParseCAOnly()) {
if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) {
continue;
}
}
Atom atom = new AtomImpl();
atom.setPDBserial(id.get(atomIndex));
atom.setName(labelAtomId.get(atomIndex));
atom.setX(cartnX.get(atomIndex));
atom.setY(cartnY.get(atomIndex));
atom.setZ(cartnZ.get(atomIndex));
atom.setOccupancy((float) (occupancy.isDefined()? occupancy.get(atomIndex) : 1.0));
atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex));
if (altLocation == null || altLocation.isEmpty() || ".".equals(altLocation)) {
atom.setAltLoc(' ');
} else {
atom.setAltLoc(altLocation.charAt(0));
}
String ts = typeSymbol.get(atomIndex);
try {
Element element = Element.valueOfIgnoreCase(ts);
atom.setElement(element);
} catch (IllegalArgumentException e) {
logger.info("Element {} was not recognised as a BioJava-known element, the element will be " +
"represented as the generic element {}", ts, Element.R.name());
atom.setElement(Element.R);
}
if (altGroup != null) {
altGroup.addAtom(atom);
} else {
currentGroup.addAtom(atom);
}
String atomName = atom.getName();
if (!currentGroup.hasAtom(atomName)) {
if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) {
if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) {
currentGroup.addAtom(atom);
}
}
}
}
}
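/**
 * Returns the group that collects atoms for the given alternate location indicator: the current group or
 * one of its existing altLoc groups if that altLoc is already known, otherwise a freshly created altLoc group.
 */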
private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode,
long seqId) {
List<Atom> atoms = currentGroup.getAtoms();
if (atoms.size() > 0) {
if (atoms.get(0).getAltLoc().equals(altLoc)) {
return currentGroup;
}
}
List<Group> altLocs = currentGroup.getAltLocs();
for (Group altLocGroup : altLocs) {
atoms = altLocGroup.getAtoms();
if (atoms.size() > 0) {
for (Atom a1 : atoms) {
if (a1.getAltLoc().equals(altLoc)) {
return altLocGroup;
}
}
}
}
if (threeLetterCode.equals(currentGroup.getPDBName())) {
if (currentGroup.getAtoms().isEmpty()) {
return currentGroup;
}
Group altLocGroup = (Group) currentGroup.clone();
altLocGroup.setAtoms(new ArrayList<>());
altLocGroup.getAltLocs().clear();
currentGroup.addAltLoc(altLocGroup);
return altLocGroup;
}
Group altLocGroup = createGroup(recordName, oneLetterCode, threeLetterCode, seqId);
altLocGroup.setPDBName(threeLetterCode);
altLocGroup.setResidueNumber(currentGroup.getResidueNumber());
currentGroup.addAltLoc(altLocGroup);
return altLocGroup;
}
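/**
 * Creates a new group for the given three-letter code, preferring the chem-comp dictionary definition and
 * falling back to simple heuristics (nucleotide, amino acid or het group) when no definition is available.
 */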
private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) {
Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode);
if (group != null && !group.getChemComp().isEmpty()) {
if (group instanceof AminoAcidImpl) {
AminoAcidImpl aminoAcid = (AminoAcidImpl) group;
aminoAcid.setId(seqId);
} else if (group instanceof NucleotideImpl) {
NucleotideImpl nucleotide = (NucleotideImpl) group;
nucleotide.setId(seqId);
} else if (group instanceof HetatomImpl) {
HetatomImpl hetatom = (HetatomImpl) group;
hetatom.setId(seqId);
}
return group;
}
if ("ATOM".equals(record)) {
if (StructureTools.isNucleotide(threeLetterCode)) {
NucleotideImpl nucleotide = new NucleotideImpl();
group = nucleotide;
nucleotide.setId(seqId);
} else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) {
HetatomImpl hetatom = new HetatomImpl();
group = hetatom;
hetatom.setId(seqId);
} else {
AminoAcidImpl aminoAcid = new AminoAcidImpl();
group = aminoAcid;
aminoAcid.setAminoType(oneLetterCode);
aminoAcid.setId(seqId);
}
} else {
if (StructureTools.isNucleotide(threeLetterCode)) {
NucleotideImpl nucleotide = new NucleotideImpl();
group = nucleotide;
nucleotide.setId(seqId);
} else if (oneLetterCode != null) {
AminoAcidImpl aminoAcid = new AminoAcidImpl();
group = aminoAcid;
aminoAcid.setAminoType(oneLetterCode);
aminoAcid.setId(seqId);
} else {
HetatomImpl hetatom = new HetatomImpl();
hetatom.setId(seqId);
group = hetatom;
}
}
return group;
}
@Override
public void consumeAtomSites(AtomSites atomSites) {
// no atom sites present
if (!atomSites.isDefined() || atomSites.getRowCount() == 0) {
return;
}
try {
parsedScaleMatrix = new Matrix4d(
atomSites.getFractTransfMatrix11().get(0),
atomSites.getFractTransfMatrix12().get(0),
atomSites.getFractTransfMatrix13().get(0),
atomSites.getFractTransfVector1().get(0),
atomSites.getFractTransfMatrix21().get(0),
atomSites.getFractTransfMatrix22().get(0),
atomSites.getFractTransfMatrix23().get(0),
atomSites.getFractTransfVector2().get(0),
atomSites.getFractTransfMatrix31().get(0),
atomSites.getFractTransfMatrix32().get(0),
atomSites.getFractTransfMatrix33().get(0),
atomSites.getFractTransfVector3().get(0),
0,
0,
0,
1
);
} catch (NumberFormatException e) {
logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " +
"be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}",
e.getMessage());
structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
}
}
@Override
public void consumeAuditAuthor(AuditAuthor auditAuthor) {
for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) {
String name = auditAuthor.getName().get(rowIndex);
StringBuilder last = new StringBuilder();
StringBuilder initials = new StringBuilder();
boolean afterComma = false;
for (char c : name.toCharArray()) {
if (c == ' ') {
continue;
}
if (c == ',') {
afterComma = true;
continue;
}
if (afterComma) {
initials.append(c);
} else {
last.append(c);
}
}
StringBuilder newaa = new StringBuilder();
newaa.append(initials);
newaa.append(last);
String auth = pdbHeader.getAuthors();
if (auth == null) {
pdbHeader.setAuthors(newaa.toString());
} else {
auth += "," + newaa.toString();
pdbHeader.setAuthors(auth);
}
}
}
@Override
public void consumeCell(Cell cell) {
if (!cell.isDefined() || cell.getRowCount() == 0) {
return;
}
try {
float a = (float) cell.getLengthA().get(0);
float b = (float) cell.getLengthB().get(0);
float c = (float) cell.getLengthC().get(0);
float alpha = (float) cell.getAngleAlpha().get(0);
float beta = (float) cell.getAngleBeta().get(0);
float gamma = (float) cell.getAngleGamma().get(0);
CrystalCell crystalCell = new CrystalCell();
crystalCell.setA(a);
crystalCell.setB(b);
crystalCell.setC(c);
crystalCell.setAlpha(alpha);
crystalCell.setBeta(beta);
crystalCell.setGamma(gamma);
if (!crystalCell.isCellReasonable()) {
// If the entry describes a structure determined by a technique other than X-ray crystallography,
// cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
// if so we don't add it and the CrystalCell will be null
logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE);
return;
}
structure.getPDBHeader()
.getCrystallographicInfo()
.setCrystalCell(crystalCell);
} catch (NumberFormatException e){
structure.getPDBHeader()
.getCrystallographicInfo()
.setCrystalCell(null);
logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage());
}
}
@Override
public void consumeChemComp(ChemComp chemComp) {
// TODO not impled in ref
}
@Override
public void consumeChemCompBond(ChemCompBond chemCompBond) {
// TODO not impled in ref
}
@Override
public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) {
for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) {
int id = databasePDBremark.getId().get(rowIndex);
if (id == 2) {
String line = databasePDBremark.getText().get(rowIndex);
int i = line.indexOf("ANGSTROM");
if (i > 5) {
// line contains ANGSTROM info...
String resolution = line.substring(i - 5, i).trim();
// convert string to float
try {
float res = Float.parseFloat(resolution);
pdbHeader.setResolution(res);
} catch (NumberFormatException e) {
logger.info("could not parse resolution from line and ignoring it {}", line);
return;
}
}
}
}
}
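/**
 * Converts a LocalDate to a legacy java.util.Date at the start of that day in the system default time zone.
 */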
private Date convert(LocalDate localDate) {
return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant());
}
@Override
public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) {
logger.debug("got a database revision:{}", databasePDBrev);
Date modDate = null;
for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) {
if (databasePDBrev.getNum().get(rowIndex) == 1) {
String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex);
pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT)));
String date = databasePDBrev.getDate().get(rowIndex);
final Date relDate = convert(LocalDate.parse(date, DATE_FORMAT));
pdbHeader.setRelDate(relDate);
modDate = relDate;
} else {
String dbrev = databasePDBrev.getDate().get(rowIndex);
modDate = convert(LocalDate.parse(dbrev, DATE_FORMAT));
}
pdbHeader.setModDate(modDate);
}
}
@Override
public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) {
List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords();
if (revRecords == null) {
revRecords = new ArrayList<>();
pdbHeader.setRevisionRecords(revRecords);
}
for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) {
revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i));
}
}
@Override
public void consumeEm3dReconstruction(Em3dReconstruction em3dReconstruction) {
this.em3dReconstruction = em3dReconstruction;
for (int rowIndex = 0; rowIndex < em3dReconstruction.getRowCount(); rowIndex++) { //can it have more than 1 value?
final FloatColumn resolution = em3dReconstruction.getResolution();
if (ValueKind.PRESENT.equals(resolution.getValueKind(rowIndex)))
pdbHeader.setResolution((float) resolution.get(rowIndex));
}
//TODO other fields (maybe RFree)?
}
@Override
public void consumeEntity(Entity entity) {
this.entity = entity;
}
@Override
public void consumeEntityPoly(EntityPoly entityPoly) {
this.entityPoly = entityPoly;
}
@Override
public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) {
this.entitySrcGen = entitySrcGen;
}
@Override
public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) {
this.entitySrcNat = entitySrcNat;
}
@Override
public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) {
this.entitySrcSyn = entitySrcSyn;
}
@Override
public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) {
for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) {
Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex));
// first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
// TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex));
//int seqId = Integer.parseInt(entityPolySeq.getNum());
if (g != null && !g.getChemComp().isEmpty()) {
if (g instanceof AminoAcidImpl) {
AminoAcidImpl aa = (AminoAcidImpl) g;
aa.setRecordType(AminoAcid.SEQRESRECORD);
}
} else {
if (entityPolySeq.getMonId().get(rowIndex).length() == 3 &&
StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) {
AminoAcidImpl a = new AminoAcidImpl();
a.setRecordType(AminoAcid.SEQRESRECORD);
Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex));
a.setAminoType(code1);
g = a;
} else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) {
// the group is actually a nucleotide group...
g = new NucleotideImpl();
} else {
logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex));
g = new HetatomImpl();
}
}
// at this stage we don't know about author residue numbers (insertion codes)
// we now abuse the ResidueNumber field to store the internal residue numbers (label_seq_id, strictly
// sequential, following the seqres sequence 1 to n)
// later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex)));
g.setPDBName(entityPolySeq.getMonId().get(rowIndex));
entityChain.addGroup(g);
}
}
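/**
 * Returns the temporary SEQRES chain for the given entity id, creating and registering it if it does not exist yet.
 */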
private Chain getEntityChain(String entityId) {
for (Chain chain : entityChains) {
if (chain.getId().equals(entityId)) {
return chain;
}
}
// does not exist yet, so create...
Chain chain = new ChainImpl();
chain.setId(entityId);
entityChains.add(chain);
return chain;
}
@Override
public void consumeExptl(Exptl exptl) {
for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) {
pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex));
}
}
@Override
public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) {
Date date = null;
for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) {
// first entry in revision history is the release date
if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) {
String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
date = convert(LocalDate.parse(release, DATE_FORMAT));
pdbHeader.setRelDate(date);
} else {
// all other dates are revision dates;
// since this method may be called multiple times,
// the last revision date will "stick"
String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
date = convert(LocalDate.parse(revision, DATE_FORMAT));
}
pdbHeader.setModDate(date);
}
}
@Override
public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) {
// TODO not impled in ref
}
@Override
public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) {
for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) {
// the deposition date field is only available in mmCIF 5.0
StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate();
if (recvdInitialDepositionDate.isDefined()) {
String deposition = recvdInitialDepositionDate.get(rowIndex);
pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT)));
}
}
}
@Override
public void consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) {
// TODO not considered in ref
}
@Override
public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) {
// TODO not considered in ref
}
@Override
public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) {
// TODO not considered in ref
}
@Override
public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) {
// TODO not impled in ref
}
@Override
public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) {
// TODO not considered in ref
}
@Override
public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) {
// TODO not considered in ref
}
@Override
public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) {
// TODO not considered in ref
}
@Override
public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) {
this.structAssembly = pdbxStructAssembly;
}
@Override
public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) {
this.structAssemblyGen = pdbxStructAssemblyGen;
}
@Override
public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) {
// TODO not considered in ref
}
@Override
public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) {
this.structOpers = pdbxStructOperList;
}
@Override
public void consumeRefine(Refine refine) {
for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) {
// RESOLUTION
ValueKind valueKind = refine.getLsDResHigh().getValueKind(rowIndex);
if (! ValueKind.PRESENT.equals(valueKind)) {
continue;
}
// in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
// there are 2 resolution values, one for each method
// we take the last one found so that behaviour is like in PDB file parsing
double lsDResHigh = refine.getLsDResHigh().get(rowIndex);
// TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0
if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) {
logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}",
lsDResHigh, String.format("%4.2f",pdbHeader.getResolution()));
}
pdbHeader.setResolution((float) lsDResHigh);
FloatColumn lsRFactorRFree = refine.getLsRFactorRFree();
// RFREE
if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) {
logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}",
lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree()));
}
if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) {
pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex));
} else {
// some entries like 2ifo haven't got this field at all
logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
}
// RWORK
FloatColumn lsRFactorRWork = refine.getLsRFactorRWork();
if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) {
logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork()));
}
if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) {
pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex));
} else {
logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
}
}
}
@Override
public void consumeStruct(Struct struct) {
if (struct.isDefined() && struct.getTitle().isDefined()) {
pdbHeader.setTitle(struct.getTitle().get(0));
}
if (struct.isDefined() && struct.getEntryId().isDefined()) {
PdbId pdbId;
String pdbCode = struct.getEntryId().get(0);
if(pdbCode.isBlank()){
pdbId = null;
} else {
try {
pdbId = new PdbId(pdbCode);
} catch (IllegalArgumentException e) {
logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode);
pdbId = null;
}
}
pdbHeader.setPdbId(pdbId);
structure.setPdbId(pdbId);
}
}
@Override
public void consumeStructAsym(StructAsym structAsym) {
this.structAsym = structAsym;
}
@Override
public void consumeStructConf(StructConf structConf) {
// TODO not considered in ref
}
@Override
public void consumeStructConn(StructConn structConn) {
this.structConn = structConn;
}
@Override
public void consumeStructConnType(StructConnType structConnType) {
// TODO not considered in ref
}
@Override
public void consumeStructKeywords(StructKeywords structKeywords) {
ArrayList<String> keywordsList = new ArrayList<>();
StrColumn text = structKeywords.getText();
if (text.isDefined()) {
String keywords = text.get(0);
String[] strings = keywords.split(" *, *");
for (String string : strings) {
keywordsList.add(string.trim());
}
}
structure.getPDBHeader().setKeywords(keywordsList);
StrColumn pdbxKeywords = structKeywords.getPdbxKeywords();
if (pdbxKeywords.isDefined()) {
String keywords = pdbxKeywords.get(0);
pdbHeader.setClassification(keywords);
//This field should be left empty. TODO The next line should be removed later
pdbHeader.setDescription(keywords);
}
}
@Override
public void consumeStructNcsOper(StructNcsOper structNcsOper) {
this.structNcsOper = structNcsOper;
}
@Override
public void consumeStructRef(StructRef structRef) {
this.structRef = structRef;
}
@Override
public void consumeStructRefSeq(StructRefSeq structRefSeq) {
for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) {
String refId = structRefSeq.getRefId().get(rowIndex);
DBRef dbRef = new DBRef();
dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().isDefined()? structRefSeq.getPdbxPDBIdCode().get(rowIndex):null);
dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().isDefined()? structRefSeq.getPdbxDbAccession().get(rowIndex):null);
dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().isDefined()? structRefSeq.getPdbxDbAccession().get(rowIndex):null);
dbRef.setChainName(structRefSeq.getPdbxStrandId().isDefined()? structRefSeq.getPdbxStrandId().get(rowIndex):null);
OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount())
.filter(i -> structRef.getId().get(i).equals(refId))
.findFirst();
if (structRefRowIndex.isPresent()) {
dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt()));
dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt()));
} else {
logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex);
}
int seqBegin;
int seqEnd;
char beginInsCode = ' ';
char endInsCode = ' ';
if (structRefSeq.getPdbxAuthSeqAlignBeg().isDefined() && structRefSeq.getPdbxAuthSeqAlignEnd().isDefined()) {
try {
seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex));
seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex));
} catch (NumberFormatException e) {
// this happens in a few entries, annotation error? e.g. 6eoj
logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " +
"alignment info for accession {}. Error: {}", dbRef.getDbAccession(), e.getMessage());
return;
}
String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex);
if (pdbxSeqAlignBegInsCode.length() > 0) {
beginInsCode = pdbxSeqAlignBegInsCode.charAt(0);
}
String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex);
if (pdbxSeqAlignEndInsCode.length() > 0) {
endInsCode = pdbxSeqAlignEndInsCode.charAt(0);
}
if (beginInsCode == '?') {
beginInsCode = ' ';
}
if (endInsCode == '?') {
endInsCode = ' ';
}
} else {
seqBegin = structRefSeq.getSeqAlignBeg().get(rowIndex);
seqEnd = structRefSeq.getSeqAlignEnd().get(rowIndex);
}
dbRef.setSeqBegin(seqBegin);
dbRef.setInsertBegin(beginInsCode);
dbRef.setSeqEnd(seqEnd);
dbRef.setInsertEnd(endInsCode);
int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex);
int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex);
char dbBeginInsCode = ' ';
StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode();
if (pdbxDbAlignBegInsCodeCol.isDefined()) {
String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex);
if (pdbxDbAlignBegInsCode.length() > 0) {
dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0);
}
}
char dbEndInsCode = ' ';
StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode();
if (pdbxDbAlignEndInsCodeCol.isDefined()) {
String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex);
if (pdbxDbAlignEndInsCode.length() > 0) {
dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0);
}
}
if (dbBeginInsCode == '?') {
dbBeginInsCode = ' ';
}
if (dbEndInsCode == '?') {
dbEndInsCode = ' ';
}
dbRef.setDbSeqBegin(dbSeqBegin);
dbRef.setIdbnsBegin(dbBeginInsCode);
dbRef.setDbSeqEnd(dbSeqEnd);
dbRef.setIdbnsEnd(dbEndInsCode);
List<DBRef> dbrefs = structure.getDBRefs();
if (dbrefs == null) {
dbrefs = new ArrayList<>();
}
dbrefs.add(dbRef);
logger.debug(dbRef.toPDB());
structure.setDBRefs(dbrefs);
}
}
@Override
public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) {
this.structRefSeqDif = structRefSeqDif;
}
@Override
public void consumeStructSheetRange(StructSheetRange structSheetRange) {
// TODO not considered in ref
}
@Override
public void consumeStructSite(StructSite structSite) {
if (params.isHeaderOnly()) {
return;
}
List<Site> sites = structure.getSites();
if (sites == null) {
sites = new ArrayList<>();
}
for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) {
Site site = null;
for (Site asite : sites) {
if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) {
site = asite; // prevent duplicate siteIds
}
}
boolean addSite = false;
if (site == null) {
site = new Site();
addSite = true;
}
site.setSiteID(structSite.getId().get(rowIndex));
site.setDescription(structSite.getDetails().get(rowIndex));
site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex));
if (addSite) {
sites.add(site);
}
}
structure.setSites(sites);
}
@Override
public void consumeStructSiteGen(StructSiteGen structSiteGen) {
this.structSiteGen = structSiteGen;
}
@Override
public void consumeSymmetry(Symmetry symmetry) {
for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) {
String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex);
SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString);
if (spaceGroup == null) {
logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString);
structure.getPDBHeader()
.getCrystallographicInfo()
.setNonStandardSg(true);
} else {
structure.getPDBHeader()
.getCrystallographicInfo()
.setSpaceGroup(spaceGroup);
structure.getPDBHeader()
.getCrystallographicInfo()
.setNonStandardSg(false);
}
}
}
@Override
public void finish() {
if (currentChain != null) {
currentChain.addGroup(currentGroup);
Optional<Chain> testChain = currentModel.stream()
.filter(chain -> chain.getId().equals(currentChain.getId()))
.findFirst();
if (!testChain.isPresent()) {
currentModel.add(currentChain);
}
} else if (!params.isHeaderOnly()) {
logger.warn("No chains were instantiated after parsing the whole CIF document. This could be due to the atom_site category being absent");
}
allModels.add(currentModel);
initMaps();
for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
String id = structAsym.getId().get(rowIndex);
String entityId = structAsym.getEntityId().get(rowIndex);
logger.debug("Entity {} matches asym_id: {}", entityId, id);
Chain chain = getEntityChain(entityId);
Chain seqRes = (Chain) chain.clone();
// to solve issue #160 (e.g. 3u7t)
seqRes = removeSeqResHeterogeneity(seqRes);
seqRes.setId(id);
seqRes.setName(asymId2authorId.getOrDefault(id, id));
EntityType type = EntityType.entityTypeFromString(getEntityType(entityId));
if (type == null || type == EntityType.POLYMER) {
seqResChains.add(seqRes);
}
logger.debug(" seqres: {} {}<", id, seqRes);
addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId));
}
if (!structAsym.isDefined() || structAsym.getRowCount() == 0) {
logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
}
// entities
// In addEntities above we created the entities if they were present in the file
// Now we need to make sure that they are linked to chains and also that if they are not present in the file we
// need to add them now
linkEntities();
// now that we know the entities, we can add all chains to structure so that they are stored
// properly as polymer/nonpolymer/water chains inside structure
allModels.forEach(structure::addModel);
// Only align if requested (default) and not when headerOnly mode with no Atoms.
// Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
if (params.isAlignSeqRes() && !params.isHeaderOnly()){
logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
alignSeqRes();
} else {
logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
}
// Now make sure all altlocgroups have all the atoms in all the groups
StructureTools.cleanUpAltLocs(structure);
// NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
if (!params.isHeaderOnly()) {
if (params.shouldCreateAtomBonds()) {
addBonds();
}
if (params.shouldCreateAtomCharges()) {
addCharges();
}
}
if (!params.isHeaderOnly()) {
addSites();
}
// set the oligomeric state info in the header...
if (params.isParseBioAssembly()) {
// the more detailed mapping of chains to rotation operations happens in StructureIO...
Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>();
for (int i = 0; i < structAssembly.getRowCount(); i++) {
String assemblyId = structAssembly.getId().get(i);
List<Integer> structAssemblyGenIndices = new ArrayList<>();
for (int j = 0; j < structAssemblyGen.getRowCount(); j++) {
if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) {
structAssemblyGenIndices.add(j);
}
}
BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
// these are the transformations that need to be applied to our model
List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly,
i, structAssemblyGen, structOpers);
int bioAssemblyId = -1;
try {
bioAssemblyId = Integer.parseInt(assemblyId);
} catch (NumberFormatException e) {
logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId);
}
// if bioassembly id is not numerical we throw it away
// this happens usually for viral capsid entries, like 1ei7
// see issue #230 in github
if (bioAssemblyId != -1) {
int mmSize = 0;
// note that the transforms contain asym ids of both polymers and non-polymers
// For the mmsize, we are only interested in the polymers
for (BiologicalAssemblyTransformation transf : transformations) {
Chain c = structure.getChain(transf.getChainId());
if (c == null) {
logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
continue;
}
if (c.getEntityType() == EntityType.POLYMER &&
// for entries like 4kro, sugars are annotated as polymers but we
// don't want them in the macromolecularSize count
!c.getEntityInfo().getDescription().contains("SUGAR")) {
mmSize++;
}
}
BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
bioAssembly.setId(bioAssemblyId);
bioAssembly.setMacromolecularSize(mmSize);
bioAssembly.setTransforms(transformations);
bioAssemblies.put(bioAssemblyId, bioAssembly);
}
}
structure.getPDBHeader()
.setBioAssemblies(bioAssemblies);
}
setStructNcsOps();
setCrystallographicInfoMetadata();
Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>();
for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) {
SeqMisMatch seqMisMatch = new SeqMisMatchImpl();
seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex));
String insCode = null;
if (structRefSeqDif.getPdbxPdbInsCode().isDefined()) {
insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex);
if ("?".equals(insCode)) {
insCode = null;
}
}
seqMisMatch.setInsCode(insCode);
seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex));
seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex));
seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().isDefined()? structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex):null);
seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().isDefined()? structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex):null);
seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex));
if (!structRefSeqDif.getPdbxPdbStrandId().isDefined()) continue;
String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex);
List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>());
seqMisMatches.add(seqMisMatch);
}
for (String chainId : misMatchMap.keySet()){
Chain chain = structure.getPolyChainByPDB(chainId);
if (chain == null) {
logger.warn("Could not set mismatches for chain with author id {}", chainId);
continue;
}
chain.setSeqMisMatches(misMatchMap.get(chainId));
}
}
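/**
 * Looks up the _entity.type value for the given entity id.
 */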
private String getEntityType(String entityId) {
return IntStream.range(0, entity.getRowCount())
.filter(i -> entity.getId().get(i).equals(entityId))
.mapToObj(i -> entity.getType().get(i))
.findFirst()
.orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
}
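/**
 * Looks up the _entity.pdbx_description value for the given entity id (empty string if the column is absent).
 */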
private String getEntityDescription(String entityId) {
return IntStream.range(0, entity.getRowCount())
.filter(i -> entity.getId().get(i).equals(entityId))
.mapToObj(i -> entity.getPdbxDescription().isDefined()? entity.getPdbxDescription().get(i):"")
.findFirst()
.orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
}
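/**
 * Registers an EntityInfo for the given entity id, but only if the entity is present in _entity and has not
 * been added to the structure yet.
 */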
private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) {
int eId = 0;
try {
eId = Integer.parseInt(entityId);
} catch (NumberFormatException e) {
logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId);
}
int entityRowIndex = IntStream.range(0, entity.getRowCount())
.filter(i -> entity.getId().get(i).equals(entityId))
.findFirst()
.orElse(-1);
EntityInfo entityInfo = structure.getEntityById(eId);
if (entityInfo == null) {
entityInfo = new EntityInfo();
entityInfo.setMolId(eId);
// we only add the compound if a polymeric one (to match what the PDB parser does)
if (entityRowIndex != -1) {
entityInfo.setDescription(pdbxDescription);
EntityType eType = EntityType.entityTypeFromString(type);
if (eType != null) {
entityInfo.setType(eType);
} else {
logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId);
}
addAncilliaryEntityData(asymRowIndex, entityInfo);
structure.addEntityInfo(entityInfo);
logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId,
entityInfo.getDescription());
}
}
}
private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) {
// Loop through each of the entity types and add the corresponding data
// We're assuming if data is duplicated between sources it is consistent
// This is a potentially huge assumption...
for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) {
if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
continue;
}
addInformationFromEntitySrcGen(rowIndex, entityInfo);
}
for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) {
if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
continue;
}
addInformationFromEntitySrcNat(rowIndex, entityInfo);
}
for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) {
if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
continue;
}
addInformationFromEntitySrcSyn(rowIndex, entityInfo);
}
}
private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) {
entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcSyn.getOrganismCommonName(), rowIndex, null));
entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcSyn.getOrganismScientific(), rowIndex, null));
entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcSyn.getNcbiTaxonomyId(), rowIndex, null));
}
private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) {
entityInfo.setAtcc(getCifFieldNullAware(entitySrcNat.getPdbxAtcc(), rowIndex, null));
entityInfo.setCell(getCifFieldNullAware(entitySrcNat.getPdbxCell(), rowIndex, null));
entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcNat.getCommonName(), rowIndex, null));
entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcNat.getPdbxOrganismScientific(), rowIndex, null));
entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcNat.getPdbxNcbiTaxonomyId(), rowIndex, null));
}
private void addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) {
entityInfo.setAtcc(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcAtcc(), rowIndex, null));
entityInfo.setCell(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcCell(), rowIndex, null));
entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcGen.getGeneSrcCommonName(), rowIndex, null));
entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcScientificName(), rowIndex, null));
entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId(), rowIndex, null));
entityInfo.setExpressionSystemTaxId(getCifFieldNullAware(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId(), rowIndex, null));
entityInfo.setExpressionSystem(getCifFieldNullAware(entitySrcGen.getPdbxHostOrgScientificName(), rowIndex, null));
}
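/**
 * Returns the column value at the given row, or the supplied default if the column is not defined.
 */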
private String getCifFieldNullAware(StrColumn column, int rowIndex, String defaultValue) {
if (column.isDefined())
return column.get(rowIndex);
else
return defaultValue;
}
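/**
 * Converts the 'generate' operators of _struct_ncs_oper into Matrix4d objects and stores them in the
 * crystallographic info of the structure.
 */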
private void setStructNcsOps() {
List<Matrix4d> ncsOperators = new ArrayList<>();
for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) {
if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) {
continue;
}
try {
Matrix4d operator = new Matrix4d();
operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex));
operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex));
operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex));
operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex));
operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex));
operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex));
operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex));
operator.setElement(1, 3, structNcsOper.getVector2().get(rowIndex));
operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex));
operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex));
operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex));
operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex));
operator.setElement(3, 0, 0);
operator.setElement(3, 1, 0);
operator.setElement(3, 2, 0);
operator.setElement(3, 3, 1);
ncsOperators.add(operator);
} catch (NumberFormatException e) {
logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1);
}
}
if (ncsOperators.size() > 0) {
structure.getCrystallographicInfo()
.setNcsOperators(ncsOperators.toArray(new Matrix4d[0]));
}
}
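/**
 * Flags a non-standard coordinate frame convention whenever the parsed scale matrix does not match the one
 * derived from the crystal cell.
 */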
private void setCrystallographicInfoMetadata() {
if (parsedScaleMatrix != null) {
PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
boolean nonStd = false;
if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
nonStd = true;
}
crystalInfo.setNonStandardCoordFrameConvention(nonStd);
}
}
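/**
 * Populates the Site objects declared in _struct_site_gen with the matching groups of the structure.
 */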
private void addSites() {
List<Site> sites = structure.getSites();
if (sites == null) sites = new ArrayList<>();
for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) {
// For each StructSiteGen, find the residues involved, if they exist then
String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site.
if (site_id == null) {
site_id = "";
}
String comp_id = structSiteGen.getLabelCompId().get(rowIndex); // PDBName
// Assumption: the author chain ID and residue number for the site is consistent with the original
// author chain id and residue numbers.
String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name
String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id
String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num
String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex);
if ("?".equals(insCode)) {
insCode = null;
}
// Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname.
Group g = null;
try {
Chain chain = structure.getChain(asymId);
if (null != chain) {
try {
Character insChar = null;
if (null != insCode && insCode.length() > 0) {
insChar = insCode.charAt(0);
}
g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
} catch (NumberFormatException e) {
logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id);
}
}
} catch (StructureException e) {
logger.warn("Problem finding residue in site entry {} - {}",
structSiteGen.getSiteId().get(rowIndex), e.getMessage());
}
if (g != null) {
// 2. find the site_id, if not existing, create anew.
Site site = null;
for (Site asite : sites) {
if (site_id.equals(asite.getSiteID())) {
site = asite;
}
}
boolean addSite = false;
// 3. add this residue to the site.
if (site == null) {
addSite = true;
site = new Site();
site.setSiteID(site_id);
}
List<Group> groups = site.getGroups();
if (groups == null) {
groups = new ArrayList<>();
}
// Check the self-consistency of the residue reference from auth_seq_id and chain_id
if (!comp_id.equals(g.getPDBName())) {
logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id);
} else {
groups.add(g);
site.setGroups(groups);
}
if (addSite) {
sites.add(site);
}
}
}
structure.setSites(sites);
}
private void addCharges() {
ChargeAdder.addCharges(structure);
}
/**
* The method will return a new reference to a Chain with any consecutive groups
* having same residue numbers removed.
* This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
*/
private static Chain removeSeqResHeterogeneity(Chain c) {
Chain trimmedChain = new ChainImpl();
ResidueNumber lastResNum = null;
for (Group g : c.getAtomGroups()) {
// note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
ResidueNumber currentResNum = new ResidueNumber(
g.getResidueNumber().getChainName(),
g.getResidueNumber().getSeqNum(),
g.getResidueNumber().getInsCode());
if (lastResNum == null || !lastResNum.equals(currentResNum)) {
trimmedChain.addGroup(g);
} else {
logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': {}", g);
}
lastResNum = currentResNum;
}
return trimmedChain;
}
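/**
 * Creates bonds between atoms via {@link BondMaker}, including the connections listed in _struct_conn.
 */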
private void addBonds() {
BondMaker maker = new BondMaker(structure, params);
maker.makeBonds();
maker.formBondsFromStructConn(structConn);
}
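/**
 * Aligns the SEQRES chains to the ATOM chains of every model and attaches the resulting seqres groups to
 * the matching atom chains.
 */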
private void alignSeqRes() {
logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
// fix SEQRES residue numbering for all models
for (int model = 0; model < structure.nrModels(); model++) {
List<Chain> atomList = structure.getPolyChains(model);
if (seqResChains.isEmpty()) {
// in files without _entity, seqResChains object is empty: we replace by atomChains resulting below in a trivial alignment and a copy of atom groups to seqres groups
seqResChains = atomList;
}
for (Chain seqResChain : seqResChains){
// this extracts the matching atom chain from atomList
Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true);
if (atomChain == null) {
// most likely there's no observed residues at all for the seqres chain: can't map
// e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " +
"no observed residues in the chain.", seqResChain.getId());
continue;
}
//map the atoms to the seqres...
// we need to first clone the seqres so that they stay independent for different models
List<Group> seqResGroups = new ArrayList<>();
for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) {
seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
}
for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
Group seqresG = seqResGroups.get(seqResPos);
boolean found = false;
for (Group atomG : atomChain.getAtomGroups()) {
int internalNr = getInternalNr(atomG);
if (seqresG.getResidueNumber().getSeqNum() == internalNr) {
seqResGroups.set(seqResPos, atomG);
found = true;
break;
}
}
if (!found)
// so far the residue number has tracked internal numbering.
// however there are no atom records, as such this can't be a PDB residue number...
seqresG.setResidueNumber(null);
}
atomChain.setSeqResGroups(seqResGroups);
}
}
}
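/**
 * Returns the internal id of a group, i.e. the label_seq_id assigned when the group was created.
 */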
private int getInternalNr(Group atomG) {
if (atomG.getType().equals(GroupType.AMINOACID)) {
AminoAcidImpl aa = (AminoAcidImpl) atomG;
return (int) aa.getId();
} else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) {
NucleotideImpl nu = (NucleotideImpl) atomG;
return (int) nu.getId();
} else {
HetatomImpl he = (HetatomImpl) atomG;
return (int) he.getId();
}
}
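/**
 * Links every chain of every model to its EntityInfo, creating missing non-polymer and water entities and
 * falling back to heuristic entity detection when the file contains no entity information at all.
 */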
private void linkEntities() {
for (List<Chain> allModel : allModels) {
for (Chain chain : allModel) {
//logger.info("linking entities for " + chain.getId() + " " + chain.getName());
String entityId = asymId2entityId.get(chain.getId());
if (entityId == null) {
// this can happen for instance if the cif file didn't have _struct_asym category at all
// and thus we have no asymId2entityId mapping at all
logger.info("No entity id could be found for chain {}", chain.getId());
continue;
}
int eId = Integer.parseInt(entityId);
// Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
// TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
// asyms (chains). Either create a unique StructureImpl or modify the existing one for a better representation of the
// mmCIF internal data structures that is still compatible with the Structure interface.
// Some examples of PDB entries with this kind of problem:
// - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
// - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
// - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
EntityInfo entityInfo = structure.getEntityById(eId);
if (entityInfo == null) {
// Supports the case where the only chain members were from non-polymeric entity that is missing.
// Solved by creating a new Compound(entity) to which this chain will belong.
logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
eId, chain.getId());
entityInfo = new EntityInfo();
entityInfo.setMolId(eId);
entityInfo.addChain(chain);
if (chain.isWaterOnly()) {
entityInfo.setType(EntityType.WATER);
} else {
entityInfo.setType(EntityType.NONPOLYMER);
}
chain.setEntityInfo(entityInfo);
structure.addEntityInfo(entityInfo);
} else {
logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
chain.getId(), chain.getName(), eId);
entityInfo.addChain(chain);
chain.setEntityInfo(entityInfo);
}
}
}
// if no entity information was present in file we then go and find the entities heuristically with EntityFinder
List<EntityInfo> entityInfos = structure.getEntityInfos();
if (entityInfos == null || entityInfos.isEmpty()) {
List<List<Chain>> polyModels = new ArrayList<>();
List<List<Chain>> nonPolyModels = new ArrayList<>();
List<List<Chain>> waterModels = new ArrayList<>();
for (List<Chain> model : allModels) {
List<Chain> polyChains = new ArrayList<>();
List<Chain> nonPolyChains = new ArrayList<>();
List<Chain> waterChains = new ArrayList<>();
polyModels.add(polyChains);
nonPolyModels.add(nonPolyChains);
waterModels.add(waterChains);
for (Chain chain : model) {
// we only have entities for polymeric chains, all others are ignored for assigning entities
if (chain.isWaterOnly()) {
waterChains.add(chain);
} else if (chain.isPureNonPolymer()) {
nonPolyChains.add(chain);
} else {
polyChains.add(chain);
}
}
}
entityInfos = EntityFinder.findPolyEntities(polyModels);
EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);
structure.setEntityInfos(entityInfos);
}
// final sanity check: it can happen that from the annotated entities some are not linked to any chains
// e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
// we simply log it, this can sign some other problems if the entities are used down the line
for (EntityInfo e : entityInfos) {
if (e.getChains().isEmpty()) {
logger.info("Entity {} '{}' has no chains associated to it",
e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription());
}
}
}
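/**
 * Builds the asym_id to entity_id and asym_id to author chain id mappings from _struct_asym and _entity_poly.
 */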
private void initMaps() {
if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) {
logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
return;
}
Map<String, List<String>> entityId2asymId = new HashMap<>();
for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
String id = structAsym.getId().get(rowIndex);
String entityId = structAsym.getEntityId().get(rowIndex);
logger.debug("Entity {} matches asym_id: {}", entityId, id);
asymId2entityId.put(id, entityId);
if (entityId2asymId.containsKey(entityId)) {
List<String> asymIds = entityId2asymId.get(entityId);
asymIds.add(id);
} else {
List<String> asymIds = new ArrayList<>();
asymIds.add(id);
entityId2asymId.put(entityId, asymIds);
}
}
if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) {
logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " +
"for header only parsing");
return;
}
for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) {
if (!entityPoly.getPdbxStrandId().isDefined()) {
logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " +
"author ids for this entity.", entityPoly.getEntityId().get(rowIndex));
break;
}
String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(",");
List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex));
if (asymIds == null) {
logger.warn("No asym ids found for entity {} in _struct_asym. Can't provide a mapping from asym ids to author chain ids for this entity", entityPoly.getEntityId().get(rowIndex));
break;
}
if (chainNames.length != asymIds.size()) {
logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " +
"for entity {} have different lengths! Can't provide a mapping from asym ids to author chain " +
"ids", entityPoly.getEntityId().get(rowIndex));
break;
}
for (int i = 0; i < chainNames.length; i++) {
asymId2authorId.put(asymIds.get(i), chainNames[i]);
}
}
}
@Override
public Structure getContainer() {
return structure;
}
}