All Downloads are FREE. Search and download functionalities are using the official Maven repository.

pl.poznan.put.pdb.analysis.CifConverter Maven / Gradle / Ivy

package pl.poznan.put.pdb.analysis;

import org.apache.commons.collections4.BidiMap;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.bidimap.TreeBidiMap;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.poznan.put.pdb.ImmutablePdbAtomLine;
import pl.poznan.put.pdb.ImmutablePdbRemark465Line;
import pl.poznan.put.pdb.PdbAtomLine;
import pl.poznan.put.pdb.PdbModresLine;
import pl.poznan.put.pdb.PdbRemark2Line;
import pl.poznan.put.pdb.PdbRemark465Line;
import pl.poznan.put.structure.BasePair;
import pl.poznan.put.structure.QuantifiedBasePair;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * A converter from mmCIF to one or more PDB files. It takes care of formats' mismatches (e.g.
 * multi-character chain names in mmCIF vs single-character in PDB).
 */
public final class CifConverter {
  private static final Logger LOGGER = LoggerFactory.getLogger(CifConverter.class);

  private static final int MAX_RESIDUE_NUMBER = 9999;
  private static final int MAX_ATOM_SERIAL_NUMBER = 99999;

  // PRINTABLE_CHARS is a set of chain names that we allow
  private static final List PRINTABLE_CHARS =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
          .chars()
          .mapToObj(i -> (char) i)
          .map(String::valueOf)
          .collect(Collectors.toList());

  private CifConverter() {
    super();
  }

  /**
   * Parses a file in mmCIF format and convert it into a container of multiple PDB files.
   *
   * @param cifFile Path to mmCIF file.
   * @return A container of (possibly) multiple PDB files with mapped chain names.
   * @throws IOException When reading of mmCIF file or writing to output files fails.
   */
  public static ModelContainer convert(final File cifFile) throws IOException {
    final String cifContents = FileUtils.readFileToString(cifFile, Charset.defaultCharset());
    final List models = CifParser.parse(cifContents);
    return CifConverter.convert(cifFile, models);
  }

  /**
   * Converts a parsed mmCIF model into a set of PDB files with mapped chain names.
   *
   * @param model A parse mmCIF model.
   * @return A container of (possibly) multiple PDB files with mapped chain names.
   * @throws IOException When writing to output files fails.
   */
  public static ModelContainer convert(final DefaultCifModel model) throws IOException {
    final File cifFile = File.createTempFile("cif2pdb", ".cif");
    FileUtils.write(cifFile, model.toCif(), Charset.defaultCharset());

    if (!CifConverter.isConversionPossible(model)) {
      return CifContainer.emptyInstance(cifFile);
    }

    List> chainGroups = CifConverter.groupContactingChains(model);
    chainGroups = CifConverter.packGroups(chainGroups);
    final Map> fileChainMap = new HashMap<>();

    for (final Set chainGroup : chainGroups) {
      final File pdbFile = File.createTempFile("cif2pdb", ".pdb");
      final BidiMap chainMap = new TreeBidiMap<>();

      fileChainMap.put(pdbFile, chainMap);

      final StringBuilder pdbBuilder = new StringBuilder();
      CifConverter.writeHeader(model, chainMap, pdbBuilder);
      CifConverter.writeModel(model, chainGroup, chainMap, pdbBuilder);

      final String pdbData = pdbBuilder.toString();
      FileUtils.write(pdbFile, pdbData, Charset.defaultCharset());
    }

    return ImmutableCifContainer.of(cifFile, fileChainMap);
  }

  private static ModelContainer convert(final File cifFile, final List models)
      throws IOException {
    final List rnaModels = new ArrayList<>();

    for (final CifModel model : models) {
      if (model.containsAny(MoleculeType.RNA)) {
        final CifModel rnaModel = (CifModel) model.filteredNewInstance(MoleculeType.RNA);
        rnaModels.add(rnaModel);
      }
    }

    if (rnaModels.isEmpty()) {
      CifConverter.LOGGER.info("Neither model contain any RNA chain");
      return CifContainer.emptyInstance(cifFile);
    }

    for (final PdbModel model : rnaModels) {
      if (!CifConverter.isConversionPossible(model)) {
        return CifContainer.emptyInstance(cifFile);
      }
    }

    final CifModel firstModel = rnaModels.get(0);
    List> chainGroups = CifConverter.groupContactingChains(firstModel);
    chainGroups = CifConverter.packGroups(chainGroups);
    final Map> fileChainMap = new HashMap<>();

    for (final Set chainGroup : chainGroups) {
      final File pdbFile = File.createTempFile("cif2pdb", ".pdb");
      final BidiMap chainMap = new TreeBidiMap<>();

      fileChainMap.put(pdbFile, chainMap);

      final StringBuilder pdbBuilder = new StringBuilder();
      CifConverter.writeHeader(firstModel, chainMap, pdbBuilder);
      for (final PdbModel model : rnaModels) {
        CifConverter.writeModel(model, chainGroup, chainMap, pdbBuilder);
      }

      final String pdbData = pdbBuilder.toString();
      FileUtils.write(pdbFile, pdbData, Charset.defaultCharset());
    }

    return ImmutableCifContainer.of(cifFile, fileChainMap);
  }

  private static boolean isConversionPossible(final PdbModel model) {
    for (final PdbChain chain : model.chains()) {
      for (final PdbResidue residue : chain.residues()) {
        if (residue.residueNumber() > CifConverter.MAX_RESIDUE_NUMBER) {
          CifConverter.LOGGER.error(
              "Cannot continue. Chain {} has residue of index > 9999", chain.identifier());
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Merges chains which are in contact to form groups.
   *
   * @param model A parsed mmCIF model.
   * @return A list of sets of chains' identifiers. Each set contains chains which are in contact
   *     with each other (single linkage i.e. each chain has at least one contact in its set).
   */
  private static List> groupContactingChains(final CifModel model) {
    final List> chainGroups = CifConverter.initializeChainGroups(model);
    final Map> chainContacts = CifConverter.initializeChainContactMap(model);
    int i = 0;

    while ((chainGroups.size() > 1) && (i < chainGroups.size())) {
      final Set groupL = chainGroups.get(i);
      int toMerge = -1;

      for (int j = i + 1; (toMerge == -1) && (j < chainGroups.size()); j++) {
        final Set groupR = chainGroups.get(j);

        if (groupL.stream()
            .filter(chainContacts::containsKey)
            .map(chainContacts::get)
            .anyMatch(contactsL -> CollectionUtils.containsAny(contactsL, groupR))) {
          toMerge = j;
        }
      }

      if (toMerge == -1) {
        i += 1;
      } else {
        final Set groupToMerge = chainGroups.get(toMerge);
        groupL.addAll(groupToMerge);
        chainGroups.remove(toMerge);
        i = 0;
      }
    }

    return chainGroups;
  }

  /**
   * For each chain, create a set which contains only this chain.
   *
   * @param model A parsed coordinates of a mmCIF file.
   * @return A list of sets, each containing one chain.
   */
  private static List> initializeChainGroups(final PdbModel model) {
    return model.chains().stream()
        .map(chain -> new HashSet<>(Collections.singleton(chain.identifier())))
        .collect(Collectors.toList());
  }

  /**
   * Basing on the information in the mmCIF file itself, analyze which chains are in contact. Return
   * a mapping of chain name to a set of its contacts.
   *
   * @param model A parsed mmCIF model.
   * @return A map, where chain name is a key and set of this chain's contacts is the value.
   */
  private static Map> initializeChainContactMap(final CifModel model) {
    final Map> chainContacts = new HashMap<>();

    for (final QuantifiedBasePair quantifiedBasePair : model.basePairs()) {
      final BasePair basePair = quantifiedBasePair.basePair();
      final String left = basePair.left().chainIdentifier();
      final String right = basePair.right().chainIdentifier();

      if (!chainContacts.containsKey(left)) {
        chainContacts.put(left, new HashSet<>());
      }
      chainContacts.get(left).add(right);

      if (!chainContacts.containsKey(right)) {
        chainContacts.put(right, new HashSet<>());
      }
      chainContacts.get(right).add(left);
    }

    return chainContacts;
  }

  /**
   * Solve bin packing problem using first-fit decreasing heuristic. In other words, put as many
   * chains into as few separate files as possible.
   *
   * @param chainGroups List of chain groups. A chain group contains identifiers of chains which are
   *     in contact.
   * @return List of packed chain groups. A packed chain group contains one or more regular chain
   *     groups such that they can be fitted into a single PDB file.
   */
  private static List> packGroups(final List> chainGroups) {
    // sort chain groups in descending size order
    chainGroups.sort((t, t1) -> -Integer.compare(t.size(), t1.size()));

    final List> packed = new ArrayList<>();

    for (final Set group : chainGroups) {
      boolean flag = true;

      for (final Set bin : packed) {
        if ((bin.size() + group.size()) <= CifConverter.PRINTABLE_CHARS.size()) {
          bin.addAll(group);
          flag = false;
          break;
        }
      }

      if (flag) {
        final Set bin = new HashSet<>(group);
        packed.add(bin);
      }
    }

    return packed;
  }

  private static void writeHeader(
      final PdbModel firstModel,
      final BidiMap chainMap,
      final StringBuilder pdbBuilder) {
    pdbBuilder.append(firstModel.header()).append(System.lineSeparator());
    if (!firstModel.experimentalData().experimentalTechniques().isEmpty()) {
      pdbBuilder.append(firstModel.experimentalData()).append(System.lineSeparator());
    }
    pdbBuilder.append(PdbRemark2Line.PROLOGUE).append(System.lineSeparator());
    pdbBuilder.append(firstModel.resolution()).append(System.lineSeparator());

    final List missingResidues = firstModel.missingResidues();
    if (!missingResidues.isEmpty()) {
      pdbBuilder.append(PdbRemark465Line.PROLOGUE).append(System.lineSeparator());

      for (PdbRemark465Line missingResidue : missingResidues) {
        String chainIdentifier = missingResidue.chainIdentifier();
        final MoleculeType moleculeType = CifConverter.getChainType(firstModel, chainIdentifier);
        if (moleculeType == MoleculeType.RNA) {
          chainIdentifier = CifConverter.mapChain(chainMap, chainIdentifier);
          missingResidue =
              ImmutablePdbRemark465Line.copyOf(missingResidue).withChainIdentifier(chainIdentifier);
          pdbBuilder.append(missingResidue).append(System.lineSeparator());
        }
      }
    }

    for (final PdbModresLine modifiedResidue : firstModel.modifiedResidues()) {
      pdbBuilder.append(modifiedResidue).append(System.lineSeparator());
    }
  }

  /**
   * Return type of the named chain.
   *
   * @param firstModel A PDB/mmCIF model.
   * @param chainIdentifier Name of the chain to check.
   * @return A {@link MoleculeType} instance representing chain's type or {@link
   *     MoleculeType#UNKNOWN} if the chain is not present.
   */
  private static MoleculeType getChainType(
      final PdbModel firstModel, final String chainIdentifier) {
    return firstModel.chains().stream()
        .filter(chain -> Objects.equals(chain.identifier(), chainIdentifier))
        .findFirst()
        .map(SingleTypedResidueCollection::moleculeType)
        .orElse(MoleculeType.UNKNOWN);
  }

  /**
   * For a chain identifier in mmCIF, return its single-letter mapping from PDB. If the mmCIF
   * identifier is encountered for the first time, add a new mapping.
   *
   * @param chainMap A map where key is the mmCIF name and value is the PDB name.
   * @param chainIdentifier The mmCIF identifier.
   * @return The PDB identifier.
   */
  private static String mapChain(
      final BidiMap chainMap, final String chainIdentifier) {
    if (!chainMap.containsKey(chainIdentifier)) {
      chainMap.put(chainIdentifier, CifConverter.PRINTABLE_CHARS.get(chainMap.size()));
    }
    return chainMap.get(chainIdentifier);
  }

  private static void writeModel(
      final PdbModel rnaModel,
      final Collection allowedChains,
      final BidiMap chainMap,
      final StringBuilder pdbBuilder) {
    pdbBuilder.append("MODEL ").append(rnaModel.modelNumber()).append(System.lineSeparator());

    int serialNumber = 1;

    for (final PdbChain chain : rnaModel.chains()) {
      if (allowedChains.contains(chain.identifier())) {
        for (final PdbResidue residue : chain.residues()) {
          for (final PdbAtomLine atom : residue.atoms()) {
            final String chainIdentifier = CifConverter.mapChain(chainMap, atom.chainIdentifier());
            final ImmutablePdbAtomLine atomLine =
                ((ImmutablePdbAtomLine) atom)
                    .withSerialNumber(serialNumber)
                    .withChainIdentifier(chainIdentifier);
            serialNumber =
                (serialNumber < CifConverter.MAX_ATOM_SERIAL_NUMBER) ? (serialNumber + 1) : 1;
            pdbBuilder.append(atomLine).append(System.lineSeparator());
          }
        }
      }
    }

    pdbBuilder.append("ENDMDL");
    pdbBuilder.append(System.lineSeparator());
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy