All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.snpEffect.commandLine.SnpEffCmdPdb Maven / Gradle / Ivy

The newest version!
package org.snpeff.snpEffect.commandLine;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPOutputStream;

import org.biojava.nbio.structure.AminoAcid;
import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.Calc;
import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.DBRef;
import org.biojava.nbio.structure.EntityInfo;
import org.biojava.nbio.structure.Group;
import org.biojava.nbio.structure.GroupType;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.snpeff.SnpEff;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Transcript;
import org.snpeff.pdb.DistanceResult;
import org.snpeff.pdb.IdMapper;
import org.snpeff.pdb.IdMapperEntry;
import org.snpeff.pdb.PdbFile;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;

import htsjdk.samtools.util.RuntimeEOFException;

/**
 * PDB distance analysis
 *
 * References: http://biojava.org/wiki/BioJava:CookBook:PDB:read
 *
 * @author pcingola
 */
public class SnpEffCmdPdb extends SnpEff {

	public static final String DEFAULT_PDB_DIR = "db/pdb";
	public static final String DEFAULT_ID_MAP_FILE = DEFAULT_PDB_DIR + "/idMap_pdbId_ensemblId_refseqId.txt.gz";
	public static final String DEFAULT_INTERACT_FILE = DEFAULT_PDB_DIR + "/pdbCompoundLines.txt";
	public static final String PDB_EXT = ".ent";
	public static final String PDB_EXT_GZ = ".ent.gz";
	public static final String[] PDB_EXTS = { PDB_EXT_GZ, PDB_EXT };
	public static final String PROTEIN_INTERACTION_FILE = "interactions.bin";

	public static final String UNIPROT_DATABASE = "UNP";

	public static final double DEFAULT_DISTANCE_THRESHOLD = 3.0; // Maximum distance to be considered 'in contact'
	public static final double DEFAULT_MAX_MISMATCH_RATE = 0.1;
	public static final int DEFAULT_PDB_MIN_AA_SEPARATION = 20; // Number of AA of distance within a sequence to
																// consider them for distance analysis
	public static final String DEFAULT_PDB_ORGANISM_COMMON = "HUMAN"; // PDB organism
	public static final String DEFAULT_PDB_ORGANISM_SCIENTIFIC = "HOMO SAPIENS";
	public static final double DEFAULT_PDB_RESOLUTION = 3.0; // PDB file resolution (in Angstrom)

	public static final ArrayList EMPTY_DISTANCES = new ArrayList<>();

	String idMapFile = DEFAULT_ID_MAP_FILE;
	String interactListFile = DEFAULT_ID_MAP_FILE;
	String pdbDir = DEFAULT_PDB_DIR;
	String pdbOrganismCommon = DEFAULT_PDB_ORGANISM_COMMON; // PDB organism "common name"
	String pdbOrganismScientific = DEFAULT_PDB_ORGANISM_SCIENTIFIC; // PDB organism "scientific name"
	double pdbResolution = DEFAULT_PDB_RESOLUTION; // PDB file resolution (in Angstrom)
	double maxMismatchRate = DEFAULT_MAX_MISMATCH_RATE;
	double distanceThreshold = DEFAULT_DISTANCE_THRESHOLD;
	double distanceThresholdNon = Double.POSITIVE_INFINITY; // Distance threshold for 'not in contact'
	int aaMinSeparation = DEFAULT_PDB_MIN_AA_SEPARATION;
	int countFilesPass, countMapError, countMapOk;
	IdMapper idMapper;
	IdMapper idMapperConfirmed;
	PDBFileReader pdbreader;
	Set confirmedPdbChainsMappings; // A Set of all 'pbdId:chainId' mappings that have been confirmed with the
											// reference genome
	Map trancriptById; // Transcripts by id (trId without version number)
	Collection pdbFileNames; // PDB files to porcess
	BufferedWriter outpufFile;
	Set saved;
	List distanceResults;

	public SnpEffCmdPdb() {
	}

	/**
	 * Get AA sequence
	 */
	String aaSequence(Chain chain) {
		// AA sequence
		List aas = aminoAcids(chain);
		StringBuilder sb = new StringBuilder();

		for (AminoAcid aa1 : aas)
			sb.append(aa1.getAminoType());

		return sb.toString();
	}

	/**
	 * Get all AAs in a chain
	 */
	List aminoAcids(Chain chain) {
		ArrayList aas = new ArrayList();
		for (Group group : chain.getAtomGroups())
			if (group instanceof AminoAcid)
				aas.add((AminoAcid) group);
		return aas;
	}

	/**
	 * Get NIPROT IDs from PDB structure
	 */
	Map chainUniprotIds(Structure pdbStruct) {
		Map chain2uniproId = new HashMap();
		for (DBRef dbref : pdbStruct.getDBRefs()) {
			if (debug)
				Gpr.debug("PDB_DBREF\tchain:" + dbref.getChainName() + "\tdb: " + dbref.getDatabase() + "\tID: "
						+ dbref.getDbAccession());
			if (dbref.getDatabase().equals(UNIPROT_DATABASE))
				chain2uniproId.put(dbref.getChainName(), dbref.getDbAccession());
		}
		return chain2uniproId;
	}

	/**
	 * Check that protein sequences match between a PDB 'chain' and the refrence
	 * genome Return a list of maps that are confirmed (i.e. AA sequence matches
	 * between transcript and PDB) Note: Only part of the sequence usually matches
	 * (PDB chains shorter than the transcript)
	 */
	List checkSequencePdbGenome(Structure pdbStruct, Chain chain, String trId,
			List idmapsOri) {
		List idmapsNew = new ArrayList<>();
		String pdbId = pdbStruct.getPDBCode();

		// Does chain pass filter criteria?
		if (!filterPdbChain(chain))
			return idmapsNew;

		// Transcript
		Transcript tr = trancriptById.get(trId);
		String prot = tr.protein();
		if (debug)
			System.err.println("\tTranscript ID: " + tr.getId() + "\tProtein [" + prot.length() + "]: " + prot);

		// Compare sequence to each AA-Chain
		StringBuilder sb = new StringBuilder();
		int countMatch = 0, countMismatch = 0;

		// Count differences
		for (Group group : chain.getAtomGroups())
			if (group instanceof AminoAcid) {
				AminoAcid aa = (AminoAcid) group;
				int aaPos = aa.getResidueNumber().getSeqNum() - 1;
				if (aaPos < 0)
					continue; // I don't know why some PDB coordinates are negative...

				char aaLetter = aa.getChemComp().getOne_letter_code().charAt(0);
				if (prot.length() > aaPos) {
					char trAaLetter = prot.charAt(aaPos);
					if (aaLetter == trAaLetter)
						countMatch++;
					else
						countMismatch++;
				} else
					countMismatch++;
				sb.append(aa.getChemComp().getOne_letter_code());
			}

		// Only use mappings that have low error rate
		if (countMatch + countMismatch > 0) {
			double err = countMismatch / ((double) (countMatch + countMismatch));
			if (debug)
				Gpr.debug("\tChain: " + chain.getId() + "\terror: " + err + "\t" + sb);

			if (err < maxMismatchRate) {
				if (debug)
					Gpr.debug("\tMapping OK    :\t" + trId + "\terror: " + err);

				int trAaLen = tr.protein().length();
				int pdbAaLen = chain.getAtomGroups(GroupType.AMINOACID).size();

				for (IdMapperEntry idm : idmapsOri) {
					if (trId.equals(idm.trId) && pdbId.equals(idm.pdbId)) {
						idmapsNew.add(idm.cloneAndSet(chain.getId(), pdbAaLen, trAaLen));
						break;
					}
				}
			} else if (debug)
				Gpr.debug("\tMapping ERROR :\t" + trId + "\terror: " + err);
		}

		return idmapsNew;
	}

	/**
	 * Check that protein sequences form PDB (pdbFile) matches sequences from Genome
	 * Return a list of maps that are confirmed (i.e. AA sequence matches between
	 * transcript and PDB)
	 */
	List checkSequencePdbGenome(Structure pdbStruct, Set trIds) {
		// Check idMaps. Only return those that match
		ArrayList list = new ArrayList();
		for (String trId : trIds)
			if (filterTranscript(trId)) {
				list.addAll(checkSequencePdbGenome(pdbStruct, trId));
			}

		return list;
	}

	/**
	 * Check that protein sequences match between PDB and Genome Return a list of
	 * maps that are confirmed (i.e. AA sequence matches between transcript and PDB)
	 * Note: Only part of the sequence usually matches (PDB chains shorter than the
	 * transcript)
	 */
	List checkSequencePdbGenome(Structure pdbStruct, String trId) {
		String pdbId = pdbStruct.getPDBCode();
		if (debug)
			System.err.println("\nChecking '" + trId + "'\t<->\t'" + pdbStruct.getPDBCode() + "'");
		List idmapsOri = idMapper.getByPdbId(pdbId);
		List idmapsNew = new ArrayList<>();

		// Compare each chain in the PDB structure
		for (Chain chain : pdbStruct.getChains())
			idmapsNew.addAll(checkSequencePdbGenome(pdbStruct, chain, trId, idmapsOri));

		// Show all confirmed mappings
		if (debug) {
			for (IdMapperEntry ime : idmapsNew)
				Gpr.debug(ime);
		}

		return idmapsNew;
	}

	void closeOuptut() {
		try {
			if (outpufFile != null)
				outpufFile.close();
			outpufFile = null;
			saved = null;
		} catch (IOException e) {
			throw new RuntimeException("Error closing output file", e);
		}
	}

	/**
	 * Map transcript ID to transcript
	 */
	void createTranscriptMap() {
		// Create transcript map
		trancriptById = new HashMap<>();
		for (Gene g : config.getSnpEffectPredictor().getGenome().getGenes()) {
			for (Transcript tr : g) {
				String trId = IdMapper.transcriptIdNoVersion(tr.getId());
				trancriptById.put(trId, tr);
			}
		}
	}

	void deleteOuptut(String outputPdbFile) {
		File of = new File(outputPdbFile);
		of.delete();
	}

	/**
	 * Minimum distance between all atoms in two amino acids
	 */
	double distanceMin(AminoAcid aa1, AminoAcid aa2) {
		double distMin = Double.POSITIVE_INFINITY;

		for (Atom atom1 : aa1.getAtoms())
			for (Atom atom2 : aa2.getAtoms()) {
				double dist = Calc.getDistance(atom1, atom2);
				distMin = Math.min(distMin, dist);
			}

		return distMin;
	}

	/**
	 * Parse PDB id from PDB file name
	 */
	String fileName2PdbId(String pdbFileName) {
		String base = Gpr.baseName(pdbFileName);
		if (base.startsWith("pdb"))
			base = base.substring(3);
		base = Gpr.removeExt(base, PDB_EXTS);
		return base.toUpperCase();
	}

	/**
	 * Return true if the PDB structure passes the filter criteria I.e.: Resolution
	 * less or equal to desire one and species matches
	 */
	boolean filterPdb(Structure pdbStruct) {
		// Within resolution limits? => Process
		double res = pdbStruct.getPDBHeader().getResolution();
		if (res > pdbResolution) {
			if (debug)
				Gpr.debug("PDB resolution is " + res + ", ignoring file");
			return false;
		}

		// Match organism (any chain)
		boolean ok = false;
		for (Chain chain : pdbStruct.getChains())
			ok |= filterPdbChain(chain);

		return ok;
	}

	/**
	 * Return true if the PDB's chain passes the criteria I.e.: Organism matches
	 */
	boolean filterPdbChain(Chain chain) {
		// note: Compound is replaced by EntityInfo in biojava 5.x
		for (EntityInfo entityInfo : chain.getStructure().getEntityInfos()) {
			if (contains(entityInfo.getOrganismCommon(), pdbOrganismCommon)
					|| contains(entityInfo.getOrganismScientific(), pdbOrganismScientific)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Return true if the transcript passes the criteria (i.e. the ID is present in
	 * 'trancriptById' map)
	 */
	boolean filterTranscript(String trId) {
		Transcript tr = trancriptById.get(trId);
		if (tr == null) {
			if (debug)
				Gpr.debug("Transcript '" + trId + "' not found in " + genomeVer + ".");
			return false;
		}

		return true;
	}

	/**
	 * Analyze interacting sites in a pdb structure
	 */
	List findInteractingCompound(Structure pdbStruct, Chain chain1, Chain chain2, String trId1,
			String trId2) {
		ArrayList results = new ArrayList<>();

		Transcript tr1 = getTranscript(trId1);
		Transcript tr2 = getTranscript(trId2);
		List aas1 = aminoAcids(chain1);
		List aas2 = aminoAcids(chain2);

		// Find between chain interactions
		for (AminoAcid aa1 : aas1) {
			for (AminoAcid aa2 : aas2) {
				double dmin = distanceMin(aa1, aa2);
				if (select(dmin)) {
					DistanceResult dres = new DistanceResult(aa1, aa2, tr1, tr2, dmin);
					if (dres.hasValidCoords()) {
						results.add(dres);
						countMapOk++;
						if (debug)
							Gpr.debug(((dmin <= distanceThreshold) ? "AA_IN_CONTACT\t" : "AA_NOT_IN_CONTACT\t") + dres);
					} else {
						countMapError++;
					}
				}
			}
		}

		return results;
	}

	/**
	 * Find interacting AA within s same chain (i.e. two amino acids in close
	 * proximity within the same chain)
	 */
	List findInteractingSingle(Chain chain, Transcript tr) {
		ArrayList results = new ArrayList<>();
		List aas = aminoAcids(chain);

		for (int i = 0; i < aas.size(); i++) {
			int minj = i + aaMinSeparation;

			for (int j = minj; j < aas.size(); j++) {
				AminoAcid aa1 = aas.get(i);
				AminoAcid aa2 = aas.get(j);
				double d = distanceMin(aa1, aa2);

				if (select(d)) {
					DistanceResult dres = new DistanceResult(aa1, aa2, tr, tr, d);
					if (dres.hasValidCoords()) {
						results.add(dres);
						countMapOk++;
						if (debug)
							Gpr.debug(((d <= distanceThreshold) ? "AA_IN_CONTACT\t" : "AA_NOT_IN_CONTACT\t") + dres);
					} else {
						countMapError++;
					}
				}
			}
		}

		return results;
	}

	/**
	 * Distances within all chains in a structure
	 */
	List findInteractingSingle(Structure structure, Transcript tr) {
		ArrayList results = new ArrayList<>();

		// Distance
		for (Chain chain : structure.getChains())
			if (filterPdbChain(chain)) {
				results.addAll(findInteractingSingle(chain, tr));
			}

		return results;
	}

	Collection findPdbFiles() {
		return findPdbFiles(new File(pdbDir));
	}

	/**
	 * Find all files (in any subdir) matching pdb entry extension
	 */
	Collection findPdbFiles(File dir) {
		if (debug)
			Gpr.debug("Finding PDB files in directory: " + dir);
		List list = new LinkedList<>();

		if (!dir.isDirectory())
			throw new RuntimeException("No such directory '" + dir + "'");

		for (File f : dir.listFiles()) {
			String fileName = f.getName();
			if (f.isDirectory()) {
				list.addAll(findPdbFiles(f));
			} else if (f.isFile() && (fileName.endsWith(PDB_EXT) || fileName.endsWith(PDB_EXT_GZ))) {
				list.add(f.getAbsolutePath());
				if (debug)
					Gpr.debug("Found PDB file: " + f.getAbsolutePath());
			}
		}

		return list;
	}

	Set findTranscriptIds(String pdbId) {
		// Get transcript IDs
		List idEntries = idMapper.getByPdbId(pdbId);
		Set trIds = IdMapper.transcriptIds(idEntries);

		if (debug) {
			StringBuilder sb = new StringBuilder();
			sb.append("PDB ID: " + pdbId);
			sb.append("\tEntries:\n");
			if (idEntries != null) {
				for (IdMapperEntry ime : idEntries)
					sb.append("\t\t" + ime + "\n");
				sb.append("\tTranscripts:\t" + trIds + "\n");
			}
			Gpr.debug(sb);
		}

		return trIds;
	}

	public List getDistanceResults() {
		return distanceResults;
	}

	Transcript getTranscript(String trId) {
		return trancriptById.get(IdMapper.transcriptIdNoVersion(trId));
	}

	/**
	 * Filter IdMaps for a specific chain
	 */
	List idMapChain(Structure pdbStruct, Chain chain, List idMaps) {
		List idMapChain = new ArrayList<>();
		for (IdMapperEntry idmap : idMaps) {
			if (idmap.pdbId.equals(pdbStruct.getPDBCode()) //
					&& idmap.pdbChainId.equals(chain.getId()) //
			) {
				idMapChain.add(idmap);
			}
		}

		return idMapChain;
	}

	/**
	 * Load all data
	 */
	public void initialize() {
		// ---
		// Initialize SnpEff
		// ---
		String argsSnpEff[] = { "eff", "-c", configFile, genomeVer };
		args = argsSnpEff;
		setGenomeVer(genomeVer);
		parseArgs(argsSnpEff);
		loadConfig();

		// Load SnpEff database
		if (genomeVer != null) {
			Timer.showStdErr("Loading SnpEff's database: " + genomeVer);
			loadDb();
			Timer.showStdErr("Done.");
		}

		// Initialize trancriptById
		trancriptById = new HashMap<>();
		for (Gene g : config.getSnpEffectPredictor().getGenome().getGenes())
			for (Transcript tr : g) {
				String id = tr.getId();
				if (id.indexOf('.') > 0)
					id = id.substring(0, id.indexOf('.')); // When using RefSeq transcripts, we don't store sub-version
															// number

				if (trancriptById.containsKey(id)) {
					// There is already a transcript?
					// Favor shorter chromosome names. E.g.: 'chr6' is better than 'chr6_cox_hap2'
					String chrPrev = trancriptById.get(id).getChromosomeName();
					String chr = tr.getChromosomeName();

					if (chr.length() < chrPrev.length())
						trancriptById.put(id, tr);
				} else {
					// Transcript not present: Add it
					trancriptById.put(id, tr);
				}
			}

		// ---
		// Initialize reader
		// ---
		pdbreader = new PDBFileReader();
	}

	boolean isCompound(Structure pdbStruct) {
		List compounds = pdbStruct.getEntityInfos();
		return compounds != null && !compounds.isEmpty();
	}

	public void loadIdMapper() {
		if (verbose)
			Timer.showStdErr("Loading id maps " + idMapFile);
		idMapper = new IdMapper();
		idMapper.setVerbose(verbose);
		idMapper.load(idMapFile);
	}

	/**
	 * Open output file
	 */
	void openOuptut(String outputPdbFile) {
		try {
			if (verbose)
				Timer.showStdErr("Saving results to database file '" + outputPdbFile + "'");
			outpufFile = new BufferedWriter(
					new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(new File(outputPdbFile)))));
			saved = new HashSet();
		} catch (IOException e) {
			throw new RuntimeEOFException("Error opening output file '" + outputPdbFile + "'", e);
		}
	}

	/**
	 * Parse command line arguments
	 */
	@Override
	public void parseArgs(String[] args) {
		if (args == null)
			return;

		this.args = args;

		for (int i = 0; i < args.length; i++) {
			String arg = args[i];

			// Is it a command line option?
			// Note: Generic options (such as config, verbose, debug, quiet, etc.) are
			// parsed by SnpEff class
			// ---
			if (isOpt(arg)) {
				arg = arg.toLowerCase();

				switch (arg.toLowerCase()) {
				case "-aasep":
					if ((i + 1) < args.length)
						aaMinSeparation = Gpr.parseIntSafe(args[++i]);
					else
						usage("Missing parameter in '-aaSep'");
					break;

				case "-idmap":
					if ((i + 1) < args.length)
						idMapFile = args[++i];
					else
						usage("Missing parameter in '-idMap'");
					break;

				case "-interactlist":
					if ((i + 1) < args.length)
						idMapFile = args[++i];
					else
						usage("Missing parameter in '-interactList'");
					break;

				case "-maxdist":
					if ((i + 1) < args.length)
						distanceThreshold = Gpr.parseDoubleSafe(args[++i]);
					else
						usage("Missing parameter in '-maxDist'");
					break;

				case "-maxerr":
					if ((i + 1) < args.length)
						maxMismatchRate = Gpr.parseDoubleSafe(args[++i]);
					else
						usage("Missing parameter: '-maxErr'");
					break;

				case "-org":
					if ((i + 1) < args.length)
						pdbOrganismCommon = args[++i].toUpperCase();
					else
						usage("Missing parameter in '-org'");
					break;

				case "-orgscientific":
					if ((i + 1) < args.length)
						pdbOrganismScientific = args[++i].toUpperCase();
					else
						usage("Missing parameter in '-orgScientific'");
					break;

				case "-pdbdir":
					if ((i + 1) < args.length)
						pdbDir = args[++i];
					else
						usage("Missing parameter in '-pdbDir'");
					break;

				case "-res":
					if ((i + 1) < args.length)
						pdbResolution = Gpr.parseDoubleSafe(args[++i]);
					else
						usage("Missing parameter: '-res'");
					break;

				default:
					usage("Unknown option '" + arg + "'");
				}
			} else if (genomeVer == null || genomeVer.isEmpty())
				genomeVer = arg;
		}

		// ---
		// Sanity checks
		// ---

		// Check: Do we have all required parameters?
		if (genomeVer == null || genomeVer.isEmpty())
			usage("Missing genomer_version parameter");

		if (distanceThreshold <= 0)
			usage("Max distance in '-maxdist' command line option must be a positive number");
		if (maxMismatchRate <= 0)
			usage("Max mismatch rate in '-maxErr' command line option must be a positive number");
		if (pdbResolution <= 0)
			usage("Resoluton in '-res' command line option must be a positive number");
		if (aaMinSeparation <= 0)
			usage("Minimum separation in '-aaSep' command line option must be a positive, integer number");
	}

	/**
	 * PDB analysis
	 */
	public void pdb() {
		// Find all pdb files
		if (verbose)
			Timer.showStdErr("Finding PDB files");
		pdbFileNames = findPdbFiles();

		// Create transcript map
		createTranscriptMap();

		// Map IDs and confirm that amino acid sequence matches (within certain error
		// rate)
		if (verbose)
			Timer.showStdErr("Analyzing PDB sequences");
		pdbAnalysis();
		closeOuptut();
	}

	/**
	 * Check that protein sequences form PDB matches sequences from Genome Return an
	 * IdMapped of confirmed entries (i.e. AA sequence matches between transcript
	 * and PDB)
	 */
	protected void pdbAnalysis() {
		if (verbose)
			Timer.showStdErr("Analyzing PDB files");

		for (String pdbFileName : pdbFileNames)
			pdbAnalysis(pdbFileName);

		if (verbose)
			Timer.showStdErr("Done." //
					+ "\n\tNumber of PDB files : " + pdbFileNames.size() //
					+ "\n\tPDB files analyzed  : " + countFilesPass //
					+ "\n\tAA 'in contact'     : " + countMapOk //
					+ "\n\tMapping errors      : " + countMapError //
			);
	}

	/**
	 * Analyze a PDB file
	 */
	protected void pdbAnalysis(String pdbFileName) {
		// Get Pdb ID from file name
		String pdbId = fileName2PdbId(pdbFileName);

		// Find transcript IDs
		Set trIds = findTranscriptIds(pdbId);
		if (trIds == null || trIds.isEmpty()) {
			if (debug)
				Gpr.debug("No transcript IDs found for PDB entry '" + pdbId + "'");
			return;
		}

		// Read PDB structure
		Structure pdbStruct = readPdbFile(pdbFileName);
		if (pdbStruct == null || !filterPdb(pdbStruct))
			return; // Passes filter?

		// Single protein analysis
		pdbAnalysisSingle(pdbStruct, trIds);

		// Compound protein analysis
		if (isCompound(pdbStruct))
			pdbAnalysisCompound(pdbStruct, trIds);
	}

	/**
	 * Interaction analysis of PDB compounds (co-crystalized molecules)
	 */
	void pdbAnalysisCompound(Structure pdbStruct, Set trIds) {
		countFilesPass++;
		List idMapConfirmed = checkSequencePdbGenome(pdbStruct, trIds);
		if (idMapConfirmed == null || idMapConfirmed.isEmpty())
			return;

		// Get uniprot references
		Map chain2uniproId = chainUniprotIds(pdbStruct);

		// Analyze distance between amino acids in different chains
		for (Chain chain1 : pdbStruct.getChains()) {

			String chainId1 = chain1.getId();
			List idMapChain1 = idMapChain(pdbStruct, chain1, idMapConfirmed);
			if (idMapChain1.isEmpty()) {
				if (debug)
					Gpr.debug("Empty maps for chain '" + chainId1 + "'");
				continue;
			}

			for (Chain chain2 : pdbStruct.getChains()) {
				String chainId2 = chain2.getId();
				if (chainId1.compareTo(chainId2) >= 0)
					continue; // Only calculate once

				// Compare UNIPROT IDs
				String uniprot1 = chain2uniproId.get(chainId1);
				String uniprot2 = chain2uniproId.get(chainId2);
				if (uniprot1 != null && uniprot2 != null && uniprot1.equals(uniprot2)) {
					if (debug)
						Gpr.debug("Filtering out two chains with same UNIPROT ID: '" + uniprot1);
					continue;
				}

				List idMapChain2 = idMapChain(pdbStruct, chain2, idMapConfirmed);
				if (idMapChain2.isEmpty()) {
					if (debug)
						Gpr.debug("Empty maps for chain '" + chainId2 + "'");
					continue;
				}

				// Find interactions for each transcript pair
				for (IdMapperEntry im1 : idMapChain1) {
					for (IdMapperEntry im2 : idMapChain2) {
						// Don't analyze same transcript (this is done in pdbAnalysisCompoundSingle)
						if (!im1.trId.equals(im2.trId)) {
							List dres = findInteractingCompound(pdbStruct, chain1, chain2, im1.trId,
									im2.trId);
							save(dres);
						}
					}
				}
			}
		}
	}

	/**
	 * Interaction analysis of PDB molecules (within molecule interactions)
	 */
	void pdbAnalysisSingle(Structure pdbStruct, Set trIds) {
		// Check that entries map to the genome
		countFilesPass++;
		List idMapConfirmed = checkSequencePdbGenome(pdbStruct, trIds);
		if (idMapConfirmed == null || idMapConfirmed.isEmpty())
			return;

		// Calculate distances
		for (IdMapperEntry idmap : idMapConfirmed) {
			// Get full transcript ID including version (version numbers are removed in the
			// IdMap)
			Transcript tr = getTranscript(idmap.trId);
			List dres = findInteractingSingle(pdbStruct, tr);
			save(dres);
		}
	}

	/**
	 * Read and parse PDB file
	 */
	public Structure readPdbFile(String pdbFileName) {
		try {
			PdbFile pdbreader = new PdbFile();
			if (verbose)
				Timer.showStdErr("Reading PDB file: " + pdbFileName);
			return pdbreader.getStructure(pdbFileName);
		} catch (IOException e) {
			if (verbose)
				e.printStackTrace();
			return null;
		}
	}

	@Override
	public boolean run() {
		loadIdMapper(); // Load ID map table
		loadConfig(); // Read config file

		// Open output file (save to database)
		// Note: We do this before opening the database, because it removes
		// old interaction files (we don't want to try to load stale or old
		// interactions).
		String outputPdbFile = config.getDirDataGenomeVersion() + "/" + PROTEIN_INTERACTION_FILE;
		deleteOuptut(outputPdbFile);

		loadDb(); // Load database

		// Run analysis
		openOuptut(outputPdbFile);
		pdb();

		return true;
	}

	public boolean run(boolean storeResults) {
		distanceResults = new ArrayList<>();
		run();
		return true;
	}

	/**
	 * Save results
	 */
	void save(List distResults) {
		for (DistanceResult d : distResults)
			try {

				String dstr = d.toString();

				if (!saved.contains(dstr)) {
					outpufFile.write(dstr + "\n");
					saved.add(dstr);
					if (distanceResults != null)
						distanceResults.add(d);
				}
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
	}

	/**
	 * Should this pair of amino acids be selected?
	 */
	boolean select(double d) {
		if (!Double.isInfinite(distanceThreshold))
			return d <= distanceThreshold; // Amino acids in close distance
		if (!Double.isInfinite(distanceThresholdNon))
			return d > distanceThresholdNon;// Amino acids far apart
		throw new RuntimeException("Neither distance is finite!");
	}

	public void setDistanceThresholdNon(double distanceThresholdNon) {
		this.distanceThresholdNon = distanceThresholdNon;
	}

	/**
	 * Show 'usage;' message and exit with an error code '-1'
	 * 
	 * @param message
	 */
	@Override
	public void usage(String message) {
		if (message != null) {
			System.err.println("Error        :\t" + message);
			System.err.println("Command line :\t" + commandLineStr(false) + "\n");
		}

		System.err.println("snpEff version " + VERSION);
		System.err.println("Usage: snpEff pdb [options] genome_version");
		System.err.println("\n");
		System.err.println("\nOptions:");
		System.err.println(
				"\t-aaSep                  : Minimum number of AA of separation within the sequence. Default: "
						+ aaMinSeparation);
		System.err.println(
				"\t-idMap                    : ID map file (i.e. file containing mapping from PDB ID to transcript ID). Default: "
						+ idMapFile);
		System.err.println(
				"\t-interactList             : A file containing protein-protein interations (from PDB co-srystalzed structures). Default: "
						+ interactListFile);
		System.err.println(
				"\t-maxDist                : Maximum distance in Angtrom for any atom in a pair of amino acids to be considered 'in contact'. Default: "
						+ distanceThreshold);
		System.err.println(
				"\t-maxErr                 : Maximum amino acid sequence differece between PDB file and genome. Default: "
						+ maxMismatchRate);
		System.err.println("\t-org                      : Organism 'common name'. Default: " + pdbOrganismCommon);
		System.err.println(
				"\t-orgScientific            : Organism 'scientific name'. Default: " + pdbOrganismScientific);
		System.err
				.println("\t-pdbDir                   : Path to PDB files (files in all sub-dirs are scanned).");
		System.err
				.println("\t-res                    : Maximum PDB file resolution. Default: " + pdbResolution);

		usageGenericAndDb();

		System.exit(-1);
	}

	/**
	 * Return true if s1 is not null and contains s2.
	 *
	 * @param s1
	 *            string
	 * @param s2
	 *            string
	 * @return true if s1 is not null and contains s2
	 */
	private static boolean contains(String s1, String s2) {
		return s1 != null && s1.indexOf(s2) >= 0;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy