All downloads are free. The search and download functionality uses the official Maven repository.

org.biojava.nbio.structure.cluster.SubunitExtractor Maven / Gradle / Ivy

/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.cluster;

import org.biojava.nbio.structure.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * The SubunitExtractor extracts the information of each protein {@link Chain}
 * in a {@link Structure} and converts them into a List of {@link Subunit}.
 *
 * @author Peter Rose
 * @author Aleix Lafita
 * @since 5.0.0
 *
 */
public final class SubunitExtractor {

	private static final Logger logger = LoggerFactory
			.getLogger(SubunitExtractor.class);

	/** Prevent instantiation **/
	private SubunitExtractor() {
	}

	/**
	 * Extract the information of each protein Chain in a Structure and converts
	 * them into a List of Subunit. The name of the Subunits is set to
	 * {@link Chain#getId()}.
	 * <p>
	 * Only polymer chains for which {@link Chain#isProtein()} is true and that
	 * yield at least one representative atom are converted. Afterwards, every
	 * Subunit whose size is below the adjusted minimum sequence length (derived
	 * from the three length parameters and the lengths of the extracted
	 * Subunits) is discarded.
	 *
	 * @param structure
	 *            Structure object with protein Chains
	 * @param absMinLen
	 *            {@link SubunitClustererParameters#getAbsoluteMinimumSequenceLength()}
	 * @param fraction
	 *            {@link SubunitClustererParameters#getMinimumSequenceLengthFraction()}
	 * @param minLen
	 *            {@link SubunitClustererParameters#getMinimumSequenceLength()}
	 * @return List of Subunits that passed the length filter, in chain order
	 */
	public static List<Subunit> extractSubunits(Structure structure,
			int absMinLen, double fraction, int minLen) {

		// The extracted subunit container
		List<Subunit> subunits = new ArrayList<Subunit>();

		for (Chain c : structure.getPolyChains()) {
			// Only take protein chains
			if (!c.isProtein()) {
				continue;
			}

			Atom[] ca = StructureTools.getRepresentativeAtomArray(c);

			// Guarded so the SEQRES string is only requested when debug
			// output is actually enabled.
			if (logger.isDebugEnabled()) {
				logger.debug("Chain {}; CA Atoms: {}; SEQRES: {}",
						c.getId(), ca.length, c.getSeqResSequence());
			}

			// A chain without representative atoms cannot form a Subunit
			if (ca.length == 0) {
				continue;
			}
			subunits.add(new Subunit(ca, c.getId(), null, structure));
		}

		// Calculate the minimum length of a Subunit
		int adjustedMinLen = calcAdjustedMinimumSequenceLength(subunits,
				absMinLen, fraction, minLen);
		logger.debug("Adjusted minimum sequence length: {}", adjustedMinLen);

		// Filter out short Subunits; iterate backwards so that removing an
		// element does not shift the indices still to be visited.
		for (int s = subunits.size() - 1; s >= 0; s--) {
			if (subunits.get(s).size() < adjustedMinLen) {
				subunits.remove(s);
			}
		}

		return subunits;
	}

	/**
	 * Returns an adapted minimum sequence length. This method ensures that
	 * structures that only have short chains are not excluded by the
	 * minimumSequenceLength cutoff value.
	 * <p>
	 * Only Subunits with a size of at least {@code absMinLen} are considered.
	 * If fewer than two such Subunits exist, {@code minLen} is returned
	 * unchanged. Otherwise the median of the considered lengths is computed
	 * and, when the shortest considered length is not below
	 * {@code median * fraction}, the result is the smaller of that shortest
	 * length and {@code minLen}; in all other cases it is {@code minLen}.
	 *
	 * @param subunits
	 *            the Subunits whose sizes are evaluated
	 * @param absMinLen
	 *            Subunits shorter than this are ignored in the calculation
	 * @param fraction
	 *            factor applied to the median length to obtain the threshold
	 *            the shortest considered Subunit is compared against
	 * @param minLen
	 *            the default minimum sequence length
	 * @return adjustedMinimumSequenceLength
	 */
	private static int calcAdjustedMinimumSequenceLength(
			List<Subunit> subunits, int absMinLen, double fraction, int minLen) {

		int minLength = Integer.MAX_VALUE;

		// Collect the lengths of all sufficiently long Subunits and track
		// the shortest one among them
		List<Integer> lengths = new ArrayList<Integer>();
		for (Subunit subunit : subunits) {
			int length = subunit.size();
			if (length >= absMinLen) {
				minLength = Math.min(length, minLength);
				lengths.add(length);
			}
		}

		// With fewer than two lengths there is nothing to compare against
		if (lengths.size() < 2) {
			return minLen;
		}

		// Calculate the median of the lengths
		Collections.sort(lengths);
		double median;
		if (lengths.size() % 2 == 1) {
			int middle = (lengths.size() - 1) / 2;
			median = lengths.get(middle);
		} else {
			int middle2 = lengths.size() / 2;
			int middle1 = middle2 - 1;
			median = 0.5 * (lengths.get(middle1) + lengths.get(middle2));
		}

		// If the median * fraction is not higher than the shortest length,
		// allow the cutoff to drop down to that shortest length
		if (minLength >= median * fraction) {
			return Math.min(minLength, minLen);
		}

		return minLen;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy