// org.biojava.spark.data.AtomData (Maven / Gradle / Ivy)
//
// Go to download
package org.biojava.spark.data;

import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.biojava.nbio.structure.Atom;
import org.biojava.spark.utils.BiojavaSparkUtils;

/**
 * A wrapper around {@link JavaRDD} and {@link Dataset} of atoms.
 * @author Anthony Bradley
 *
 */
public class AtomData {

	/** Dataset view of the atoms; {@code null} when this instance was built from a {@link JavaRDD}. */
	private Dataset<Atom> atomDataset;
	/** RDD view of the atoms; {@code null} when this instance was built from a {@link Dataset}. */
	private JavaRDD<Atom> atomRdd;

	/**
	 * Construct from a {@link JavaRDD}.
	 * @param atomRdd the input {@link JavaRDD} of atoms
	 */
	public AtomData(JavaRDD<Atom> atomRdd) {
		this.atomRdd = atomRdd;
	}

	/**
	 * Construct from a {@link Dataset}.
	 * @param atomDataset the input {@link Dataset} of atoms
	 */
	public AtomData(Dataset<Atom> atomDataset) {
		this.atomDataset = atomDataset;
	}

	/**
	 * Get the underlying {@link JavaRDD} for this {@link AtomData}.
	 * If this instance was constructed from a {@link Dataset}, the RDD is
	 * derived from that Dataset on demand.
	 * @return the {@link JavaRDD} of atoms, or {@code null} if neither an
	 * RDD nor a Dataset was supplied
	 */
	public JavaRDD<Atom> getRdd() {
		return resolveRdd();
	}

	/**
	 * Cache the data - for multi-processing.
	 * Caches whichever representation this instance holds: the
	 * {@link Dataset} when present, otherwise the {@link JavaRDD}.
	 */
	public void cacheData() {
		// Only one of the two fields is set by the constructors, so guard
		// both instead of dereferencing the Dataset unconditionally.
		if (atomDataset != null) {
			atomDataset = atomDataset.cache();
		} else if (atomRdd != null) {
			atomRdd = atomRdd.cache();
		}
	}

	/**
	 * Count the number of times each element appears.
	 * @return the map of element names (e.g. Ca for Calcium)
	 * and the number of times they appear in the RDD
	 */
	public Map<String, Long> countByElement() {
		return resolveRdd()
				.map(t -> t.getElement().toString())
				.countByValue();
	}

	/**
	 * Count the number of times each atom name appears.
	 * @return the map of atom names (e.g. CA for C-alpha)
	 * and the number of times they appear in the RDD
	 */
	public Map<String, Long> countByAtomName() {
		return resolveRdd()
				.map(t -> t.getName())
				.countByValue();
	}

	/**
	 * Count the number of times each group / atom name combination appears.
	 * The key for each atom is produced by
	 * {@link BiojavaSparkUtils#getGroupAtomName(Atom)}.
	 * @return the map of group atom name keys and the number of times
	 * they appear in the RDD
	 */
	public Map<String, Long> countByGroupAtomName() {
		return resolveRdd()
				.map(t -> BiojavaSparkUtils.getGroupAtomName(t))
				.countByValue();
	}

	/**
	 * Resolve the RDD view of the atoms regardless of which constructor was used.
	 * @return the stored {@link JavaRDD}, else one derived from the stored
	 * {@link Dataset}, else {@code null}
	 */
	private JavaRDD<Atom> resolveRdd() {
		if (atomRdd != null) {
			return atomRdd;
		}
		if (atomDataset != null) {
			return atomDataset.toJavaRDD();
		}
		return null;
	}

}