// org.biojava.spark.data.AtomData (Maven / Gradle / Ivy)
package org.biojava.spark.data;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.biojava.nbio.structure.Atom;
import org.biojava.spark.utils.BiojavaSparkUtils;
/**
 * A wrapper around a {@link JavaRDD} or a {@link Dataset} of atoms.
 * <p>
 * An instance is constructed from exactly one of the two representations.
 * When only the {@link Dataset} was supplied, the {@link JavaRDD} view needed
 * by the counting methods is derived from it on first use.
 * @author Anthony Bradley
 *
 */
public class AtomData {

    /** The dataset form of the atoms; {@code null} when built from an RDD. */
    private Dataset<Atom> atomDataset;

    /** The RDD form of the atoms; {@code null} until supplied or derived from the dataset. */
    private JavaRDD<Atom> atomRdd;

    /**
     * Construct from a {@link JavaRDD}.
     * @param atomRdd the input {@link JavaRDD}
     */
    public AtomData(JavaRDD<Atom> atomRdd) {
        this.atomRdd = atomRdd;
    }

    /**
     * Construct from a {@link Dataset}.
     * @param atomDataset the input {@link Dataset}
     */
    public AtomData(Dataset<Atom> atomDataset) {
        this.atomDataset = atomDataset;
    }

    /**
     * Get the underlying {@link JavaRDD} for this {@link AtomData}.
     * If this instance was constructed from a {@link Dataset}, the RDD is
     * obtained from that dataset.
     * @return the underlying {@link JavaRDD} for this {@link AtomData}, or
     * {@code null} if this instance holds neither an RDD nor a dataset
     */
    public JavaRDD<Atom> getRdd() {
        return resolveRdd();
    }

    /**
     * Cache the data - for multi-processing.
     * Caches whichever representation(s) this instance currently holds;
     * does nothing if it holds none.
     */
    public void cacheData() {
        // Previously only the dataset was cached, which threw a
        // NullPointerException for instances constructed from an RDD.
        if (atomDataset != null) {
            atomDataset = atomDataset.cache();
        }
        if (atomRdd != null) {
            atomRdd = atomRdd.cache();
        }
    }

    /**
     * Count the number of times each element appears.
     * @return the map of element names (e.g. Ca for Calcium)
     * and the number of times they appear in the RDD
     * @throws IllegalStateException if this instance holds no atom data
     */
    public Map<String, Long> countByElement() {
        return requireRdd()
                .map(atom -> atom.getElement().toString())
                .countByValue();
    }

    /**
     * Count the number of times each atom name appears.
     * @return the map of atom names (e.g. CA for C-alpha)
     * and the number of times they appear in the RDD
     * @throws IllegalStateException if this instance holds no atom data
     */
    public Map<String, Long> countByAtomName() {
        return requireRdd()
                .map(atom -> atom.getName())
                .countByValue();
    }

    /**
     * Count the number of times each group atom name combination appears,
     * as produced by {@link BiojavaSparkUtils#getGroupAtomName}.
     * @return the map of group atom names and the number of times they
     * appear in the RDD
     * @throws IllegalStateException if this instance holds no atom data
     */
    public Map<String, Long> countByGroupAtomName() {
        return requireRdd()
                .map(atom -> BiojavaSparkUtils.getGroupAtomName(atom))
                .countByValue();
    }

    /**
     * Return the RDD view, deriving it from the dataset when only the
     * dataset was supplied.
     */
    private JavaRDD<Atom> resolveRdd() {
        if (atomRdd == null && atomDataset != null) {
            atomRdd = atomDataset.toJavaRDD();
        }
        return atomRdd;
    }

    /**
     * Return the RDD view, failing with a clear message when no data is held.
     */
    private JavaRDD<Atom> requireRdd() {
        JavaRDD<Atom> rdd = resolveRdd();
        if (rdd == null) {
            throw new IllegalStateException(
                    "AtomData holds neither a JavaRDD nor a Dataset of atoms");
        }
        return rdd;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy