// org.biojava.spark.data.GroupDataRDD (Maven / Gradle / Ivy)
package org.biojava.spark.data;
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.biojava.nbio.structure.Group;
/**
 * A wrapper around a {@link JavaPairRDD} holding {@link Group} level data,
 * keyed by the name of each group.
 *
 * @author Anthony Bradley
 */
public class GroupDataRDD {

    /**
     * The {@link JavaPairRDD} of {@link Group} used internally by the class.
     * The String key is the name of the Group.
     */
    private JavaPairRDD<String, Group> groupRdd;

    /**
     * Constructor of the RDD from a {@link JavaPairRDD} of {@link Group}.
     * The input RDD is cached on construction.
     *
     * @param groupRdd the input {@link JavaPairRDD} of {@link Group},
     *                 keyed by group name
     */
    public GroupDataRDD(JavaPairRDD<String, Group> groupRdd) {
        this.groupRdd = groupRdd.cache();
    }

    /**
     * Cache the data - for multi-processing.
     * The constructor already requests caching, so this simply re-applies
     * {@code cache()} to the wrapped RDD.
     */
    public void cacheData() {
        this.groupRdd = this.groupRdd.cache();
    }

    /**
     * Get the {@link JavaPairRDD} of {@link Group} data.
     *
     * @return the {@link JavaPairRDD} of {@link Group} data, keyed by group name
     */
    public JavaPairRDD<String, Group> getGroupRdd() {
        return groupRdd;
    }

    /**
     * Count the number of times each group name appears.
     * The name is taken from {@link Group#getPDBName()} of each value, not
     * from the key of the pair.
     *
     * @return the map of group names (e.g. LYS for Lysine)
     *         and the number of times they appear in the RDD
     */
    public Map<String, Long> countByGroupName() {
        return groupRdd
                .map(t -> t._2.getPDBName())
                .countByValue();
    }

    /**
     * Get the atoms from the groups.
     *
     * @return the atoms for all the groups
     */
    public AtomData getAtoms() {
        // NOTE(review): the lambda returns the collection from Group#getAtoms()
        // directly, which presumably relies on a flatMap variant accepting an
        // Iterable - confirm against the Spark version in use.
        return new AtomData(
                groupRdd
                        .flatMap(t -> t._2.getAtoms()));
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy