
/*
 * MMTF Spark is a series of libraries and functions for using MMTF with Spark.
 */
package org.rcsb.mmtf.spark;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.rcsb.mmtf.api.StructureDataInterface;
import org.rcsb.mmtf.dataholders.MmtfStructure;
import org.rcsb.mmtf.decoder.DefaultDecoder;
import org.rcsb.mmtf.decoder.ReaderUtils;
import org.rcsb.mmtf.serialization.MessagePackSerialization;
import org.rcsb.mmtf.spark.data.AtomSelectObject;
import org.rcsb.mmtf.spark.data.SegmentDataRDD;
import org.rcsb.mmtf.spark.data.StructureDataRDD;
import org.rcsb.mmtf.spark.mappers.FlatMapIntList;
import org.rcsb.mmtf.spark.mappers.MapToPairs;
import org.rcsb.mmtf.utils.CodecUtils;
import scala.Tuple2;
/**
* A class of Spark utility methods
* @author Anthony Bradley
*
*/
public class SparkUtils {
/** The file path of the Hadoop sequence file to read */
private static String hadoopFilePath = null;
private static SparkConf conf = null;
private static JavaSparkContext javaSparkContext = null;
/** Where to get the data from. */
public static final String URL_LOCATION = "http://mmtf.rcsb.org/v0/hadoopfiles/full.tar";
private static final String hadoopBase = "/hadoop/v0";
private static final String pdbFileName = "full";
private static final String tarFileName = "full.tar";
/**
* Get a {@link JavaPairRDD} mapping {@link String} PDB ids to {@link StructureDataInterface} objects from a file path.
* @param filePath the input path to the Hadoop sequence file
* @return the {@link JavaPairRDD} of {@link String} to {@link StructureDataInterface}
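* <p>
* Example usage (a minimal sketch; the sequence-file path below is hypothetical):
* <pre>{@code
* JavaPairRDD<String, StructureDataInterface> rdd =
*         SparkUtils.getStructureDataRdd("/path/to/hadoop/full");
* System.out.println("Decoded " + rdd.count() + " structures");
* }</pre>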
*/
public static JavaPairRDD<String, StructureDataInterface> getStructureDataRdd(String filePath) {
    return getSparkContext()
            .sequenceFile(filePath, Text.class, BytesWritable.class, 8)
            // Roughly thirty seconds: un-gzip each entry
            .mapToPair(t -> new Tuple2<String, byte[]>(t._1.toString(), ReaderUtils.deflateGzip(t._2.getBytes())))
            // Roughly a minute: deserialize the MessagePack data
            .mapToPair(t -> new Tuple2<String, MmtfStructure>(t._1, new MessagePackSerialization().deserialize(new ByteArrayInputStream(t._2))))
            // Roughly a minute: decode into the StructureDataInterface
            .mapToPair(t -> new Tuple2<String, StructureDataInterface>(t._1, new DefaultDecoder(t._2)));
}
/**
* Get the {@link StructureDataRDD} from a file path.
* @param filePath the input file path
* @return the {@link StructureDataRDD} object
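* <p>
* Example usage (a minimal sketch; the path is hypothetical):
* <pre>{@code
* StructureDataRDD structures = SparkUtils.getStructureObj("/path/to/hadoop/full");
* SegmentDataRDD calphaChains = structures.getCalpha();
* }</pre>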
*/
public static StructureDataRDD getStructureObj(String filePath) {
return new StructureDataRDD(getStructureDataRdd(filePath));
}
/**
* Get the {@link SparkConf} for this run.
* @return the {@link SparkConf} for this run
*/
public static SparkConf getConf() {
if (conf==null){
// The default two-line Spark setup: a local master and an application name
conf = new SparkConf().setMaster("local[*]")
.setAppName(SparkUtils.class.getSimpleName());
}
return conf;
}
/**
* Get the {@link JavaSparkContext} for this run.
* @return the {@link JavaSparkContext} for this run
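* <p>
* The context is created lazily and cached, so repeated calls return the same instance.
* Example usage (a minimal sketch):
* <pre>{@code
* JavaSparkContext sc = SparkUtils.getSparkContext();
* JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3));
* }</pre>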
*/
public static JavaSparkContext getSparkContext(){
if(javaSparkContext==null){
javaSparkContext = new JavaSparkContext(SparkUtils.getConf());
}
return javaSparkContext;
}
/**
* Get the {@link JavaSparkContext} for this run.
* @param conf the {@link SparkConf} to use to setup the context
* @return the {@link JavaSparkContext} for this run
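* <p>
* Note that if a context has already been created, the supplied {@link SparkConf} is
* ignored and the cached context is returned. Example usage (a minimal sketch):
* <pre>{@code
* SparkConf customConf = new SparkConf().setMaster("local[4]").setAppName("MyApp");
* JavaSparkContext sc = SparkUtils.getSparkContext(customConf);
* }</pre>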
*/
public static JavaSparkContext getSparkContext(SparkConf conf){
if(javaSparkContext==null){
javaSparkContext = new JavaSparkContext(conf);
}
return javaSparkContext;
}
/**
* Gracefully shut down the Spark context at the end of a run.
*/
public static void shutdown() {
    // Guard against a shutdown call when no context was ever created
    if (javaSparkContext != null) {
        javaSparkContext.close();
    }
}
/**
* Set the file path of the Hadoop file to read.
* @param filePath the input file path to read
*/
public static void filePath(String filePath) {
hadoopFilePath = filePath;
}
/**
* Get the type of a given chain index.
* @param structureDataInterface the input {@link StructureDataInterface}
* @param chainInd the index of the relevant chain
* @return the {@link String} describing the chain
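* <p>
* Example usage (a minimal sketch; MMTF entity types include values such as
* "polymer", "non-polymer" and "water"):
* <pre>{@code
* String chainType = SparkUtils.getType(structureDataInterface, 0);
* boolean isPolymer = "polymer".equals(chainType);
* }</pre>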
*/
public static String getType(StructureDataInterface structureDataInterface, int chainInd) {
    // Find the entity this chain belongs to and report its type
    for (int i = 0; i < structureDataInterface.getNumEntities(); i++) {
        for (int chainIndex : structureDataInterface.getEntityChainIndexList(i)) {
            if (chainInd == chainIndex) {
                return structureDataInterface.getEntityType(i);
            }
        }
    }
    return null;
}
/**
 * Get the C-alpha chains for a list of PDB ids as a {@link SegmentDataRDD}.
 * Data is retrieved from the REST service.
 * @param inputIds the array of PDB ids to fetch
 * @return the {@link SegmentDataRDD} of the C-alpha chains
 * @throws IOException due to retrieving data from the URL
 */
public static SegmentDataRDD getCalphaChains(String[] inputIds) throws IOException {
    List<Tuple2<String, byte[]>> totalList = new ArrayList<>();
    for (String pdbId : inputIds) {
        totalList.add(new Tuple2<String, byte[]>(pdbId, getDataAsByteArray(pdbId)));
    }
    // Parallelise and decode as an RDD
    StructureDataRDD structureDataRDD = new StructureDataRDD(getSparkContext().parallelizePairs(totalList)
            .mapToPair(t -> new Tuple2<String, byte[]>(t._1, ReaderUtils.deflateGzip(t._2)))
            // Roughly a minute
            .mapToPair(t -> new Tuple2<String, MmtfStructure>(t._1, new MessagePackSerialization().deserialize(new ByteArrayInputStream(t._2))))
            .mapToPair(t -> new Tuple2<String, StructureDataInterface>(t._1, new DefaultDecoder(t._2))));
    return structureDataRDD.getCalpha();
}
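/* Example usage of getCalphaChains (a minimal sketch; the PDB ids are illustrative):
 *
 *     SegmentDataRDD calphaChains = SparkUtils.getCalphaChains(new String[] {"4CUP", "1AQ1"});
 */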
/**
* Helper function to get the data for a PDB id as a gzip-compressed byte array.
* Data is retrieved from the REST service. This should be moved to mmtf for the next release.
* @param pdbCode the input PDB id
* @return the gzip compressed byte array for this structure
* @throws IOException due to retrieving data from the URL
*/
private static byte[] getDataAsByteArray(String pdbCode) throws IOException {
// Stream the data for this PDB id from the web service into a byte array
ByteArrayOutputStream baos = new ByteArrayOutputStream();
InputStream is = null;
URL url = new URL(CodecUtils.BASE_URL + pdbCode);
try {
is = url.openStream();
byte[] byteChunk = new byte[2048]; // Or whatever size you want to read in at a time.
int n;
while ( (n = is.read(byteChunk)) > 0 ) {
baos.write(byteChunk, 0, n);
}
} finally {
if (is != null) { is.close(); }
}
return baos.toByteArray();
}
/**
* Utility function to generate an {@link AtomSelectObject}.
* Mainly for use from the Python API.
* @param atomNameList the list of atom names to consider
* @param groupNameList the list of group names to consider (e.g. LYS)
* @param charged if true, consider only charged atoms
* @param elementNameList the list of elements to consider
* @return an atom select object of the appropriate type.
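* <p>
* Example usage (a minimal sketch; selects charged lysine side-chain nitrogen atoms):
* <pre>{@code
* AtomSelectObject selection = new SparkUtils().generateAtomSelectObject(
*         Arrays.asList("NZ"), Arrays.asList("LYS"), true, Arrays.asList("N"));
* }</pre>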
*/
public AtomSelectObject generateAtomSelectObject(List<String> atomNameList,
        List<String> groupNameList, boolean charged, List<String> elementNameList) {
return new AtomSelectObject()
.atomNameList(atomNameList)
.charged(charged)
.groupNameList(groupNameList)
.elementNameList(elementNameList);
}
/**
* Get a {@link JavaPairRDD} of Integers to do a half matrix comparison, i.e. all comparisons
* where i!=j and i is greater than j.
* @param numMembers the total number of members to compare
* @return the {@link JavaPairRDD} of the comparisons
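* <p>
* Example usage (a minimal sketch): for three members this yields each pair of
* distinct indices among 0, 1 and 2 exactly once.
* <pre>{@code
* JavaPairRDD<Integer, Integer> comparisons = SparkUtils.getComparisonMatrix(3);
* List<Tuple2<Integer, Integer>> pairs = comparisons.collect();
* }</pre>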
*/
public static JavaPairRDD<Integer, Integer> getComparisonMatrix(int numMembers) {
    JavaRDD<Integer> singleInt = getSparkContext().parallelize(Arrays.asList(numMembers));
    JavaRDD<Integer> multipleInts = singleInt.flatMap(new FlatMapIntList());
    JavaPairRDD<Integer, Integer> comparisons = multipleInts.flatMapToPair(new MapToPairs(numMembers));
return comparisons;
}
}