
/*
 * MMTF Spark is a series of libraries and functions for using MMTF with Spark.
 */
package org.rcsb.mmtf.spark;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.rcsb.mmtf.api.StructureDataInterface;
import org.rcsb.mmtf.dataholders.MmtfStructure;
import org.rcsb.mmtf.decoder.DefaultDecoder;
import org.rcsb.mmtf.decoder.ReaderUtils;
import org.rcsb.mmtf.serialization.MessagePackSerialization;
import org.rcsb.mmtf.spark.data.AtomSelectObject;
import org.rcsb.mmtf.spark.data.SegmentDataRDD;
import org.rcsb.mmtf.spark.data.StructureDataRDD;
import org.rcsb.mmtf.spark.mappers.FlatMapIntList;
import org.rcsb.mmtf.spark.mappers.MapToPairs;
import org.rcsb.mmtf.utils.CodecUtils;
import scala.Tuple2;
/**
* A class of Spark utility methods
* @author Anthony Bradley
*
*/
public class SparkUtils {
/** The file path of the Hadoop sequence file to read */
private static String hadoopFilePath = null;
private static SparkConf conf = null;
private static JavaSparkContext javaSparkContext = null;
/** Where to get the data from. */
public static final String URL_LOCATION = "http://mmtf.rcsb.org/v0/hadoopfiles/full.tar";
private static final String hadoopBase = "/hadoop/v0";
private static final String pdbFileName = "full";
private static final String tarFileName = "full.tar";
/**
* Get a {@link JavaPairRDD} mapping {@link String} PDB ids to {@link StructureDataInterface} objects from a file path.
* @param filePath the input path to the Hadoop sequence file
* @return the {@link JavaPairRDD} of {@link String} to {@link StructureDataInterface}
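* <p>
* Example usage (a minimal sketch; the sequence-file path below is hypothetical):
* <pre>{@code
* JavaPairRDD<String, StructureDataInterface> rdd =
*         SparkUtils.getStructureDataRdd("/path/to/hadoop/full");
* System.out.println("Decoded " + rdd.count() + " structures");
* }</pre>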
*/
public static JavaPairRDD<String, StructureDataInterface> getStructureDataRdd(String filePath) {
    return getSparkContext()
            .sequenceFile(filePath, Text.class, BytesWritable.class, 8)
            // Roughly thirty seconds: un-gzip each entry
            .mapToPair(t -> new Tuple2<String, byte[]>(t._1.toString(), ReaderUtils.deflateGzip(t._2.getBytes())))
            // Roughly a minute: deserialize the MessagePack data
            .mapToPair(t -> new Tuple2<String, MmtfStructure>(t._1, new MessagePackSerialization().deserialize(new ByteArrayInputStream(t._2))))
            // Roughly a minute: decode into the StructureDataInterface
            .mapToPair(t -> new Tuple2<String, StructureDataInterface>(t._1, new DefaultDecoder(t._2)));
}
/**
* Get the {@link StructureDataRDD} from a file path.
* @param filePath the input file path
* @return the {@link StructureDataRDD} object
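* <p>
* Example usage (a minimal sketch; the path is hypothetical):
* <pre>{@code
* StructureDataRDD structures = SparkUtils.getStructureObj("/path/to/hadoop/full");
* SegmentDataRDD calphaChains = structures.getCalpha();
* }</pre>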
*/
public static StructureDataRDD getStructureObj(String filePath) {
return new StructureDataRDD(getStructureDataRdd(filePath));
}
/**
* Get the {@link SparkConf} for this run.
* @return the {@link SparkConf} for this run
*/
public static SparkConf getConf() {
if (conf==null){
// The default two-line Spark setup: a local master and an application name
conf = new SparkConf().setMaster("local[*]")
.setAppName(SparkUtils.class.getSimpleName());
}
return conf;
}
/**
* Get the {@link JavaSparkContext} for this run.
* @return the {@link JavaSparkContext} for this run
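* <p>
* The context is created lazily and cached, so repeated calls return the same instance.
* Example usage (a minimal sketch):
* <pre>{@code
* JavaSparkContext sc = SparkUtils.getSparkContext();
* JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3));
* }</pre>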
*/
public static JavaSparkContext getSparkContext(){
if(javaSparkContext==null){
javaSparkContext = new JavaSparkContext(SparkUtils.getConf());
}
return javaSparkContext;
}
/**
* Get the {@link JavaSparkContext} for this run.
* @param conf the {@link SparkConf} to use to setup the context
* @return the {@link JavaSparkContext} for this run
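* <p>
* Note that if a context has already been created, the supplied {@link SparkConf} is
* ignored and the cached context is returned. Example usage (a minimal sketch):
* <pre>{@code
* SparkConf customConf = new SparkConf().setMaster("local[4]").setAppName("MyApp");
* JavaSparkContext sc = SparkUtils.getSparkContext(customConf);
* }</pre>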
*/
public static JavaSparkContext getSparkContext(SparkConf conf){
if(javaSparkContext==null){
javaSparkContext = new JavaSparkContext(conf);
}
return javaSparkContext;
}
/**
* Gracefully shut down the Spark context at the end of a run.
*/
public static void shutdown() {
    // Guard against a shutdown call when no context was ever created
    if (javaSparkContext != null) {
        javaSparkContext.close();
    }
}
/**
* Set the file path of the Hadoop file to read.
* @param filePath the input file path to read
*/
public static void filePath(String filePath) {
hadoopFilePath = filePath;
}
/**
* Get the type of a given chain index.
* @param structureDataInterface the input {@link StructureDataInterface}
* @param chainInd the index of the relevant chain
* @return the {@link String} describing the chain
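* <p>
* Example usage (a minimal sketch; MMTF entity types include values such as
* "polymer", "non-polymer" and "water"):
* <pre>{@code
* String chainType = SparkUtils.getType(structureDataInterface, 0);
* boolean isPolymer = "polymer".equals(chainType);
* }</pre>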
*/
public static String getType(StructureDataInterface structureDataInterface, int chainInd) {
    // Find the entity this chain belongs to and report its type
    for (int i = 0; i < structureDataInterface.getNumEntities(); i++) {
        for (int chainIndex : structureDataInterface.getEntityChainIndexList(i)) {
            if (chainInd == chainIndex) {
                return structureDataInterface.getEntityType(i);
            }
        }
    }
    return null;
}
/**
 * Get the C-alpha chains for a list of PDB ids as a {@link SegmentDataRDD}.
 * Data is retrieved from the REST service.
 * @param inputIds the array of PDB ids to fetch
 * @return the {@link SegmentDataRDD} of the C-alpha chains
 * @throws IOException due to retrieving data from the URL
 */
public static SegmentDataRDD getCalphaChains(String[] inputIds) throws IOException {
    List<Tuple2<String, byte[]>> totalList = new ArrayList<>();
    for (String pdbId : inputIds) {
        totalList.add(new Tuple2<String, byte[]>(pdbId, getDataAsByteArray(pdbId)));
    }
    // Parallelise and decode as an RDD
    StructureDataRDD structureDataRDD = new StructureDataRDD(getSparkContext().parallelizePairs(totalList)
            .mapToPair(t -> new Tuple2<String, byte[]>(t._1, ReaderUtils.deflateGzip(t._2)))
            // Roughly a minute
            .mapToPair(t -> new Tuple2<String, MmtfStructure>(t._1, new MessagePackSerialization().deserialize(new ByteArrayInputStream(t._2))))
            .mapToPair(t -> new Tuple2<String, StructureDataInterface>(t._1, new DefaultDecoder(t._2))));
    return structureDataRDD.getCalpha();
}
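/* Example usage of getCalphaChains (a minimal sketch; the PDB ids are illustrative):
 *
 *     SegmentDataRDD calphaChains = SparkUtils.getCalphaChains(new String[] {"4CUP", "1AQ1"});
 */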
/**
* Helper function to get the data for a PDB id as a gzip-compressed byte array.
* Data is retrieved from the REST service. This should be moved to mmtf for the next release.
* @param pdbCode the input PDB id
* @return the gzip compressed byte array for this structure
* @throws IOException due to retrieving data from the URL
*/
private static byte[] getDataAsByteArray(String pdbCode) throws IOException {
// Stream the data for this PDB id from the web service into a byte array
ByteArrayOutputStream baos = new ByteArrayOutputStream();
InputStream is = null;
URL url = new URL(CodecUtils.BASE_URL + pdbCode);
try {
is = url.openStream();
byte[] byteChunk = new byte[2048]; // Or whatever size you want to read in at a time.
int n;
while ( (n = is.read(byteChunk)) > 0 ) {
baos.write(byteChunk, 0, n);
}
} finally {
if (is != null) { is.close(); }
}
return baos.toByteArray();
}
/**
* Utility function to generate an {@link AtomSelectObject}.
* Mainly for use from the Python API.
* @param atomNameList the list of atom names to consider
* @param groupNameList the list of group names to consider (e.g. LYS)
* @param charged if true, consider only charged atoms
* @param elementNameList the list of elements to consider
* @return an atom select object of the appropriate type.
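* <p>
* Example usage (a minimal sketch; selects charged lysine side-chain nitrogen atoms):
* <pre>{@code
* AtomSelectObject selection = new SparkUtils().generateAtomSelectObject(
*         Arrays.asList("NZ"), Arrays.asList("LYS"), true, Arrays.asList("N"));
* }</pre>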
*/
public AtomSelectObject generateAtomSelectObject(List<String> atomNameList,
        List<String> groupNameList, boolean charged, List<String> elementNameList) {
return new AtomSelectObject()
.atomNameList(atomNameList)
.charged(charged)
.groupNameList(groupNameList)
.elementNameList(elementNameList);
}
/**
* Get a {@link JavaPairRDD} of Integers to do a half matrix comparison, i.e. all comparisons
* where i!=j and i is greater than j.
* @param numMembers the total number of members to compare
* @return the {@link JavaPairRDD} of the comparisons
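* <p>
* Example usage (a minimal sketch): for three members this yields each pair of
* distinct indices among 0, 1 and 2 exactly once.
* <pre>{@code
* JavaPairRDD<Integer, Integer> comparisons = SparkUtils.getComparisonMatrix(3);
* List<Tuple2<Integer, Integer>> pairs = comparisons.collect();
* }</pre>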
*/
public static JavaPairRDD<Integer, Integer> getComparisonMatrix(int numMembers) {
    JavaRDD<Integer> singleInt = getSparkContext().parallelize(Arrays.asList(numMembers));
    JavaRDD<Integer> multipleInts = singleInt.flatMap(new FlatMapIntList());
    JavaPairRDD<Integer, Integer> comparisons = multipleInts.flatMapToPair(new MapToPairs(numMembers));
return comparisons;
}
}