org.rcsb.mmtf.spark.data.StructureDataRDD Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mmtf-spark Show documentation
MMTF Spark is a series of libraries and functions for using MMTF with Spark.
There is a newer version: 0.0.8
package org.rcsb.mmtf.spark.data;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;

import javax.vecmath.Point3d;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.partial.BoundedDouble;
import org.apache.spark.partial.PartialResult;
import org.rcsb.mmtf.api.StructureDataInterface;
import org.rcsb.mmtf.encoder.DefaultEncoder;
import org.rcsb.mmtf.serialization.MessagePackSerialization;
import org.rcsb.mmtf.spark.SparkUtils;
import org.rcsb.mmtf.spark.mappers.GenerateSegments;

import scala.Tuple2;

/**
 * A class to provide functions on a series of 
 * {@link StructureDataInterface} objects.
 * @author Anthony Bradley
 *
 */
public class StructureDataRDD {

	/** The RDD of the {@link StructureDataInterface} data. */
	private JavaPairRDD javaPairRdd;


	/**
	 * Empty constructor reads the sample data if {@link SparkUtils} has not been set 
	 * with a path.
	 */
	public StructureDataRDD() {
		setupRdd();
	}
	
	/**
	 * A constructor to download the PDB on construction.
	 * @param download whether to update or not
	 * @throws IOException due to retrieving data from the URL
	 * @throws FileNotFoundException whilst transferring data
	 */
	public StructureDataRDD(boolean download) throws FileNotFoundException, IOException {
		SparkUtils.downloadPdb();
		setupRdd();
	}
	
	/**
	 * Function to setup an RDD of data.
	 */
	private void setupRdd() {
		String filePath = SparkUtils.getFilePath();
		if(filePath==null){
			// First try the full
			System.out.println(SparkUtils.getFullPdbFile());
			if(SparkUtils.getFullPdbFile()!=null && new File(SparkUtils.getFullPdbFile()).exists()) {
				javaPairRdd = SparkUtils.getStructureDataRdd(SparkUtils.getFullPdbFile());
				System.out.println("Using full PDB data.");
			}
			else{
				URL inputPath = SparkUtils.class.getClassLoader().getResource("hadoop/subset");
				// Set the config for the spark context
				javaPairRdd = SparkUtils.getStructureDataRdd(inputPath.toString());
				System.out.println("Full data not available");
				System.out.println("Using small 1% subset");
			}
		}
		else{
			javaPairRdd = SparkUtils.getStructureDataRdd(SparkUtils.getFilePath());
		}		
	}

	/**
	 * Constructor from a file. 
	 * @param inputPath the input path of the Hadoop sequence file to read
	 */
	public StructureDataRDD(String inputPath) {
		// Set the config for the spark context
		javaPairRdd = SparkUtils.getStructureDataRdd(inputPath);
	}

	/**
	 * Constructor from a {@link JavaPairRDD} of {@link String} and {@link StructureDataInterface}.
	 * @param javaPairRDD the input {@link JavaPairRDD} of 
	 * {@link String} {@link StructureDataInterface}
	 */
	public StructureDataRDD(JavaPairRDD javaPairRDD) {
		// Set the config for the spark context
		this.javaPairRdd = javaPairRDD;
	}

	/**
	 * Get the {@link JavaPairRDD} of {@link String} {@link StructureDataInterface}
	 * for this instance
	 * @return the {@link JavaPairRDD} of {@link String} {@link StructureDataInterface}
	 */
	public JavaPairRDD getJavaRdd() {
		return javaPairRdd;
	}

	/**
	 * Filter the {@link StructureDataRDD} based on R-free.
	 * @param maxRFree the maximum allowed R-free
	 * @return the filtered {@link StructureDataRDD}
	 */
	public StructureDataRDD filterRfree(double maxRFree) {
		return new StructureDataRDD(javaPairRdd.filter(t -> t._2.getRfree() t._2.getResolution() result = javaPairRdd
				.countApprox(1000);
		return (long) Integer.parseInt(
				Double.toString(result.getFinalValue().mean()));
	}


	/**
	 * Save the data as a Hadoop sequence file.
	 * @param filePath the path to save the data to
	 */
	public void saveToFile(String filePath) {
		javaPairRdd
		.mapToPair( t -> {
			ByteArrayOutputStream bos = new ByteArrayOutputStream();
			new  MessagePackSerialization().serialize(new DefaultEncoder(t._2).getMmtfEncodedStructure(), bos);
			return new Tuple2(t._1, SparkUtils.gzipCompress(
					bos.toByteArray()));
		})
		.mapToPair(t -> new Tuple2(new Text(t._1), new BytesWritable(t._2)))
		.saveAsHadoopFile(filePath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);
	}


	/**
	 * Allow the user to sample the data.
	 * @param fraction the fraction of data 
	 * to be used (e.g. 0.1 retains 10%)
	 * @return the {@link SegmentDataRDD} updated
	 */
	public StructureDataRDD sample(double fraction) {
		return new StructureDataRDD(javaPairRdd.sample(false, fraction));
	}


}