org.hpccsystems.spark.HpccFile Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-hpcc Show documentation
Show all versions of spark-hpcc Show documentation
Spark connector for reading files residing in an HPCC cluster environment
/*******************************************************************************
* HPCC SYSTEMS software Copyright (C) 2018 HPCC Systems®.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package org.hpccsystems.spark;
import java.io.Serializable;
import java.net.MalformedURLException;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.execution.python.EvaluatePython;
import org.hpccsystems.dfs.cluster.RemapInfo;
import org.hpccsystems.dfs.client.DataPartition;
import org.hpccsystems.commons.ecl.FieldDef;
import org.hpccsystems.commons.errors.HpccFileException;
import org.hpccsystems.ws.client.utils.Connection;
/**
* Access to file content on a collection of one or more HPCC
* clusters.
*
*/
public class HpccFile extends org.hpccsystems.dfs.client.HPCCFile implements Serializable {
static private final long serialVersionUID = 1L;
private int recordLimit = -1;
// Make sure Python picklers have been registered
static { EvaluatePython.registerPicklers(); }
/**
* Constructor for the HpccFile.
* Captures HPCC logical file information from the DALI Server
* for the clusters behind the ESP named by the Connection.
*
* @param fileName The HPCC file name
* @param espconninfo The ESP connection info (protocol,address,port,user,pass)
* @throws HpccFileException
*/
public HpccFile(String fileName, Connection espconninfo) throws HpccFileException
{
super(fileName,espconninfo);
}
/**
* Constructor for the HpccFile.
* Captures HPCC logical file information from the DALI Server
* for the clusters behind the ESP named by the Connection.
*
* @param fileName The HPCC file name
* @param connectionString to eclwatch. Format: {http|https}://{HOST}:{PORT}.
* @throws HpccFileException
*/
public HpccFile(String fileName, String connectionString, String user, String pass) throws MalformedURLException, HpccFileException
{
super(fileName,connectionString,user,pass);
}
/**
* Constructor for the HpccFile.
* Captures HPCC logical file information from the DALI Server for the
* clusters behind the ESP named by the IP address and re-maps
* the address information for the THOR nodes to visible addresses
* when the THOR clusters are virtual.
* @param fileName The HPCC file name
* @param targetColumnList a comma separated list of column names in dotted
* notation for columns within compound columns.
* @param filter a file filter to select records of interest
* @param remap_info address and port re-mapping info for THOR cluster
* @param maxParts optional the maximum number of partitions or zero for no max
* @param targetfilecluster optional - the hpcc cluster the target file resides in
* @throws HpccFileException
*/
public HpccFile(String fileName, Connection espconninfo, String targetColumnList, String filter, RemapInfo remap_info, int maxParts, String targetfilecluster) throws HpccFileException
{
super(fileName,espconninfo,targetColumnList,filter,remap_info,maxParts,targetfilecluster);
}
/**
* Set file part record limit
* @param limit
*/
public void setFilePartRecordLimit(int limit)
{
this.recordLimit = limit;
}
/**
* Returns the current file part record limit
* @return
*/
public int getFilePartRecordLimit()
{
return this.recordLimit;
}
/**
* Make a Spark Resilient Distributed Dataset (RDD) that provides access
* to THOR based datasets. Uses existing SparkContext, allows this function
* to be used from PySpark.
* @return An RDD of THOR data.
* @throws HpccFileException When there are errors reaching the THOR data
*/
public HpccRDD getRDD() throws HpccFileException {
return getRDD(SparkContext.getOrCreate());
}
/**
* Make a Spark Resilient Distributed Dataset (RDD) that provides access
* to THOR based datasets.
* @param sc Spark Context
* @return An RDD of THOR data.
* @throws HpccFileException When there are errors reaching the THOR data
*/
public HpccRDD getRDD(SparkContext sc) throws HpccFileException {
return new HpccRDD(sc, getFileParts(), this.getRecordDefinition(), this.getProjectedRecordDefinition(), this.getFileAccessExpirySecs(), this.recordLimit);
}
/**
* Make a Spark Dataframe (Dataset) of THOR data available.
* @param session the Spark Session object
* @return a Dataframe of THOR data
* @throws HpccFileException when htere are errors reaching the THOR data.
*/
public Dataset getDataframe(SparkSession session) throws HpccFileException{
FieldDef originalRD = this.getRecordDefinition();
FieldDef projectedRD = this.getProjectedRecordDefinition();
DataPartition[] fp = this.getFileParts();
JavaRDD rdd = (new HpccRDD(session.sparkContext(), fp, originalRD, projectedRD, this.getFileAccessExpirySecs(), this.recordLimit)).toJavaRDD();
StructType schema = null;
try {
schema = SparkSchemaTranslator.toSparkSchema(projectedRD);
} catch(Exception e) {
throw new HpccFileException(e.getMessage());
}
return session.createDataFrame(rdd, schema);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy