All Downloads are FREE. Search and download functionalities are using the official Maven repository.

templates.data-delivery-pyspark.persist.py.vm Maven / Gradle / Ivy

The newest version!

   #set ($saveParam = "table_name: str")
   #if ($step.isPersistTypeElasticsearch())
     #set ($saveParam = "es_resource: str")
   #elseif ($step.isPersistTypeNeo4j())
     #set ($saveParam = "labels: list")
   #end
  

   #if ($step.isPersistTypeRdbms())
    def get_data_source_configs(self) -> dict:
        """
        Builds the JDBC settings used when writing to the RDBMS.

        :return: a dict holding the JDBC "url" and a "properties" dict with the
            "user", "password" and "driver" values read from spark_rdbms_config
        """
        return {
            "url": self.spark_rdbms_config.jdbc_url(),
            "properties": {
                "user": self.spark_rdbms_config.user(),
                "password": self.spark_rdbms_config.password(),
                "driver": self.spark_rdbms_config.jdbc_driver()
            },
        }

   #end
   #if ($step.persist.shortCollectionType)
    def save_dataset(self, dataset: ${step.persist.shortCollectionType}[${step.persist.shortRecordType}], ${saveParam}) -> None:
   #else
    def save_dataset(self, dataset: ${step.persist.shortRecordType}, ${saveParam}) -> None:
   #end
        """
        Saves a dataset.
        """
      #if ($step.isPersistTypeHive())
        ${step.capitalizedName}Base.logger.info('Saving %s To Hive...' % self.descriptive_label)

        dataset.write.format('hive').mode('${step.persist.mode}').saveAsTable(table_name)

        ${step.capitalizedName}Base.logger.info('Saved %s to Hive' % self.descriptive_label)
      #elseif ($step.isPersistTypeDeltaLake())
        ${step.capitalizedName}Base.logger.info('Saving %s To Delta Lake...' % self.descriptive_label)

        path = self.delta_output_directory + table_name
        dataset.write.format('delta').mode('${step.persist.mode}').save(path)
        # PySpark 3.0 does not have a tableExists method
        if table_name not in [t.name for t in self.spark.catalog.listTables()]:
            self.spark.catalog.createTable(table_name, path, "delta")

        ${step.capitalizedName}Base.logger.info('Saved %s to Delta Lake' % self.descriptive_label)

      #elseif ($step.isPersistTypeElasticsearch())
        ${step.capitalizedName}Base.logger.info('Saving %s To Elasticsearch...' % self.descriptive_label)

        options = self.spark_elasticsearch_config.get_es_configs()

        dataset.write.format('es').options(**options).mode('${step.persist.mode}').save(es_resource)

        ${step.capitalizedName}Base.logger.info('Saved %s to Elasticsearch' % self.descriptive_label)
      #elseif ($step.isPersistTypeNeo4j())
        ${step.capitalizedName}Base.logger.info('Saving %s To Neo4j...' % self.descriptive_label)

        options = self.spark_neo4j_config.get_spark_options()

        dataset.write \
            .format(SparkNeo4jConfig.NEO4J_FORMAT) \
            .options(**options) \
            .option(SparkNeo4jConfig.LABELS_OPTION, ':'.join(labels)) \
            .mode('${step.persist.mode}') \
            .save()

        ${step.capitalizedName}Base.logger.info('Saved %s to Neo4j' % self.descriptive_label)
      #elseif ($step.isPersistTypeRdbms())
        ${step.capitalizedName}Base.logger.info('Saving %s To RDBMS...' % self.descriptive_label)
        mode = "${step.persist.mode}"
        config = self.get_data_source_configs()
        dataset.write.jdbc(
            url=config.get("url"), table=table_name, mode=mode, properties=config.get("properties")
        )
        ${step.capitalizedName}Base.logger.info('Saved %s to RDBMS' % self.descriptive_label)
      #else
        ${step.capitalizedName}Base.logger.warning('Persist type for test "${step.persist.type}" is not yet supported by the aiSSEMBLE Solution Baseline!')
        ${step.capitalizedName}Base.logger.warning('Please encode persistence logic manually by overriding save_dataset(..)')
      #end




© 2015 - 2025 Weber Informatics LLC | Privacy Policy