package au.csiro.pathling.io;

import au.csiro.pathling.encoders.EncoderBuilder;
import io.delta.tables.DeltaMergeBuilder;
import io.delta.tables.DeltaTable;
import jakarta.annotation.Nonnull;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalog.Catalog;
import org.apache.spark.sql.catalog.Table;
import org.hl7.fhir.r4.model.Enumerations.ResourceType;
import scala.collection.JavaConverters;

/**
 * A persistence scheme that stores Delta tables using the Spark catalog.
 *
 * @author John Grimes
 */
public class CatalogPersistence implements PersistenceScheme {

  private static final Map<String, ResourceType> LOWER_CASE_RESOURCE_NAMES;

  static {
    // Create a map between a case-insensitive representation of the resource type name and the
    // resource type itself, excluding any resource types that are not supported by the encoders.
    final Set<String> unsupported = JavaConverters.setAsJavaSet(
        EncoderBuilder.UNSUPPORTED_RESOURCES());
    LOWER_CASE_RESOURCE_NAMES = Arrays.stream(ResourceType.values())
        .filter(resourceType -> !unsupported.contains(resourceType.name()))
        .filter(resourceType -> !resourceType.equals(ResourceType.NULL))
        .collect(Collectors.toMap(
            resourceType -> resourceType.name().toLowerCase(),
            resourceType -> resourceType
        ));
  }
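
  // For example, the map associates "patient" with ResourceType.PATIENT, so that catalog table
  // names can be matched to resource types regardless of case.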

  @Nonnull
  private final SparkSession spark;

  @Nonnull
  private final Optional<String> schema;

  /**
   * @param spark the Spark session used to access the catalog
   * @param schema the schema (database) used to qualify table names, if one is provided
   */
  public CatalogPersistence(@Nonnull final SparkSession spark,
      @Nonnull final Optional<String> schema) {
    this.spark = spark;
    this.schema = schema;
  }

  @Nonnull
  @Override
  public DeltaTable read(@Nonnull final ResourceType resourceType) {
    return DeltaTable.forName(spark, getTableName(resourceType));
  }

  @Override
  public void write(@Nonnull final ResourceType resourceType,
      @Nonnull final DataFrameWriter<Row> writer) {
    writer.saveAsTable(getTableName(resourceType));
  }

  @Override
  public void merge(@Nonnull final ResourceType resourceType,
      @Nonnull final DeltaMergeBuilder merge) {
    // The merge builder is already bound to its target table, so it can simply be executed.
    merge.execute();
  }

  @Override
  public boolean exists(@Nonnull final ResourceType resourceType) {
    try {
      // DeltaTable.forName throws if the table cannot be resolved within the catalog, so a
      // successful load is taken as evidence that the table exists.
      DeltaTable.forName(spark, getTableName(resourceType));
      return true;
    } catch (final Exception e) {
      return false;
    }
  }

  @Override
  public void invalidate(@Nonnull final ResourceType resourceType) {
    // Do nothing: tables are resolved by name from the catalog on each access, so there is no
    // cached state to invalidate.
  }

  @Nonnull
  @Override
  public Set<ResourceType> list() {
    // Get the list of tables from the catalog, and then filter it to only include those that
    // correspond to a known resource type.
    return getTables().stream()
        .map(Table::name)
        .map(String::toLowerCase)
        .map(LOWER_CASE_RESOURCE_NAMES::get)
        .filter(Objects::nonNull)
        .collect(Collectors.toSet());
  }

  /**
   * @param resourceType the resource type to get the table name for
   * @return the name of the table for the given resource type, qualified by the specified schema
   * if one is provided
   */
  @Nonnull
  private String getTableName(@Nonnull final ResourceType resourceType) {
    return schema.map(s -> String.join(".", s, resourceType.toCode()))
        .orElseGet(resourceType::toCode);
  }
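
  // Illustrative note: toCode() yields the FHIR resource name (for example, "Patient"), so with a
  // schema of "fhir" the method above would return "fhir.Patient". The catalog typically stores
  // these names in lower case, which is why list() matches on the lower-cased form.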

  /**
   * @return the list of tables from the catalog, using the specified schema if one is provided
   */
  private List<Table> getTables() {
    final Catalog catalog = spark.catalog();
    final Dataset<Table> tablesDataset;
    // If a schema is provided, use it to list the tables. Otherwise, list tables from the
    // currently selected schema.
    tablesDataset = schema.map(dbName -> {
          try {
            return catalog.listTables(dbName);
          } catch (final AnalysisException e) {
            throw new RuntimeException("Specified schema was not found", e);
          }
        })
        .orElseGet(catalog::listTables);
    return tablesDataset.collectAsList();
  }
}
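
// Usage sketch (not part of the original class): a minimal illustration of how CatalogPersistence
// might be exercised against a local session. The session configuration, the "fhir" schema name
// and this example class are assumptions made for illustration; the Delta extension settings
// shown are the standard ones documented by Delta Lake.
class CatalogPersistenceExample {

  public static void main(final String[] args) {
    // A local session with the Delta Lake extensions enabled, so that DeltaTable.forName can
    // resolve tables registered in the catalog.
    final SparkSession spark = SparkSession.builder()
        .appName("catalog-persistence-example")
        .master("local[*]")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .getOrCreate();

    // Ensure the hypothetical "fhir" schema exists before listing tables within it.
    spark.sql("CREATE DATABASE IF NOT EXISTS fhir");

    // Qualify all table names with the "fhir" schema.
    final CatalogPersistence persistence = new CatalogPersistence(spark, Optional.of("fhir"));

    // Enumerate the resource types that already have tables in the catalog.
    final Set<ResourceType> available = persistence.list();
    System.out.println("Available resource types: " + available);

    // Read the Patient table as a Delta table, if one exists.
    if (persistence.exists(ResourceType.PATIENT)) {
      final DeltaTable patients = persistence.read(ResourceType.PATIENT);
      patients.toDF().show();
    }
  }
}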