package au.csiro.pathling.io;
import static java.util.Objects.requireNonNull;
import io.delta.tables.DeltaMergeBuilder;
import io.delta.tables.DeltaTable;
import jakarta.annotation.Nonnull;
import jakarta.annotation.Nullable;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.hl7.fhir.exceptions.FHIRException;
import org.hl7.fhir.r4.model.Enumerations.ResourceType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A persistence scheme that stores Delta tables at a nominated file system location.
*
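* <p>
* A minimal usage sketch (assumes an active {@link SparkSession} and an existing warehouse; the
* location below is illustrative):
* </p>
* <pre>{@code
* final FileSystemPersistence persistence =
*     new FileSystemPersistence(spark, "s3://somebucket/warehouse");
* if (persistence.exists(ResourceType.PATIENT)) {
*   persistence.read(ResourceType.PATIENT).toDF().show();
* }
* }</pre>
*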
* @author John Grimes
*/
public class FileSystemPersistence implements PersistenceScheme {
private static final Logger log = LoggerFactory.getLogger(FileSystemPersistence.class);
@Nonnull
protected final SparkSession spark;
@Nonnull
protected final String path;
public FileSystemPersistence(@Nonnull final SparkSession spark, @Nonnull final String path) {
this.spark = spark;
this.path = convertS3ToS3aUrl(path);
}
@Nonnull
@Override
public DeltaTable read(@Nonnull final ResourceType resourceType) {
return DeltaTable.forPath(spark, getTableUrl(path, resourceType));
}
@Override
public void write(@Nonnull final ResourceType resourceType,
@Nonnull final DataFrameWriter<Row> writer) {
writer.save(getTableUrl(path, resourceType));
}
@Override
public void merge(@Nonnull final ResourceType resourceType,
@Nonnull final DeltaMergeBuilder merge) {
// The builder already identifies the target table, so the resource type is not needed here.
merge.execute();
}
@Override
public boolean exists(@Nonnull final ResourceType resourceType) {
return DeltaTable.isDeltaTable(spark, getTableUrl(path, resourceType));
}
@Override
public void invalidate(@Nonnull final ResourceType resourceType) {
// No-op: tables are read directly from the file system, so there is no cached state to
// invalidate.
}
@Nonnull
@Override
public Set<ResourceType> list() {
try {
final Stream<FileStatus> files = Stream.of(
getFileSystem(spark, path).listStatus(new Path(path)));
return files
.map(FileStatus::getPath)
.map(Path::getName)
.map(fileName -> fileName.replace(".parquet", ""))
.map(FileSystemPersistence::resourceTypeFromCode)
.filter(Optional::isPresent)
.map(Optional::get)
.collect(Collectors.toSet());
} catch (final IOException e) {
throw new RuntimeException("Problem listing resources", e);
}
}
@Nonnull
private static Optional<ResourceType> resourceTypeFromCode(@Nullable final String code) {
try {
return Optional.ofNullable(ResourceType.fromCode(code));
} catch (final FHIRException e) {
return Optional.empty();
}
}
/**
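* <p>
* For example (illustrative values), {@code getTableUrl("s3a://somebucket/warehouse",
* ResourceType.PATIENT)} yields {@code "s3a://somebucket/warehouse/Patient.parquet"}.
* </p>
*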
* @param path the URL of the warehouse location
* @param resourceType the resource type to be read or written to
* @return the URL of the resource within the warehouse
*/
@Nonnull
protected static String getTableUrl(@Nonnull final String path,
@Nonnull final ResourceType resourceType) {
return safelyJoinPaths(path, fileNameForResource(resourceType));
}
/**
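* <p>
* For example, {@code fileNameForResource(ResourceType.PATIENT)} returns
* {@code "Patient.parquet"}.
* </p>
*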
* @param resourceType A HAPI {@link ResourceType} describing the type of resource
* @return The filename that should be used
*/
@Nonnull
private static String fileNameForResource(@Nonnull final ResourceType resourceType) {
return resourceType.toCode() + ".parquet";
}
/**
* Joins two path components, catering for URLs, Unix-style paths and Windows-style paths. If the
* first component is a valid URI, the second is appended after any trailing slash is removed;
* otherwise, the components are joined using the platform file system.
*
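* <p>
* For example (illustrative values):
* </p>
* <pre>{@code
* safelyJoinPaths("s3a://somebucket/warehouse/", "Patient.parquet");
* // => "s3a://somebucket/warehouse/Patient.parquet" (trailing slash stripped before joining)
* safelyJoinPaths("C:\\data\\warehouse", "Patient.parquet");
* // => falls back to java.nio.file.Path, as backslashes are not valid within a URI
* }</pre>
*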
* @param first the first path
* @param second the second path
* @return the joined path
*/
@Nonnull
public static String safelyJoinPaths(@Nonnull final String first, @Nonnull final String second) {
try {
final URI uri = URI.create(first);
return uri.toString().replaceFirst("/$", "") + "/" + second;
} catch (final IllegalArgumentException e) {
return java.nio.file.Path.of(first, second).toString();
}
}
/**
* Get a Hadoop {@link FileSystem} for the given location.
*
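* <p>
* For example (illustrative location):
* </p>
* <pre>{@code
* final FileSystem fileSystem = getFileSystem(spark, "s3a://somebucket/warehouse");
* // The file system can then be used to list, rename or delete files at that location.
* }</pre>
*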
* @param spark the Spark session
* @param location the location URL to be accessed
* @return the {@link FileSystem} for the given location
*/
@Nonnull
public static FileSystem getFileSystem(@Nonnull final SparkSession spark,
@Nonnull final String location) {
@Nullable final Configuration hadoopConfiguration = spark.sparkContext()
.hadoopConfiguration();
requireNonNull(hadoopConfiguration);
@Nullable final FileSystem warehouseLocation;
try {
warehouseLocation = FileSystem.get(new URI(location), hadoopConfiguration);
} catch (final IOException e) {
throw new RuntimeException("Problem accessing location: " + location, e);
} catch (final URISyntaxException e) {
throw new RuntimeException("Problem parsing URL: " + location, e);
}
requireNonNull(warehouseLocation);
return warehouseLocation;
}
/**
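* <p>
* For example, {@code convertS3ToS3aUrl("s3://somebucket/warehouse")} returns
* {@code "s3a://somebucket/warehouse"}.
* </p>
*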
* @param s3Url The S3 URL that should be converted
* @return an equivalent S3A URL
*/
@Nonnull
public static String convertS3ToS3aUrl(@Nonnull final String s3Url) {
return s3Url.replaceFirst("s3:", "s3a:");
}
/**
* Convert a directory containing a single file partition into a single file.
*
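* <p>
* A sketch of intended use (illustrative URLs; assumes the result was written as a single
* partition with the given extension):
* </p>
* <pre>{@code
* final String result = departitionResult(spark,
*     "s3a://somebucket/results/job-1", "s3a://somebucket/results/job-1.csv", "csv");
* }</pre>
*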
* @param spark the Spark session
* @param partitionedLocation the location URL containing the partitioned file
* @param departitionedLocation the desired URL of the resulting file
* @param extension the file extension used within the partitioned directory
* @return the location of the resulting file
*/
@Nonnull
public static String departitionResult(@Nonnull final SparkSession spark,
@Nonnull final String partitionedLocation,
@Nonnull final String departitionedLocation, @Nonnull final String extension) {
return departitionResult(getFileSystem(spark, partitionedLocation), partitionedLocation,
departitionedLocation, extension);
}
/**
* Convert a directory containing a single file partition into a single file.
*
* @param partitionedLocation a Hadoop {@link FileSystem} that contains both the partitioned and
* departitioned locations
* @param partitionedUrl the URL of the partitioned file
* @param departitionedUrl the desired URL of the resulting file
* @param extension the file extension used within the partitioned directory
* @return the location of the resulting file
*/
@Nonnull
public static String departitionResult(@Nonnull final FileSystem partitionedLocation,
@Nonnull final String partitionedUrl, @Nonnull final String departitionedUrl,
@Nonnull final String extension) {
try {
final Path partitionedPath = new Path(partitionedUrl);
final FileStatus[] partitionFiles = partitionedLocation.listStatus(partitionedPath);
final String targetFile = Arrays.stream(partitionFiles)
.map(f -> f.getPath().toString())
.filter(f -> f.endsWith("." + extension))
.findFirst()
.orElseThrow(() -> new IOException("Partition file not found"));
log.info("Renaming result to: " + departitionedUrl);
partitionedLocation.rename(new Path(targetFile), new Path(departitionedUrl));
log.info("Cleaning up: " + partitionedUrl);
partitionedLocation.delete(partitionedPath, true);
} catch (final IOException e) {
throw new RuntimeException("Problem copying partition file", e);
}
return departitionedUrl;
}
}