com.marklogic.spark.writer.file.ZipFileWriter (marklogic-spark-connector)
Spark 3 connector for MarkLogic
/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.spark.writer.file;
import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.ContextSupport;
import com.marklogic.spark.Options;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.write.DataWriter;
import org.apache.spark.sql.connector.write.WriterCommitMessage;
import org.apache.spark.util.SerializableConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
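/**
 * Writes each document row in a Spark partition as an entry in a single zip file,
 * with an optional "&lt;entry&gt;.metadata" entry preceding each content entry. One zip
 * file is created per partition; the commit message reports the file path and the
 * number of entries written.
 */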
public class ZipFileWriter implements DataWriter<InternalRow> {
private static final Logger logger = LoggerFactory.getLogger(ZipFileWriter.class);
private final ContextSupport context;
private final SerializableConfiguration hadoopConfiguration;
private final String zipPath;
// These can be instantiated lazily depending on which constructor is used.
private ContentWriter contentWriter;
private ZipOutputStream zipOutputStream;
private int zipEntryCounter;
ZipFileWriter(Map<String, String> properties, SerializableConfiguration hadoopConfiguration, int partitionId) {
this(properties.get("path"), properties, hadoopConfiguration, partitionId, true);
}
public ZipFileWriter(String path, Map<String, String> properties, SerializableConfiguration hadoopConfiguration,
int partitionId, boolean createZipFileImmediately) {
this.zipPath = makeFilePath(path, partitionId);
this.context = new ContextSupport(properties);
this.hadoopConfiguration = hadoopConfiguration;
if (createZipFileImmediately) {
createZipFileAndContentWriter();
}
}
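// Writes one document row to the zip file: an optional "<entry>.metadata" entry
// first, then the content entry, named by converting the document URI to a path.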
@Override
public void write(InternalRow row) throws IOException {
if (contentWriter == null) {
createZipFileAndContentWriter();
}
final String uri = row.getString(0);
final String entryName = FileUtil.makePathFromDocumentURI(uri);
writeMetadataEntryIfNecessary(row, uri, entryName);
zipOutputStream.putNextEntry(new ZipEntry(entryName));
this.contentWriter.writeContent(row, zipOutputStream);
zipEntryCounter++;
}
@Override
public void close() {
IOUtils.closeQuietly(zipOutputStream);
}
@Override
public WriterCommitMessage commit() {
return new ZipCommitMessage(zipPath, zipEntryCounter);
}
@Override
public void abort() {
// No action to take.
}
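// Resolving the FileSystem from the Hadoop configuration allows the zip path to use
// any Hadoop-supported scheme (e.g. local, HDFS, S3A). Disabling write checksums
// prevents a ".crc" sidecar file from being created alongside the zip on
// checksummed filesystems such as the local filesystem.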
private void createZipFileAndContentWriter() {
Path filePath = new Path(zipPath);
if (logger.isDebugEnabled()) {
logger.debug("Will write to: {}", filePath);
}
this.contentWriter = new ContentWriter(context.getProperties());
try {
FileSystem fileSystem = filePath.getFileSystem(hadoopConfiguration.value());
fileSystem.setWriteChecksum(false);
zipOutputStream = new ZipOutputStream(fileSystem.create(filePath, true));
} catch (IOException e) {
throw new ConnectorException("Unable to create stream for writing zip file: " + e.getMessage(), e);
}
}
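// Two metadata paths: when streaming files, only the document URI is available, so
// metadata is written via the streaming-aware path (if metadata categories were
// requested on the read side); otherwise metadata comes from the row's own columns.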
private void writeMetadataEntryIfNecessary(InternalRow row, String uri, String entryName) throws IOException {
if (this.context.isStreamingFiles() && context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
zipOutputStream.putNextEntry(new ZipEntry(entryName + ".metadata"));
this.contentWriter.writeMetadataWhileStreaming(uri, zipOutputStream);
zipEntryCounter++;
} else if (hasMetadata(row)) {
zipOutputStream.putNextEntry(new ZipEntry(entryName + ".metadata"));
this.contentWriter.writeMetadata(row, zipOutputStream);
zipEntryCounter++;
}
}
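// In the connector's document row schema, columns 3-7 are expected to hold the
// metadata columns (collections, permissions, quality, properties, metadataValues);
// the row has metadata to write if any of them is non-null.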
private boolean hasMetadata(InternalRow row) {
return !row.isNullAt(3) || !row.isNullAt(4) || !row.isNullAt(5) || !row.isNullAt(6) || !row.isNullAt(7);
}
/**
* Copies some of what MLCP's ArchiveWriter does, but does not create a zip file per document type. The reason
* for that behavior in MLCP isn't known. It would not help when importing the zip files, as the URI extension
* determines the document type. It also seems like unnecessary zip file generation - e.g. if a user wants to
* export 10k XML and JSON docs in the same collection, getting 2 zip files instead of 1 would be surprising
* behavior. A user who does want that outcome can achieve it by using Spark to repartition the dataset based
* on the "format" column (see the sketch after this class).
*
* @param path        the directory to which the zip file will be written
* @param partitionId the Spark partition ID, used to make the file name unique within the directory
* @return the zip file path, e.g. path/20240612093015+0000-3.zip
*/
private String makeFilePath(String path, int partitionId) {
final String timestamp = new SimpleDateFormat("yyyyMMddHHmmssZ").format(new Date());
return String.format("%s%s%s-%d.zip", path, File.separator, timestamp, partitionId);
}
public String getZipPath() {
return zipPath;
}
}
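A minimal sketch of the repartitioning approach mentioned in the makeFilePath javadoc, assuming a Dataset of document rows read through the connector. The "marklogic" format name and the option keys below are assumptions based on the connector's documentation conventions, not confirmed by this file:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class ExportByFormat {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("export-by-format").getOrCreate();

        // Read document rows from MarkLogic; option keys are assumed from the docs.
        Dataset<Row> docs = spark.read()
            .format("marklogic") // assumed DataSource short name
            .option("spark.marklogic.client.uri", "user:password@localhost:8000") // hypothetical connection string
            .option("spark.marklogic.read.documents.collections", "example") // assumed option key
            .load();

        // One partition per distinct "format" value, so each ZipFileWriter
        // instance (one per partition) produces a zip holding a single format.
        docs.repartition(col("format"))
            .write()
            .format("marklogic")
            .option("spark.marklogic.write.files.compression", "zip") // assumed option key
            .mode(SaveMode.Append)
            .save("/tmp/export");

        spark.stop();
    }
}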