/*******************************************************************************
* HPCC SYSTEMS software Copyright (C) 2023 HPCC Systems®.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package org.hpccsystems.dfs.client;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.Arrays;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.hpccsystems.commons.ecl.FieldDef;
import org.json.JSONArray;
import org.json.JSONObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.hpccsystems.ws.client.HPCCWsClient;
import org.hpccsystems.ws.client.platform.Platform;
import org.hpccsystems.ws.client.utils.Connection;
import org.hpccsystems.dfs.cluster.*;
import org.hpccsystems.commons.ecl.RecordDefinitionTranslator;
import org.hpccsystems.commons.errors.HpccFileException;
import org.hpccsystems.ws.client.HPCCWsDFUClient;
import org.hpccsystems.ws.client.wrappers.wsdfu.DFUCreateFileWrapper;
import org.hpccsystems.ws.client.wrappers.wsdfu.DFUFilePartWrapper;
import org.hpccsystems.ws.client.wrappers.wsdfu.DFUFileTypeWrapper;
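/**
* FileUtility is a command line utility for reading, writing, and copying HPCC logical files.
* It supports three top level operations: -read, -copy, and -write (see getTopLevelOptions()).
*
* Example invocation (illustrative only; the dataset name and ESP URL below are placeholders):
*
*   java org.hpccsystems.dfs.client.FileUtility -read example::dataset -url http://127.0.0.1:8010 -out /tmp/output
*
* The results of each operation are reported as a JSON array on stdout.
*/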
public class FileUtility
{
// This value caps the number of split entries created while reading a file part,
// allowing the data to later be redistributed across clusters of varying sizes.
// For example, a 4GB file part will be redistributable in approximately 32MB blocks.
private static final int DEFAULT_SPLIT_TABLE_SIZE = 128;
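// Default number of worker threads used when the -num_threads option is not supplied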
private static final int NUM_DEFAULT_THREADS = 4;
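/**
* Tracks the state of the currently executing operation: read/write counters, error and
* warning messages, and timing information used to report per-operation results
* (including read/write bandwidth) as JSON.
*/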
private static class TaskContext
{
public AtomicLong recordsRead = new AtomicLong(0);
public AtomicLong recordsWritten = new AtomicLong(0);
public AtomicLong bytesRead = new AtomicLong(0);
public AtomicLong bytesWritten = new AtomicLong(0);
private List<String> errorMessages = new ArrayList<String>();
private List<String> warnMessages = new ArrayList<String>();
private String currentOperationDesc = "";
private long operationStart = 0;
private List<JSONObject> operationResults = new ArrayList<JSONObject>();
public boolean hasError()
{
boolean err = false;
synchronized(errorMessages)
{
err = errorMessages.size() > 0;
}
return err;
}
public void addError(String error)
{
synchronized(errorMessages)
{
errorMessages.add(error);
}
}
public void addWarn(String warn)
{
synchronized(warnMessages)
{
warnMessages.add(warn);
}
}
public void clear()
{
currentOperationDesc = "";
operationStart = 0;
recordsRead.set(0);
recordsWritten.set(0);
bytesRead.set(0);
bytesWritten.set(0);
errorMessages.clear();
warnMessages.clear();
}
public boolean hasOperation()
{
return !currentOperationDesc.isEmpty();
}
public void startOperation(String operationName)
{
clear();
currentOperationDesc = operationName;
operationStart = System.nanoTime();
}
public void endOperation()
{
endOperation(true);
}
public void endOperation(boolean success)
{
if (!hasOperation())
{
return;
}
long totalOperationTime = System.nanoTime();
totalOperationTime -= operationStart;
double timeInSeconds = (double) totalOperationTime / 1_000_000_000.0;
JSONObject results = new JSONObject();
results.put("operation", currentOperationDesc);
results.put("successful", success);
JSONArray errors = new JSONArray();
for (String err : errorMessages)
{
errors.put(err);
}
results.put("errors", errors);
JSONArray warns = new JSONArray();
for (String warn : warnMessages)
{
warns.put(warn);
}
results.put("warns", warns);
results.put("bytesWritten", bytesWritten.get());
results.put("recordsWritten", recordsWritten.get());
results.put("bytesRead", bytesRead.get());
results.put("recordsRead", recordsRead.get());
results.put("time", String.format("%.2f s",timeInSeconds));
double readBandwidth = (double) bytesRead.get() / (1_000_000.0 * timeInSeconds);
results.put("Read Bandwidth", String.format("%.2f MB/s", readBandwidth));
double writeBandwidth = (double) bytesWritten.get() / (1_000_000.0 * timeInSeconds);
results.put("Write Bandwidth", String.format("%.2f MB/s", writeBandwidth));
operationResults.add(results);
clear();
}
public JSONArray generateResultsMessage()
{
JSONArray results = new JSONArray();
for (JSONObject result : operationResults)
{
results.put(result);
}
return results;
}
};
private static enum FileFormat
{
THOR,
PARQUET
};
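/**
* Represents a contiguous range of records within a file part: the number of records
* in the range and its starting and ending byte offsets.
*/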
private static class SplitEntry
{
public long recordCount = 0;
public long splitStart = 0;
public long splitEnd = 0;
public JSONObject toJson()
{
JSONObject res = new JSONObject();
res.put("recordCount", recordCount);
res.put("splitStart", splitStart);
res.put("splitEnd", splitEnd);
return res;
}
public static SplitEntry fromJson(JSONObject json) throws IOException
{
SplitEntry split = new SplitEntry();
split.recordCount = json.getLong("recordCount");
split.splitStart = json.getLong("splitStart");
split.splitEnd = json.getLong("splitEnd");
return split;
}
}
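/**
* Records record-boundary offsets while a file part is being read so the part can later
* be re-split on record boundaries. The table is bounded: once maxSplitEntries is reached,
* the entries are compacted pairwise and the stride (records per entry) doubles, halving
* the table size while preserving full coverage of the file.
*/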
private static class SplitTable
{
public List<SplitEntry> splits = new ArrayList<SplitEntry>();
private long splitStride = 1;
private int maxSplitEntries = DEFAULT_SPLIT_TABLE_SIZE;
private SplitEntry currentSplit = new SplitEntry();
public SplitTable(int maxSplits)
{
maxSplitEntries = maxSplits;
if (maxSplitEntries % 2 == 1)
{
maxSplitEntries++;
}
}
public void addRecordPosition(long fileOffset)
{
if (currentSplit.recordCount == splitStride)
{
currentSplit.splitEnd = fileOffset;
splits.add(currentSplit);
currentSplit = new SplitEntry();
currentSplit.splitStart = fileOffset;
}
if (splits.size() == maxSplitEntries)
{
compactSplitTable();
}
currentSplit.recordCount++;
}
public void finish(long fileSize)
{
currentSplit.splitEnd = fileSize;
splits.add(currentSplit);
}
private void compactSplitTable()
{
splitStride *= 2;
List<SplitEntry> newSplits = new ArrayList<SplitEntry>();
for (int i = 0; i < splits.size(); i+=2)
{
SplitEntry first = splits.get(i);
SplitEntry second = splits.get(i+1);
SplitEntry combined = new SplitEntry();
combined.splitStart = first.splitStart;
combined.splitEnd = second.splitEnd;
combined.recordCount = first.recordCount + second.recordCount;
newSplits.add(combined);
}
splits = newSplits;
}
public JSONObject toJson()
{
JSONObject res = new JSONObject();
res.put("splitStride", splitStride);
res.put("maxSplitEntries", maxSplitEntries);
JSONArray splitsJson = new JSONArray();
for (int i = 0; i < splits.size(); i++)
{
splitsJson.put(splits.get(i).toJson());
}
res.put("splits", splitsJson);
return res;
}
public static SplitTable fromJson(JSONObject json) throws IOException
{
int maxSplits = json.getInt("maxSplitEntries");
SplitTable table = new SplitTable(maxSplits);
table.splitStride = json.getLong("splitStride");
JSONArray splitsJson = json.getJSONArray("splits");
if (splitsJson != null)
{
for (int i = 0; i < splitsJson.length(); i++)
{
table.splits.add(SplitEntry.fromJson(splitsJson.getJSONObject(i)));
}
}
return table;
}
}
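/**
* Handles persistence of a set of SplitTables as a versioned JSON document,
* stored alongside the data in a ".split" file.
*/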
private static class SplitFile
{
private List<SplitTable> splitTables = new ArrayList<SplitTable>();
public SplitFile()
{
}
public SplitFile(SplitTable[] tables)
{
splitTables.addAll(Arrays.asList(tables));
}
public SplitTable[] getSplitTableArray()
{
return splitTables.toArray(new SplitTable[0]);
}
public void load(FileInputStream inStream) throws IOException
{
long fileSize = inStream.getChannel().size();
if (fileSize > Integer.MAX_VALUE)
{
throw new IOException("Error: Input file is too large to load.");
}
byte[] byteData = new byte[(int) fileSize];
// InputStream.read() may return fewer bytes than requested, so loop until the buffer is filled
int totalRead = 0;
while (totalRead < byteData.length)
{
int numRead = inStream.read(byteData, totalRead, byteData.length - totalRead);
if (numRead < 0)
{
throw new IOException("Error: Unexpected end of file while loading split file.");
}
totalRead += numRead;
}
String jsonStr = new String(byteData, StandardCharsets.UTF_8);
JSONObject data = new JSONObject(jsonStr);
int version = data.getInt("version");
if (version != 0)
{
throw new IOException("Error: Unsupported file format version: " + version + ", halting file load.");
}
JSONArray jsonSplitTables = data.getJSONArray("tables");
if (jsonSplitTables != null)
{
for (int i = 0; i < jsonSplitTables.length(); i++)
{
splitTables.add(SplitTable.fromJson(jsonSplitTables.getJSONObject(i)));
}
}
}
public void save(OutputStream outStream) throws IOException
{
JSONObject data = new JSONObject();
JSONArray splitTablesJson = new JSONArray();
for (int i = 0; i < splitTables.size(); i++)
{
splitTablesJson.put(splitTables.get(i).toJson());
}
data.put("version", 0);
data.put("tables", splitTablesJson);
byte[] byteData = data.toString().getBytes(StandardCharsets.UTF_8);
outStream.write(byteData);
}
}
private static Options getReadOptions()
{
Options options = new Options();
options.addRequiredOption("url", "Source Cluster URL", true, "Specifies the URL of the ESP to connect to.");
options.addOption("user", true, "Specifies the username used to connect. Defaults to null.");
options.addOption("pass", true, "Specifies the password used to connect. Defaults to null.");
options.addOption("format", true, "Specifies the output format to be used when writing files to disk. Defaults to Thor files.");
options.addOption("num_threads", true, "Specifies the number of parallel to use to perform operations.");
options.addOption("out", true, "Specifies the directory that the files should be written to.");
options.addOption(Option.builder("read")
.argName("files")
.hasArgs()
.valueSeparator(',')
.desc("Reads the specified file(s) and writes a copy of the files to the local directory")
.required(true)
.build());
return options;
}
private static Options getCopyOptions()
{
Options options = new Options();
options.addRequiredOption("url", "Source Cluster URL", true, "Specifies the URL of the ESP to read from / write to.");
options.addOption("user", true, "Specifies the username used to connect. Defaults to null.");
options.addOption("pass", true, "Specifies the password used to connect. Defaults to null.");
options.addRequiredOption("dest_cluster", "Destination Cluster Name", true, "Specifies the name of the cluster to write files back to.");
options.addOption("dest_url", "Destination Cluster URL", true, "Specifies the URL of the ESP to write to.");
options.addOption("num_threads", true, "Specifies the number of parallel to use to perform operations.");
options.addOption(Option.builder("copy")
.argName("files")
.hasArgs()
.valueSeparator(' ')
.desc("Copies the specified remote source file to the specified remote destination cluster / file.")
.required(true)
.build());
return options;
}
private static Options getWriteOptions()
{
Options options = new Options();
options.addRequiredOption("url", "Source Cluster URL", true, "Specifies the URL of the ESP to read from / write to.");
options.addOption("user", true, "Specifies the username used to connect. Defaults to null.");
options.addOption("pass", true, "Specifies the password used to connect. Defaults to null.");
options.addOption("dest_url", "Destination Cluster URL", true, "Specifies the URL of the ESP to write to.");
options.addRequiredOption("dest_cluster", "Destination Cluster Name", true, "Specifies the name of the cluster to write files back to.");
options.addOption("num_threads", true, "Specifies the number of parallel to use to perform operations.");
options.addOption(Option.builder("write")
.argName("files")
.hasArgs()
.valueSeparator(' ')
.desc("Write the specified local files to the specified remote destination cluster / file.")
.required(true)
.build());
return options;
}
private static Options getTopLevelOptions()
{
Options options = new Options();
options.addOption("read", "Reads the specified file(s) and writes a copy of the files to the local directory.");
options.addOption("copy", "Copies the specified remote source file to the specified remote destination cluster / file.");
options.addOption("write", "Writes the specified local source file to the specified remote destination cluster / file.");
return options;
}
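/**
* Resolves a file path to the list of matching files. A trailing '*' is treated as a
* prefix wildcard: all files in the parent directory whose names start with the preceding
* pattern are returned. Exact paths must exist or an exception is thrown.
*/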
public static String[] findFilesMatching(String filePath) throws Exception
{
boolean isWildcard = filePath.endsWith("*");
if (!isWildcard)
{
File file = new File(filePath);
if (!file.exists())
{
throw new Exception("File path is invalid: " + filePath);
}
String[] res = {filePath};
return res;
}
int indexOfSep = filePath.lastIndexOf(File.separator)+1;
String dirStr = filePath.substring(0,indexOfSep);
String filePattern = filePath.substring(indexOfSep,filePath.length()-1);
File dir = new File(dirStr);
if (!dir.isDirectory() || !dir.exists())
{
throw new Exception("File path is invalid: " + filePath);
}
List<String> result = new ArrayList<String>();
for(File file : dir.listFiles())
{
String name = file.getName();
boolean startsWithPattern = name.startsWith(filePattern);
if (startsWithPattern)
{
result.add(file.getAbsolutePath());
}
}
return result.toArray(new String[0]);
}
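// Only the Thor format is currently supported, so format detection and the format
// extension are hard-coded; these are the extension points for additional formats
// such as Parquet.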
private static FileFormat getFormat(String[] srcFiles) throws Exception
{
return FileFormat.THOR;
}
private static String getFormatExtension(FileFormat format)
{
return "";
}
private static FieldDef getRecordDefinition(String[] srcFiles, FileFormat format) throws Exception
{
switch (format)
{
case THOR:
{
String metaFile = null;
for (int i = 0; i < srcFiles.length; i++)
{
String file = srcFiles[i].toLowerCase();
if (file.endsWith(".meta"))
{
metaFile = file;
}
}
if (metaFile == null)
{
throw new Exception("Unable to find Thor meta-data file.");
}
byte[] metaData = Files.readAllBytes(Paths.get(metaFile));
String metaStr = new String(metaData, Charset.defaultCharset());
JSONObject metaJson = new JSONObject(metaStr);
return RecordDefinitionTranslator.parseJsonRecordDefinition(metaJson);
}
case PARQUET:
default:
throw new Exception("File format: " + format + " is not currently supported");
}
}
private static SplitTable[] getSplitTables(String[] srcFiles, FileFormat format) throws Exception
{
if (format != FileFormat.THOR)
{
return null;
}
String splitFile = null;
for (int i = 0; i < srcFiles.length; i++)
{
String file = srcFiles[i].toLowerCase();
if (file.endsWith(".split"))
{
splitFile = file;
break;
}
}
if (splitFile == null)
{
return null;
}
FileInputStream inStream = new FileInputStream(splitFile);
SplitFile file = new SplitFile();
file.load(inStream);
inStream.close();
return file.getSplitTableArray();
}
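// Filters the source file list down to the actual data files for the given format. Thor
// file parts follow the "<name>._<part>_of_<total>" naming convention, eg: "mydataset._1_of_4".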
private static String[] filterFilesByFormat(String[] srcFiles, FileFormat format) throws Exception
{
Pattern pattern = null;
switch (format)
{
case THOR:
{
pattern = Pattern.compile("^[^\\.]*\\._[0-9]+_of_[0-9]+");
break;
}
case PARQUET:
default:
throw new Exception("File format: " + format + " is not currently supported");
}
List<String> filteredFiles = new ArrayList<String>();
for (int i = 0; i < srcFiles.length; i++)
{
int indexOfSep = srcFiles[i].lastIndexOf(File.separator)+1;
String fileName = srcFiles[i].substring(indexOfSep);
if (pattern.matcher(fileName).matches())
{
filteredFiles.add(srcFiles[i]);
}
}
return filteredFiles.toArray(new String[0]);
}
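/**
* Executes the provided tasks on up to numThreads threads. Tasks are divided evenly
* between the threads, with any remainder distributed one per thread, and the call
* blocks until all threads have completed.
*/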
private static void executeTasks(Runnable[] tasks, int numThreads) throws Exception
{
if (tasks.length < numThreads)
{
// Don't create more threads than there are tasks
numThreads = tasks.length;
}
int numTasksPerThread = tasks.length / numThreads;
int numResidualTasks = tasks.length % numThreads;
int taskNum = 0;
Thread[] taskThreads = new Thread[numThreads];
for (int threadNum = 0; threadNum < numThreads; threadNum++)
{
int residualTasks = 0;
if (threadNum < numResidualTasks)
{
residualTasks = 1;
}
final int currentTaskStart = taskNum;
final int currentNumTasks = numTasksPerThread + residualTasks;
taskThreads[threadNum] = new Thread(new Runnable()
{
Runnable[] subTasks = tasks;
int startingSubTask = currentTaskStart;
int numSubTasks = currentNumTasks;
public void run()
{
for (int j = 0; j < numSubTasks; j++)
{
subTasks[startingSubTask + j].run();
}
}
});
taskNum += currentNumTasks;
taskThreads[threadNum].start();
}
for (int threadNum = 0; threadNum < numThreads; threadNum++)
{
taskThreads[threadNum].join();
}
}
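// Creates one task per remote file part; each task streams its part to a local Thor
// file while recording record boundaries in the corresponding SplitTable.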
private static Runnable[] createReadToThorTasks(DataPartition[] fileParts, SplitTable[] splitTables, String[] outFilePaths, FieldDef recordDef, TaskContext context) throws Exception
{
Runnable[] tasks = new Runnable[fileParts.length];
for (int i = 0; i < tasks.length; i++)
{
final int taskIndex = i;
final HpccRemoteFileReader<HPCCRecord> filePartReader = new HpccRemoteFileReader<HPCCRecord>(fileParts[taskIndex], recordDef, new HPCCRecordBuilder(recordDef));
final String filePath = outFilePaths[taskIndex];
final FileOutputStream outStream = new FileOutputStream(filePath);
final BinaryRecordWriter filePartWriter = new BinaryRecordWriter(outStream);
filePartWriter.initialize(new HPCCRecordAccessor(recordDef));
tasks[taskIndex] = new Runnable()
{
HpccRemoteFileReader<HPCCRecord> fileReader = filePartReader;
BinaryRecordWriter fileWriter = filePartWriter;
FileOutputStream outputStream = outStream;
SplitTable splitTable = splitTables[taskIndex];
public void run()
{
try
{
while (fileReader.hasNext())
{
splitTable.addRecordPosition(fileReader.getStreamPosition());
HPCCRecord record = fileReader.next();
fileWriter.writeRecord(record);
context.recordsRead.incrementAndGet();
}
splitTable.finish(fileReader.getStreamPosition());
fileReader.close();
context.bytesRead.addAndGet(fileReader.getStreamPosition());
fileWriter.finalize();
outputStream.close();
}
catch (Exception e)
{
context.addError("Error while reading file: '" + filePath + "'," + taskIndex + ": " + e.getMessage());
return;
}
}
};
}
return tasks;
}
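// Creates one task per local Thor file part; each task reads through the part solely
// to record record-boundary offsets, populating the provided splitTables array.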
private static Runnable[] createThorSplitTableTasks(String[] thorFiles, SplitTable[] splitTables, FieldDef recordDef, TaskContext context) throws Exception
{
Runnable[] tasks = new Runnable[thorFiles.length];
for (int i = 0; i < tasks.length; i++)
{
final int taskIndex = i;
final SplitTable splitTable = new SplitTable(DEFAULT_SPLIT_TABLE_SIZE);
splitTables[taskIndex] = splitTable;
BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(thorFiles[taskIndex]));
BinaryRecordReader filePartReader = new BinaryRecordReader(bufferedInputStream);
filePartReader.initialize(new HPCCRecordBuilder(recordDef));
tasks[taskIndex] = new Runnable()
{
InputStream inputStream = bufferedInputStream;
BinaryRecordReader fileReader = filePartReader;
public void run()
{
try
{
while (fileReader.hasNext())
{
splitTable.addRecordPosition(fileReader.getStreamPosAfterLastRecord());
HPCCRecord record = (HPCCRecord) fileReader.getNext();
}
splitTable.finish(fileReader.getStreamPosAfterLastRecord());
inputStream.close();
}
catch (Exception e)
{
context.addError("Error while writing file taskIndex: " + taskIndex + " - " + e.getMessage());
return;
}
}
};
}
return tasks;
}
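// Creates one task per outgoing file part; each task copies its assigned incoming file
// part(s) into the output part without re-splitting records across part boundaries.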
private static Runnable[] createNonRedistributingCopyTasks(HPCCFile file, DFUCreateFileWrapper createResult, TaskContext context) throws Exception
{
FieldDef recordDef = null;
DataPartition[] inFileParts = null;
DataPartition[] outFileParts = null;
inFileParts = file.getFileParts();
recordDef = file.getRecordDefinition();
DFUFilePartWrapper[] dfuFileParts = createResult.getFileParts();
NullRemapper remapper = new NullRemapper(new RemapInfo(), createResult.getFileAccessInfo());
outFileParts = DataPartition.createPartitions(dfuFileParts, remapper, dfuFileParts.length, createResult.getFileAccessInfoBlob());
int incomingPerOutgoing = inFileParts.length / outFileParts.length;
int residualIncomingFileParts = inFileParts.length % outFileParts.length;
int incomingFilePartIndex = 0;
Runnable[] tasks = new Runnable[outFileParts.length];
for (int i = 0; i < tasks.length; i++)
{
final int taskIndex = i;
DataPartition outFilePart = outFileParts[taskIndex];
final int numIncomingParts = incomingPerOutgoing + ((taskIndex < residualIncomingFileParts) ? 1 : 0);
HpccRemoteFileReader<HPCCRecord>[] filePartReaders = new HpccRemoteFileReader[numIncomingParts];
for (int j = 0; j < numIncomingParts; j++)
{
DataPartition inFilePart = inFileParts[incomingFilePartIndex + j];
filePartReaders[j] = new HpccRemoteFileReader<HPCCRecord>(inFilePart, recordDef, new HPCCRecordBuilder(file.getProjectedRecordDefinition()));
}
incomingFilePartIndex += numIncomingParts;
HPCCRecordAccessor recordAccessor = new HPCCRecordAccessor(recordDef);
final HPCCRemoteFileWriter<HPCCRecord> partFileWriter = new HPCCRemoteFileWriter<HPCCRecord>(outFilePart, recordDef, recordAccessor, CompressionAlgorithm.NONE);
tasks[taskIndex] = new Runnable()
{
HpccRemoteFileReader<HPCCRecord>[] fileReaders = filePartReaders;
HPCCRemoteFileWriter<HPCCRecord> fileWriter = partFileWriter;
public void run()
{
try
{
for (int k = 0; k < fileReaders.length; k++)
{
HpccRemoteFileReader<HPCCRecord> fileReader = fileReaders[k];
while (fileReader.hasNext())
{
HPCCRecord record = fileReader.next();
fileWriter.writeRecord(record);
context.recordsWritten.incrementAndGet();
context.recordsRead.incrementAndGet();
}
fileReader.close();
context.bytesRead.addAndGet(fileReader.getStreamPosition());
}
System.out.println("Closing file writer for task: " + taskIndex);
fileWriter.close();
context.bytesWritten.addAndGet(fileWriter.getBytesWritten());
}
catch (Exception e)
{
context.addError("Error while copying file: '" + file.getFileName() + "'," + taskIndex + ": " + e.getMessage());
return;
}
}
};
}
return tasks;
}
/*
* Redistribution notes:
* - Download the file locally and build a split table, or reuse an existing one.
* - Create the write tasks with redistribution driven by the split table.
*/
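// Describes the range of split entries, possibly spanning multiple source files,
// from which a single output file part should be built.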
private static class SplitEntryMapping
{
int startingSrcFile = 0;
int splitEntryStart = 0;
int endingSrcFile = 0;
int splitEntryEnd = 0;
}
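// Creates one write task per output file part. When the number of source files differs
// from the number of output parts, the split tables are used to divide the source data
// evenly across the output parts on record boundaries.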
private static Runnable[] createWriteTasks(String[] srcFiles, SplitTable[] splitTables, FieldDef recordDef, FileFormat format, DFUCreateFileWrapper createResult, TaskContext context) throws Exception
{
DataPartition[] outFileParts = null;
DFUFilePartWrapper[] dfuFileParts = createResult.getFileParts();
NullRemapper remapper = new NullRemapper(new RemapInfo(), createResult.getFileAccessInfo());
outFileParts = DataPartition.createPartitions(dfuFileParts, remapper, dfuFileParts.length, createResult.getFileAccessInfoBlob());
// Determine mapping from split entries to output file parts
SplitEntryMapping[] srcFileToOutPartsMapping = new SplitEntryMapping[outFileParts.length];
if (srcFiles.length != outFileParts.length)
{
int totalSplitEntries = 0;
for (int i = 0; i < splitTables.length; i++)
{
totalSplitEntries += splitTables[i].splits.size();
}
int splitsPerOutFile = totalSplitEntries / outFileParts.length;
int residualSplits = totalSplitEntries % outFileParts.length;
int currentSrcFile = 0;
int currentSrcFileSplitStart = 0;
int currentSrcFileSplitEnd = splitTables[0].splits.size();
int splitStart = 0;
for (int i = 0; i < srcFileToOutPartsMapping.length; i++)
{
int numSplits = splitsPerOutFile + ((i < residualSplits ) ? 1 : 0);
SplitEntryMapping mapping = new SplitEntryMapping();
mapping.startingSrcFile = currentSrcFile;
mapping.splitEntryStart = splitStart - currentSrcFileSplitStart;
int splitEnd = splitStart + numSplits;
while (currentSrcFileSplitEnd < splitEnd)
{
currentSrcFile++;
currentSrcFileSplitStart = currentSrcFileSplitEnd;
currentSrcFileSplitEnd += splitTables[currentSrcFile].splits.size();
}
mapping.endingSrcFile = currentSrcFile;
mapping.splitEntryEnd = splitEnd - currentSrcFileSplitStart;
srcFileToOutPartsMapping[i] = mapping;
splitStart = splitEnd;
}
}
else
{
for (int i = 0; i < srcFileToOutPartsMapping.length; i++)
{
SplitEntryMapping mapping = new SplitEntryMapping();
mapping.startingSrcFile = i;
mapping.splitEntryStart = 0;
mapping.endingSrcFile = i;
mapping.splitEntryEnd = splitTables[i].splits.size();
srcFileToOutPartsMapping[i] = mapping;
}
}
Runnable[] tasks = new Runnable[outFileParts.length];
for (int i = 0; i < tasks.length; i++)
{
final int taskIndex = i;
DataPartition outFilePart = outFileParts[taskIndex];
HPCCRecordAccessor recordAccessor = new HPCCRecordAccessor(recordDef);
HPCCRemoteFileWriter<HPCCRecord> filePartWriter = new HPCCRemoteFileWriter<HPCCRecord>(outFilePart, recordDef, recordAccessor, CompressionAlgorithm.NONE);
tasks[taskIndex] = new Runnable()
{
SplitEntryMapping mapping = srcFileToOutPartsMapping[taskIndex];
HPCCRemoteFileWriter<HPCCRecord> fileWriter = filePartWriter;
public void run()
{
try
{
int numIncomingParts = (mapping.endingSrcFile+1) - mapping.startingSrcFile;
BinaryRecordReader[] fileReaders = new BinaryRecordReader[numIncomingParts];
BufferedInputStream[] inputStreams = new BufferedInputStream[numIncomingParts];
for (int j = 0; j < numIncomingParts; j++)
{
String srcFile = srcFiles[mapping.startingSrcFile + j];
inputStreams[j] = new BufferedInputStream(new FileInputStream(srcFile));
if (j == 0)
{
SplitEntry startingSplit = splitTables[mapping.startingSrcFile].splits.get(mapping.splitEntryStart);
fileReaders[j] = new BinaryRecordReader(inputStreams[j], startingSplit.splitStart);
}
else
{
fileReaders[j] = new BinaryRecordReader(inputStreams[j]);
}
fileReaders[j].initialize(new HPCCRecordBuilder(recordDef));
}
for (int j = 0; j < fileReaders.length; j++)
{
BinaryRecordReader fileReader = fileReaders[j];
long splitEnd = Long.MAX_VALUE;
if (j == (fileReaders.length-1))
{
SplitEntry endingSplit = splitTables[mapping.endingSrcFile].splits.get(mapping.splitEntryEnd-1);
splitEnd = endingSplit.splitEnd;
}
while (fileReader.hasNext() && fileReader.getStreamPosAfterLastRecord() < splitEnd)
{
HPCCRecord record = (HPCCRecord) fileReader.getNext();
fileWriter.writeRecord(record);
context.recordsWritten.incrementAndGet();
context.recordsRead.incrementAndGet();
}
context.bytesRead.addAndGet(fileReader.getStreamPosAfterLastRecord());
inputStreams[j].close();
}
fileWriter.close();
context.bytesWritten.addAndGet(fileWriter.getBytesWritten());
}
catch (Exception e)
{
context.addError("Error while writing file taskIndex: " + taskIndex + " - " + e.getMessage());
return;
}
}
};
}
return tasks;
}
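// Reads each requested dataset into local file parts, then writes the accompanying
// ".meta" (record definition) and ".split" (split table) files next to them.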
private static void performRead(String[] args, TaskContext context)
{
Options options = getReadOptions();
CommandLineParser parser = new DefaultParser();
CommandLine cmd = null;
try
{
cmd = parser.parse(options, args);
}
catch (ParseException e)
{
System.out.println("Error parsing commandline options:\n" + e.getMessage());
return;
}
String connString = cmd.getOptionValue("url");
String user = cmd.getOptionValue("user");
String pass = cmd.getOptionValue("pass");
String outputPath = cmd.getOptionValue("out",".");
int numThreads = NUM_DEFAULT_THREADS;
String numThreadsStr = cmd.getOptionValue("num_threads", "" + numThreads);
try
{
numThreads = Integer.parseInt(numThreadsStr);
}
catch(Exception e)
{
System.out.println("Invalid option value for num_threads: "
+ numThreadsStr + ", must be an integer. Defaulting to: " + NUM_DEFAULT_THREADS + " threads.");
}
String formatStr = cmd.getOptionValue("format");
if (formatStr == null)
{
formatStr = "THOR";
}
FileFormat format = FileFormat.THOR;
switch (formatStr.toUpperCase())
{
case "THOR":
format = FileFormat.THOR;
break;
case "PARQUET":
format = FileFormat.PARQUET;
break;
default:
System.out.println("Error unsupported format specified: " + format);
return;
}
String[] datasets = cmd.getOptionValues("read");
for (int i = 0; i < datasets.length; i++)
{
String datasetName = datasets[i];
context.startOperation("Read " + datasetName);
HPCCFile file = null;
try
{
file = new HPCCFile(datasetName, connString, user, pass);
}
catch (Exception e)
{
System.out.println("Error while attempting to open file: '" + datasetName + "': " + e.getMessage());
return;
}
DataPartition[] fileParts = null;
FieldDef recordDef = null;
try
{
fileParts = file.getFileParts();
recordDef = file.getRecordDefinition();
}
catch (Exception e)
{
System.out.println("Error while retrieving file parts for: '" + datasetName + "': " + e.getMessage());
return;
}
// Figure out the format string needed based on the number of file parts
String lenStr = "" + fileParts.length;
String fileNumFormat = "%0" + lenStr.length() + "d";
String fileExt = getFormatExtension(format);
File outDir = new File(outputPath);
if (!outDir.exists())
{
outDir.mkdirs();
}
SplitTable[] splitTables = new SplitTable[fileParts.length];
String[] outFilePaths = new String[fileParts.length];
for (int j = 0; j < fileParts.length; j++)
{
String fileName = file.getFileName().replace(":","_");
outFilePaths[j] = outputPath + File.separator + fileName + "._" + String.format(fileNumFormat, j+1) + "_of_" + fileParts.length + fileExt;
splitTables[j] = new SplitTable(DEFAULT_SPLIT_TABLE_SIZE);
}
Runnable[] tasks = null;
try
{
switch (format)
{
case THOR:
tasks = createReadToThorTasks(fileParts, splitTables, outFilePaths, recordDef, context);
break;
case PARQUET:
default:
throw new Exception("Error unsupported format specified: " + format);
};
}
catch (Exception e)
{
context.addError("Error while attempting to create read tasks for file: '" + datasetName + "': " + e.getMessage());
return;
}
try
{
executeTasks(tasks, numThreads);
}
catch (Exception e)
{
context.addError("Error while attempting to execute read tasks for file: '" + datasetName + "': " + e.getMessage());
return;
}
if (context.hasError())
{
return;
}
try
{
String fileName = file.getFileName().replace(":","_");
String filePath = outputPath + File.separator + fileName + ".meta";
FileOutputStream metaFile = new FileOutputStream(filePath);
String metaStr = RecordDefinitionTranslator.toJsonRecord(file.getRecordDefinition()).toString();
metaFile.write(metaStr.getBytes());
metaFile.close();
}
catch (Exception e)
{
context.addError("Error while attempting to write meta-data for file: '" + datasetName + "': " + e.getMessage());
return;
}
try
{
String fileName = file.getFileName().replace(":","_");
String filePath = outputPath + File.separator + fileName + ".split";
FileOutputStream splitFileOut = new FileOutputStream(filePath);
SplitFile splitFile = new SplitFile(splitTables);
splitFile.save(splitFileOut);
splitFileOut.close();
}
catch (Exception e)
{
context.addError("Error while attempting to write split table file for dataset: '" + datasetName + "': " + e.getMessage());
return;
}
context.endOperation();
}
}
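// Copies each source file to the destination cluster. The current implementation always
// redistributes: it reads the file into a local "tmp-read" directory and then writes it
// back out to the destination cluster.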
private static void performCopy(String[] args, TaskContext context)
{
Options options = getCopyOptions();
CommandLineParser parser = new DefaultParser();
CommandLine cmd = null;
try
{
cmd = parser.parse(options, args);
}
catch (ParseException e)
{
System.out.println("Error parsing commandline options:\n" + e.getMessage());
return;
}
int numThreads = NUM_DEFAULT_THREADS;
String numThreadsStr = cmd.getOptionValue("num_threads", "" + numThreads);
try
{
numThreads = Integer.parseInt(numThreadsStr);
}
catch(Exception e)
{
System.out.println("Invalid option value for num_threads: "
+ numThreadsStr + ", must be an integer. Defaulting to: " + NUM_DEFAULT_THREADS + " threads.");
}
String user = cmd.getOptionValue("user");
String pass = cmd.getOptionValue("pass");
String destClusterName = cmd.getOptionValue("dest_cluster");
String srcURL = cmd.getOptionValue("url");
String destURL = cmd.getOptionValue("dest_url");
if (destURL == null)
{
destURL = srcURL;
}
Connection destConn = null;
try
{
destConn = new Connection(destURL);
}
catch (Exception e)
{
System.out.println("Error while attempting to construct connection: " + e.getMessage());
return;
}
destConn.setCredentials(user, pass);
Platform platform = Platform.get(destConn);
HPCCWsClient wsclient = null;
try
{
wsclient = platform.checkOutHPCCWsClient();
}
catch (Exception e)
{
System.out.println("Error while attempting to connect to platform: " + e.getMessage());
return;
}
HPCCWsDFUClient dfuClient = wsclient.getWsDFUClient();
String[] copyPairs = cmd.getOptionValues("copy");
if ((copyPairs.length % 2) != 0)
{
System.out.println("Error copy operation must specify both a source and destination file pairs separated by a space.");
return;
}
for (int i = 0; i < copyPairs.length; i+=2)
{
String srcFile = copyPairs[i];
String destFile = copyPairs[i+1];
context.startOperation("Copy " + srcFile + " -> " + destFile);
HPCCFile file = null;
try
{
file = new HPCCFile(srcFile, srcURL, user, pass);
}
catch (Exception e)
{
context.addError("Error while attempting to open file: '" + srcFile + "': " + e.getMessage());
return;
}
DataPartition[] srcFileParts = null;
try
{
srcFileParts = file.getFileParts();
}
catch (HpccFileException e)
{
context.addError("Error while retrieving file parts for: '" + srcFile + "': " + e.getMessage());
}
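// Redistribution is currently always enabled, so the non-redistributing copy path
// below is effectively dead code retained for future use.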
boolean shouldRedistribute = true;
if (!shouldRedistribute)
{
int expirySecs = 300;
DFUCreateFileWrapper createResult = null;
String eclRecordDefn = null;
try
{
eclRecordDefn = RecordDefinitionTranslator.toECLRecord(file.getRecordDefinition());
createResult = dfuClient.createFile(destFile, destClusterName, eclRecordDefn,
expirySecs, false, DFUFileTypeWrapper.Flat, "");
}
catch (Exception e)
{
context.addError("Error while attempting to start file creation for: '" + destFile + "': " + e.getMessage());
return;
}
Runnable[] tasks = null;
try
{
tasks = createNonRedistributingCopyTasks(file, createResult, context);
}
catch (Exception e)
{
context.addError("Error while attempting to create copy tasks for file: '" + srcFile + "': " + e.getMessage());
return;
}
try
{
executeTasks(tasks, numThreads);
}
catch (Exception e)
{
context.addError("Error while attempting to execute copy tasks for file: '" + srcFile + "': " + e.getMessage());
return;
}
if (context.hasError())
{
return;
}
try
{
long bytesWritten = context.bytesWritten.get();
long recordsWritten = context.recordsWritten.get();
dfuClient.publishFile(createResult.getFileID(), eclRecordDefn, recordsWritten, bytesWritten, true);
}
catch (Exception e)
{
context.addError("Error while attempting to publish file: '" + destFile + "': " + e.getMessage());
return;
}
}
else
{
String[] readArgs = {"-read", srcFile, "-url", srcURL,
"-format", "thor", "-user", user, "-pass", pass,
"-out", "tmp-read"};
performRead(readArgs, context);
String[] writeArgs = {"-write", "tmp-read" + File.separator + srcFile.replace(':', '_') + "*" + " " + destFile,
"-url", srcURL, "-dest_url", destURL,
"-dest_cluster", destClusterName,
"-user", user, "-pass", pass };
performWrite(writeArgs, context);
}
context.endOperation();
}
}
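// Writes local source files to the destination cluster, creating split tables first if
// none were provided, then creates, populates, and publishes the remote file.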
private static void performWrite(String[] args, TaskContext context)
{
Options options = getWriteOptions();
CommandLineParser parser = new DefaultParser();
CommandLine cmd = null;
try
{
cmd = parser.parse(options, args);
}
catch (ParseException e)
{
System.out.println("Error parsing commandline options:\n" + e.getMessage());
return;
}
int numThreads = NUM_DEFAULT_THREADS;
String numThreadsStr = cmd.getOptionValue("num_threads", "" + numThreads);
try
{
numThreads = Integer.parseInt(numThreadsStr);
}
catch(Exception e)
{
System.out.println("Invalid option value for num_threads: "
+ numThreadsStr + ", must be an integer. Defaulting to: " + NUM_DEFAULT_THREADS + " threads.");
}
String user = cmd.getOptionValue("user");
String pass = cmd.getOptionValue("pass");
String destClusterName = cmd.getOptionValue("dest_cluster");
String srcURL = cmd.getOptionValue("url");
String destURL = cmd.getOptionValue("dest_url");
if (destURL == null)
{
destURL = srcURL;
}
Connection destConn = null;
try
{
destConn = new Connection(destURL);
}
catch (Exception e)
{
System.out.println("Error while attempting to construct connection: " + e.getMessage());
return;
}
destConn.setCredentials(user, pass);
Platform platform = Platform.get(destConn);
HPCCWsClient wsclient = null;
try
{
wsclient = platform.checkOutHPCCWsClient();
}
catch (Exception e)
{
System.out.println("Error while attempting to connect to platform: " + e.getMessage());
return;
}
HPCCWsDFUClient dfuClient = wsclient.getWsDFUClient();
String[] writePairs = cmd.getOptionValues("write");
if ((writePairs.length % 2) != 0)
{
System.out.println("Error write operation must specify both a source and destination file pairs separated by a space.");
return;
}
for (int pairIdx = 0; pairIdx < writePairs.length; pairIdx += 2)
{
String srcFile = writePairs[pairIdx];
String destFile = writePairs[pairIdx+1];
context.startOperation("Write " + srcFile + " -> " + destFile);
SplitTable[] splitTables = null;
String[] srcFiles = null;
FileFormat format = FileFormat.THOR;
FieldDef recordDefinition = null;
try
{
srcFiles = findFilesMatching(srcFile);
format = getFormat(srcFiles);
recordDefinition = getRecordDefinition(srcFiles, format);
splitTables = getSplitTables(srcFiles, format);
srcFiles = filterFilesByFormat(srcFiles, format);
}
catch (Exception e)
{
context.addError("Error while constructing source file list: " + e.getMessage());
return;
}
Arrays.sort(srcFiles);
if (srcFiles.length == 0)
{
context.addError("Error no files matching: " + srcFile);
return;
}
boolean needToCreateSplitTable = (splitTables == null || splitTables.length == 0) && format == FileFormat.THOR;
if (needToCreateSplitTable)
{
Runnable[] tasks = null;
try
{
splitTables = new SplitTable[srcFiles.length];
tasks = createThorSplitTableTasks(srcFiles, splitTables, recordDefinition, context);
}
catch (Exception e)
{
context.addError("Error while attempting to create split table creation tasks for file: '" + srcFile + "': " + e.getMessage());
return;
}
try
{
executeTasks(tasks, numThreads);
}
catch (Exception e)
{
context.addError("Error while attempting to execute create split table creation tasks for file: '" + srcFile + "': " + e.getMessage());
return;
}
}
if (needToCreateSplitTable)
{
try
{
String fileName = srcFiles[0].substring(0,srcFiles[0].lastIndexOf('.'));
String filePath = fileName + ".split";
FileOutputStream splitFileOut = new FileOutputStream(filePath);
SplitFile splitFile = new SplitFile(splitTables);
splitFile.save(splitFileOut);
splitFileOut.close();
}
catch (Exception e)
{
context.addError("Error while attempting to write split table file for dataset: '" + srcFile + "': " + e.getMessage());
return;
}
}
int expirySecs = 300;
DFUCreateFileWrapper createResult = null;
String eclRecordDefn = null;
try
{
eclRecordDefn = RecordDefinitionTranslator.toECLRecord(recordDefinition);
createResult = dfuClient.createFile(destFile, destClusterName, eclRecordDefn,
expirySecs, false, DFUFileTypeWrapper.Flat, "");
}
catch (Exception e)
{
context.addError("Error while attempting to start file creation for: '" + destFile + "': " + e.getMessage());
return;
}
Runnable[] tasks = null;
try
{
tasks = createWriteTasks(srcFiles, splitTables, recordDefinition, format, createResult, context);
}
catch (Exception e)
{
context.addError("Error while attempting to create write tasks for file: '" + srcFile + "': " + e.getMessage());
return;
}
try
{
executeTasks(tasks, numThreads);
}
catch (Exception e)
{
context.addError("Error while attempting to execute write tasks for file: '" + srcFile + "': " + e.getMessage());
return;
}
if (context.hasError())
{
return;
}
try
{
long bytesWritten = context.bytesWritten.get();
long recordsWritten = context.recordsWritten.get();
dfuClient.publishFile(createResult.getFileID(), eclRecordDefn, recordsWritten, bytesWritten, true);
}
catch (Exception e)
{
context.addError("Error while attempting to publish file: '" + destFile + "': " + e.getMessage());
return;
}
context.endOperation();
}
}
/**
* Executes an operation based on the provided args.
* @param args operation arguments; the first argument selects the operation (-read, -copy, or -write)
* @return a JSONArray containing the results of each operation performed
*/
public static JSONArray run(String[] args)
{
Options options = getTopLevelOptions();
CommandLineParser parser = new DefaultParser();
CommandLine cmd = null;
try
{
boolean stopAtNonOption = false;
// stopAtNonOption doesn't seem to work in commons-cli 1.5, so we only take the first arg to prevent an unknown option exception
String[] truncatedArgs = new String[1];
truncatedArgs[0] = args[0];
cmd = parser.parse(options, truncatedArgs, stopAtNonOption);
}
catch (ParseException e)
{
System.out.println("Error parsing commandline options:\n" + e.getMessage());
return new JSONArray();
}
TaskContext context = new TaskContext();
if (cmd.hasOption("read"))
{
performRead(args, context);
}
else if (cmd.hasOption("copy"))
{
performCopy(args, context);
}
else if (cmd.hasOption("write"))
{
performWrite(args, context);
}
// If we are still in the middle of an operation there was a failure
if (context.hasOperation())
{
boolean succeeded = false;
context.endOperation(succeeded);
}
return context.generateResultsMessage();
}
public static void main(String[] args)
{
JSONArray results = run(args);
System.out.println("Results:\n--------------------------------------------------\n");
System.out.println(results.toString(2));
return;
}
}