All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.hpccsystems.dfs.client.FileUtility Maven / Gradle / Ivy

There is a newer version: 9.8.46-1
Show newest version
/*******************************************************************************
 *     HPCC SYSTEMS software Copyright (C) 2023 HPCC Systems®.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *******************************************************************************/

package org.hpccsystems.dfs.client;

import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.Arrays;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.hpccsystems.commons.ecl.FieldDef;
import org.json.JSONArray;
import org.json.JSONObject;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import org.hpccsystems.ws.client.HPCCWsClient;
import org.hpccsystems.ws.client.platform.Platform;
import org.hpccsystems.ws.client.utils.Connection;

import org.hpccsystems.dfs.cluster.*;
import org.hpccsystems.commons.ecl.RecordDefinitionTranslator;
import org.hpccsystems.commons.errors.HpccFileException;
import org.hpccsystems.ws.client.HPCCWsDFUClient;
import org.hpccsystems.ws.client.wrappers.wsdfu.DFUCreateFileWrapper;
import org.hpccsystems.ws.client.wrappers.wsdfu.DFUFilePartWrapper;
import org.hpccsystems.ws.client.wrappers.wsdfu.DFUFileTypeWrapper;

public class FileUtility
{
    // This value represents the maximum number of splits that will be created during
    // the reading process to allow for redistribution of clusters of varying sizes
    // IE: A 4GB file part will be redistributable in approximately 32MB blocks.
    private static final int DEFAULT_SPLIT_TABLE_SIZE = 128;

    private static final int NUM_DEFAULT_THREADS = 4;

    private static class TaskContext
    {
        public AtomicLong recordsRead = new AtomicLong(0);
        public AtomicLong recordsWritten = new AtomicLong(0);

        public AtomicLong bytesRead = new AtomicLong(0);
        public AtomicLong bytesWritten = new AtomicLong(0);

        private List errorMessages = new ArrayList();
        private List warnMessages = new ArrayList();

        private String currentOperationDesc = "";
        private long operationStart = 0;
        private List operationResults = new ArrayList();

        public boolean hasError()
        {
            boolean err = false;
            synchronized(errorMessages)
            {
                err = errorMessages.size() > 0;
            }

            return err;
        }

        public void addError(String error)
        {
            synchronized(errorMessages)
            {
                errorMessages.add(error);
            }
        }

        public void addWarn(String warn)
        {
            synchronized(warnMessages)
            {
                warnMessages.add(warn);
            }
        }

        public void clear()
        {
            currentOperationDesc = "";
            operationStart = 0;
            recordsRead.set(0);
            recordsWritten.set(0);

            bytesRead.set(0);
            bytesWritten.set(0);

            errorMessages.clear();
            warnMessages.clear();
        }

        public boolean hasOperation()
        {
            return !currentOperationDesc.isEmpty();
        }

        public void startOperation(String operationName)
        {
            clear();
            currentOperationDesc = operationName;
            operationStart = System.nanoTime();
        }

        public void endOperation()
        {
            endOperation(true);
        }

        public void endOperation(boolean success)
        {
            if (!hasOperation())
            {
                return;
            }

            long totalOperationTime = System.nanoTime();
            totalOperationTime -= operationStart;

            double timeInSeconds = (double) totalOperationTime / 1_000_000_000.0;

            JSONObject results = new JSONObject();

            results.put("operation", currentOperationDesc);
            results.put("successful", success);

            JSONArray errors = new JSONArray();
            for (String err : errorMessages)
            {
                errors.put(err);
            }
            results.put("errors", errors);

            JSONArray warns = new JSONArray();
            for (String warn : warnMessages)
            {
                warns.put(warn);
            }
            results.put("warns", warns);

            results.put("bytesWritten", bytesWritten.get());
            results.put("recordsWritten", recordsWritten.get());

            results.put("bytesRead", bytesRead.get());
            results.put("recordsRead", recordsRead.get());

            results.put("time", String.format("%.2f s",timeInSeconds));

            double readBandwidth = (double) bytesRead.get() / (1_000_000.0 * timeInSeconds);
            results.put("Read Bandwidth", String.format("%.2f MB/s", readBandwidth));

            double writeBandwidth = (double) bytesWritten.get() / (1_000_000.0 * timeInSeconds);
            results.put("Write Bandwidth", String.format("%.2f MB/s", writeBandwidth));

            operationResults.add(results);

            clear();
        }

        public JSONArray generateResultsMessage()
        {
            JSONArray results = new JSONArray();
            for (JSONObject result : operationResults)
            {
                results.put(result);
            }

            return results;
        }
    };

    private static enum FileFormat
    {
        THOR,
        PARQUET
    };

    private static class SplitEntry
    {
        public long recordCount = 0;
        public long splitStart = 0;
        public long splitEnd = 0;

        public JSONObject toJson()
        {
            JSONObject res = new JSONObject();
            res.put("recordCount", recordCount);
            res.put("splitStart", splitStart);
            res.put("splitEnd", splitEnd);

            return res;
        }

        public static SplitEntry fromJson(JSONObject json) throws IOException
        {
            SplitEntry split = new SplitEntry();
            split.recordCount = json.getLong("recordCount");
            split.splitStart = json.getLong("splitStart");
            split.splitEnd = json.getLong("splitEnd");

            return split;
        }
    }

    private static class SplitTable
    {
        public List splits = new ArrayList();
        private long splitStride = 1;
        private int maxSplitEntries = DEFAULT_SPLIT_TABLE_SIZE;
        private SplitEntry currentSplit = new SplitEntry();

        public SplitTable(int maxSplits)
        {
            maxSplitEntries = maxSplits;
            if (maxSplitEntries % 2 == 1)
            {
                maxSplitEntries++;
            }
        }

        public void addRecordPosition(long fileOffset)
        {
            if (currentSplit.recordCount == splitStride)
            {
                currentSplit.splitEnd = fileOffset;
                splits.add(currentSplit);

                currentSplit = new SplitEntry();
                currentSplit.splitStart = fileOffset;
            }

            if (splits.size() == maxSplitEntries)
            {
                compactSplitTable();
            }

            currentSplit.recordCount++;
        }

        public void finish(long fileSize)
        {
            currentSplit.splitEnd = fileSize;
            splits.add(currentSplit);
        }

        private void compactSplitTable()
        {
            splitStride *= 2;

            List newSplits = new ArrayList();
            for (int i = 0; i < splits.size(); i+=2)
            {
                SplitEntry first = splits.get(i);
                SplitEntry second = splits.get(i+1);

                SplitEntry combined = new SplitEntry();
                combined.splitStart = first.splitStart;
                combined.splitEnd = second.splitEnd;
                combined.recordCount = first.recordCount + second.recordCount;

                newSplits.add(combined);
            }

            splits = newSplits;
        }

        public JSONObject toJson()
        {
            JSONObject res = new JSONObject();
            res.put("splitStride", splitStride);
            res.put("maxSplitEntries", maxSplitEntries);

            JSONArray splitsJson = new JSONArray();
            for (int i = 0; i < splits.size(); i++)
            {
                splitsJson.put(splits.get(i).toJson());
            }

            res.put("splits", splitsJson);
            return res;
        }

        public static SplitTable fromJson(JSONObject json) throws IOException
        {
            int maxSplits = json.getInt("maxSplitEntries");
            SplitTable table = new SplitTable(maxSplits);

            table.splitStride = json.getLong("splitStride");
            JSONArray splitsJson = json.getJSONArray("splits");
            if (splitsJson != null)
            {
                for (int i = 0; i < splitsJson.length(); i++)
                {
                    table.splits.add(SplitEntry.fromJson(splitsJson.getJSONObject(i)));
                }
            }

            return table;
        }
    }

    private static class SplitFile
    {
        private List splitTables = new ArrayList();

        public SplitFile()
        {
        }

        public SplitFile(SplitTable[] tables)
        {
            splitTables.addAll(Arrays.asList(tables));
        }

        public SplitTable[] getSplitTableArray()
        {
            return splitTables.toArray(new SplitTable[0]);
        }

        public void load(FileInputStream inStream) throws IOException
        {
            long fileSize = inStream.getChannel().size();
            if (fileSize > Integer.MAX_VALUE)
            {
                throw new IOException("Error: Input file is too large to load.");
            }

            byte[] byteData = new byte[(int) fileSize];
            inStream.read(byteData);

            String jsonStr = new String(byteData, StandardCharsets.UTF_8);
            JSONObject data = new JSONObject(jsonStr);

            int version = data.getInt("version");
            if (version != 0)
            {
                throw new IOException("Error: Unsupported file format version: " + version + ", halting file load.");
            }

            JSONArray jsonSplitTables = data.getJSONArray("tables");
            if (jsonSplitTables != null)
            {
                for (int i = 0; i < jsonSplitTables.length(); i++)
                {
                    splitTables.add(SplitTable.fromJson(jsonSplitTables.getJSONObject(i)));
                }
            }
        }

        public void save(OutputStream outStream) throws IOException
        {
            JSONObject data = new JSONObject();

            JSONArray splitTablesJson = new JSONArray();
            for (int i = 0; i < splitTables.size(); i++)
            {
                splitTablesJson.put(splitTables.get(i).toJson());
            }

            data.put("version", 0);
            data.put("tables", splitTablesJson);

            byte[] byteData = data.toString().getBytes(StandardCharsets.UTF_8);

            outStream.write(byteData);
        }
    }

    private static Options getReadOptions()
    {
        Options options = new Options();
        options.addRequiredOption("url", "Source Cluster URL", true, "Specifies the URL of the ESP to connect to.");
        options.addOption("user", true, "Specifies the username used to connect. Defaults to null.");
        options.addOption("pass", true, "Specifies the password used to connect. Defaults to null.");
        options.addOption("format", true, "Specifies the output format to be used when writing files to disk. Defaults to Thor files.");
        options.addOption("num_threads", true, "Specifies the number of parallel to use to perform operations.");
        options.addOption("out", true, "Specifies the directory that the files should be written to.");

        options.addOption(Option.builder("read")
                                .argName("files")
                                .hasArgs()
                                .valueSeparator(',')
                                .desc("Reads the specified file(s) and writes a copy of the files to the local directory")
                                .required(true)
                                .build());
        return options;
    }

    private static Options getCopyOptions()
    {
        Options options = new Options();
        options.addRequiredOption("url", "Source Cluster URL", true, "Specifies the URL of the ESP to read from / write to.");
        options.addOption("user", true, "Specifies the username used to connect. Defaults to null.");
        options.addOption("pass", true, "Specifies the password used to connect. Defaults to null.");
        options.addRequiredOption("dest_cluster", "Destination Cluster Name", true, "Specifies the name of the cluster to write files back to.");
        options.addOption("dest_url", "Destination Cluster URL", true, "Specifies the URL of the ESP to write to.");
        options.addOption("num_threads", true, "Specifies the number of parallel to use to perform operations.");

        options.addOption(Option.builder("copy")
                                .argName("files")
                                .hasArgs()
                                .valueSeparator(' ')
                                .desc("Copies the specified remote source file to the specified remote destination cluster / file.")
                                .required(true)
                                .build());

        return options;
    }

    private static Options getWriteOptions()
    {
        Options options = new Options();
        options.addRequiredOption("url", "Source Cluster URL", true, "Specifies the URL of the ESP to read from / write to.");
        options.addOption("user", true, "Specifies the username used to connect. Defaults to null.");
        options.addOption("pass", true, "Specifies the password used to connect. Defaults to null.");
        options.addOption("dest_url", "Destination Cluster URL", true, "Specifies the URL of the ESP to write to.");
        options.addRequiredOption("dest_cluster", "Destination Cluster Name", true, "Specifies the name of the cluster to write files back to.");
        options.addOption("num_threads", true, "Specifies the number of parallel to use to perform operations.");

        options.addOption(Option.builder("write")
                                .argName("files")
                                .hasArgs()
                                .valueSeparator(' ')
                                .desc("Write the specified local files to the specified remote destination cluster / file.")
                                .required(true)
                                .build());

        return options;
    }

    private static Options getTopLevelOptions()
    {
        Options options = new Options();
        options.addOption("read", "Reads the specified file(s) and writes a copy of the files to the local directory.");
        options.addOption("copy", "Copies the specified remote source file to the specified remote destination cluster / file.");
        options.addOption("write", "Writes the specified local source file to the specified remote destination cluster / file.");

        return options;
    }

    public static String[] findFilesMatching(String filePath) throws Exception
    {
        boolean isWildcard = filePath.endsWith("*");
        if (!isWildcard)
        {
            File file = new File(filePath);
            if (!file.exists())
            {
                throw new Exception("File path is invalid: " + filePath);
            }

            String[] res = {filePath};
            return res;
        }

        int indexOfSep = filePath.lastIndexOf(File.separator)+1;
        String dirStr = filePath.substring(0,indexOfSep);
        String filePattern = filePath.substring(indexOfSep,filePath.length()-1);

        File dir = new File(dirStr);
        if (!dir.isDirectory() || !dir.exists())
        {
            throw new Exception("File path is invalid: " + filePath);
        }

        List result = new ArrayList();
        for(File file : dir.listFiles())
        {
            String name = file.getName();
            boolean startsWithPattern = name.startsWith(filePattern);
            if (startsWithPattern)
            {
                result.add(file.getAbsolutePath());
            }
        }

        return result.toArray(new String[0]);
    }

    private static FileFormat getFormat(String[] srcFiles) throws Exception
    {
        return FileFormat.THOR;
    }

    private static String getFormatExtension(FileFormat format)
    {
        return "";
    }

    private static FieldDef getRecordDefinition(String[] srcFiles, FileFormat format) throws Exception
    {
        switch (format)
        {
            case THOR:
            {
                String metaFile = null;
                for (int i = 0; i < srcFiles.length; i++)
                {
                    String file = srcFiles[i].toLowerCase();
                    if (file.endsWith(".meta"))
                    {
                        metaFile = file;
                    }
                }

                if (metaFile == null)
                {
                    throw new Exception("Unable to find Thor meta-data file.");
                }

                byte[] metaData = Files.readAllBytes(Paths.get(metaFile));
                String metaStr = new String(metaData, Charset.defaultCharset());

                JSONObject metaJson = new JSONObject(metaStr);
                return RecordDefinitionTranslator.parseJsonRecordDefinition(metaJson);
            }
            case PARQUET:
            default:
                throw new Exception("File format: " + format + " is not currently supported");
        }
    }

    private static SplitTable[] getSplitTables(String[] srcFiles, FileFormat format) throws Exception
    {
        if (format != FileFormat.THOR)
        {
            return null;
        }

        String splitFile = null;
        for (int i = 0; i < srcFiles.length; i++)
        {
            String file = srcFiles[i].toLowerCase();
            if (file.endsWith(".split"))
            {
                splitFile = file;
                break;
            }
        }

        if (splitFile == null)
        {
            return null;
        }

        FileInputStream inStream = new FileInputStream(splitFile);
        SplitFile file = new SplitFile();
        file.load(inStream);
        inStream.close();

        return file.getSplitTableArray();
    }

    private static String[] filterFilesByFormat(String[] srcFiles, FileFormat format) throws Exception
    {
        Pattern pattern = null;
        switch (format)
        {
            case THOR:
            {
                pattern = Pattern.compile("^[^\\.]*\\._[0-9]+_of_[0-9]+");
                break;
            }
            case PARQUET:
            default:
                throw new Exception("File format: " + format + " is not currently supported");
        }

        List filteredFiles = new ArrayList();
        for (int i = 0; i < srcFiles.length; i++)
        {
            int indexOfSep = srcFiles[i].lastIndexOf(File.separator)+1;
            String fileName = srcFiles[i].substring(indexOfSep);

            if (pattern.matcher(fileName).matches())
            {
                filteredFiles.add(srcFiles[i]);
            }
        }

        return filteredFiles.toArray(new String[0]);
    }

    private static void executeTasks(Runnable[] tasks, int numThreads) throws Exception
    {
        if (tasks.length > numThreads)
        {
            numThreads = tasks.length;
        }

        int numTasksPerThread = tasks.length / numThreads;
        int numResidualTasks = tasks.length % numThreads;

        int taskNum = 0;
        Thread[] taskThreads = new Thread[numThreads];
        for (int threadNum = 0; threadNum < numThreads; threadNum++)
        {
            int residualTasks = 0;
            if (threadNum < numResidualTasks)
            {
                residualTasks = 1;
            }

            final int currentTaskStart = taskNum;
            final int currentNumTasks = numTasksPerThread + residualTasks;

            taskThreads[threadNum] = new Thread(new Runnable()
            {
                Runnable[] subTasks = tasks;
                int startingSubTask = currentTaskStart;
                int numSubTasks = currentNumTasks;

                public void run()
                {
                    for (int j = 0; j < numSubTasks; j++)
                    {
                        subTasks[startingSubTask + j].run();
                    }
                }
            });

            taskNum += currentNumTasks;
            taskThreads[threadNum].start();
        }

        for (int threadNum = 0; threadNum < numThreads; threadNum++)
        {
            taskThreads[threadNum].join();
        }
    }

    private static Runnable[] createReadToThorTasks(DataPartition[] fileParts, SplitTable[] splitTables, String[] outFilePaths, FieldDef recordDef, TaskContext context) throws Exception
    {
        Runnable[] tasks = new Runnable[fileParts.length];
        for (int i = 0; i < tasks.length; i++)
        {
            final int taskIndex = i;
            final HpccRemoteFileReader filePartReader = new HpccRemoteFileReader(fileParts[taskIndex], recordDef, new HPCCRecordBuilder(recordDef));

            final String filePath = outFilePaths[taskIndex];
            final FileOutputStream outStream = new FileOutputStream(filePath);

            final BinaryRecordWriter filePartWriter = new BinaryRecordWriter(outStream);
            filePartWriter.initialize(new HPCCRecordAccessor(recordDef));

            tasks[taskIndex] = new Runnable()
            {
                HpccRemoteFileReader fileReader = filePartReader;
                BinaryRecordWriter fileWriter = filePartWriter;
                FileOutputStream outputStream = outStream;
                SplitTable splitTable = splitTables[taskIndex];

                public void run()
                {
                    try
                    {
                        while (fileReader.hasNext())
                        {
                            splitTable.addRecordPosition(fileReader.getStreamPosition());
                            HPCCRecord record = fileReader.next();
                            fileWriter.writeRecord(record);
                            context.recordsRead.incrementAndGet();
                        }

                        splitTable.finish(fileReader.getStreamPosition());

                        fileReader.close();
                        context.bytesRead.addAndGet(fileReader.getStreamPosition());

                        fileWriter.finalize();
                        outputStream.close();
                    }
                    catch (Exception e)
                    {
                        context.addError("Error while reading file: '" + filePath + "'," + taskIndex + ": " + e.getMessage());
                        return;
                    }
                }
            };
        }

        return tasks;
    }

    private static Runnable[] createThorSplitTableTasks(String[] thorFiles, SplitTable[] splitTables, FieldDef recordDef, TaskContext context) throws Exception
    {
        Runnable[] tasks = new Runnable[thorFiles.length];
        for (int i = 0; i < tasks.length; i++)
        {
            final int taskIndex = i;
            final SplitTable splitTable = new SplitTable(DEFAULT_SPLIT_TABLE_SIZE);
            splitTables[taskIndex] = splitTable;

            BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(thorFiles[taskIndex]));
            BinaryRecordReader filePartReader = new BinaryRecordReader(bufferedInputStream);
            filePartReader.initialize(new HPCCRecordBuilder(recordDef));

            tasks[taskIndex] = new Runnable()
            {
                InputStream inputStream = bufferedInputStream;
                BinaryRecordReader fileReader = filePartReader;

                public void run()
                {
                    try
                    {
                        while (fileReader.hasNext())
                        {
                            splitTable.addRecordPosition(fileReader.getStreamPosAfterLastRecord());
                            HPCCRecord record = (HPCCRecord) fileReader.getNext();
                        }

                        splitTable.finish(fileReader.getStreamPosAfterLastRecord());
                        inputStream.close();
                    }
                    catch (Exception e)
                    {
                        context.addError("Error while writing file taskIndex: " + taskIndex + " - " + e.getMessage());
                        return;
                    }
                }
            };
        }

        return tasks;
    }

    private static Runnable[] createNonRedistributingCopyTasks(HPCCFile file, DFUCreateFileWrapper createResult, TaskContext context) throws Exception
    {
        FieldDef recordDef = null;
        DataPartition[] inFileParts  = null;
        DataPartition[] outFileParts = null;

        inFileParts = file.getFileParts();
        recordDef = file.getRecordDefinition();

        DFUFilePartWrapper[] dfuFileParts = createResult.getFileParts();
        NullRemapper remapper = new NullRemapper(new RemapInfo(), createResult.getFileAccessInfo());
        outFileParts = DataPartition.createPartitions(dfuFileParts, remapper, dfuFileParts.length, createResult.getFileAccessInfoBlob());

        int incomingPerOutgoing = inFileParts.length / outFileParts.length;
        int residualIncomingFileParts = inFileParts.length % outFileParts.length;

        int incomingFilePartIndex = 0;
        Runnable[] tasks = new Runnable[outFileParts.length];
        for (int i = 0; i < tasks.length; i++)
        {
            final int taskIndex = i;

            DataPartition outFilePart = outFileParts[taskIndex];

            final int numIncomingParts = incomingPerOutgoing + ((taskIndex < residualIncomingFileParts) ? 1 : 0);
            HpccRemoteFileReader[] filePartReaders = new HpccRemoteFileReader[numIncomingParts];

            for (int j = 0; j < numIncomingParts; j++)
            {
                DataPartition inFilePart = inFileParts[incomingFilePartIndex + j];
                filePartReaders[j] = new HpccRemoteFileReader(inFilePart, recordDef, new HPCCRecordBuilder(file.getProjectedRecordDefinition()));
            }
            incomingFilePartIndex += numIncomingParts;

            HPCCRecordAccessor recordAccessor = new HPCCRecordAccessor(recordDef);
            final HPCCRemoteFileWriter partFileWriter = new HPCCRemoteFileWriter(outFilePart, recordDef, recordAccessor, CompressionAlgorithm.NONE);

            tasks[taskIndex] = new Runnable()
            {
                HpccRemoteFileReader[] fileReaders = filePartReaders;
                HPCCRemoteFileWriter fileWriter = partFileWriter;

                public void run()
                {
                    try
                    {
                        for (int k = 0; k < fileReaders.length; k++)
                        {
                            HpccRemoteFileReader fileReader = fileReaders[k];
                            while (fileReader.hasNext())
                            {
                                HPCCRecord record = fileReader.next();
                                fileWriter.writeRecord(record);
                                context.recordsWritten.incrementAndGet();
                                context.recordsRead.incrementAndGet();
                            }

                            fileReader.close();
                            context.bytesRead.addAndGet(fileReader.getStreamPosition());
                        }
                        System.out.println("Closing file writer for task: " + taskIndex);
                        fileWriter.close();
                        context.bytesWritten.addAndGet(fileWriter.getBytesWritten());
                    }
                    catch (Exception e)
                    {
                        context.addError("Error while copying file: '" + file.getFileName() + "'," + taskIndex + ": " + e.getMessage());
                        return;
                    }
                }
            };
        }

        return tasks;
    }

    /*
     * Redistribution notes:
     * Download file locally and build split table, or build split table if one does not exist.
     * Create write with redistribution using the split table
     */

    private static class SplitEntryMapping
    {
        int startingSrcFile = 0;
        int splitEntryStart = 0;

        int endingSrcFile = 0;
        int splitEntryEnd = 0;
    }

    private static Runnable[] createWriteTasks(String[] srcFiles, SplitTable[] splitTables, FieldDef recordDef, FileFormat format, DFUCreateFileWrapper createResult, TaskContext context) throws Exception
    {
        DataPartition[] outFileParts = null;

        DFUFilePartWrapper[] dfuFileParts = createResult.getFileParts();
        NullRemapper remapper = new NullRemapper(new RemapInfo(), createResult.getFileAccessInfo());
        outFileParts = DataPartition.createPartitions(dfuFileParts, remapper, dfuFileParts.length, createResult.getFileAccessInfoBlob());

        // Determine mapping from split entries to output file parts
        SplitEntryMapping[] srcFileToOutPartsMapping = new SplitEntryMapping[outFileParts.length];
        if (srcFiles.length != outFileParts.length)
        {
            int totalSplitEntries = 0;
            for (int i = 0; i < splitTables.length; i++)
            {
                totalSplitEntries += splitTables[i].splits.size();
            }

            int splitsPerOutFile = totalSplitEntries / outFileParts.length;
            int residualSplits = totalSplitEntries % outFileParts.length;

            int currentSrcFile = 0;
            int currentSrcFileSplitStart = 0;
            int currentSrcFileSplitEnd = splitTables[0].splits.size();
            int splitStart = 0;
            for (int i = 0; i < srcFileToOutPartsMapping.length; i++)
            {
                int numSplits = splitsPerOutFile + ((i < residualSplits ) ? 1 : 0);

                SplitEntryMapping mapping = new SplitEntryMapping();
                mapping.startingSrcFile = currentSrcFile;
                mapping.splitEntryStart = splitStart - currentSrcFileSplitStart;

                int splitEnd = splitStart + numSplits;
                while (currentSrcFileSplitEnd < splitEnd)
                {
                    currentSrcFile++;
                    currentSrcFileSplitStart = currentSrcFileSplitEnd;
                    currentSrcFileSplitEnd += splitTables[currentSrcFile].splits.size();
                }

                mapping.endingSrcFile = currentSrcFile;
                mapping.splitEntryEnd = splitEnd - currentSrcFileSplitStart;
                srcFileToOutPartsMapping[i] = mapping;

                splitStart = splitEnd;
            }
        }
        else
        {
            for (int i = 0; i < srcFileToOutPartsMapping.length; i++)
            {
                SplitEntryMapping mapping = new SplitEntryMapping();
                mapping.startingSrcFile = i;
                mapping.splitEntryStart = 0;
                mapping.endingSrcFile = i;
                mapping.splitEntryEnd = splitTables[i].splits.size();
                srcFileToOutPartsMapping[i] = mapping;
            }
        }

        Runnable[] tasks = new Runnable[outFileParts.length];
        for (int i = 0; i < tasks.length; i++)
        {
            final int taskIndex = i;

            DataPartition outFilePart = outFileParts[taskIndex];
            HPCCRecordAccessor recordAccessor = new HPCCRecordAccessor(recordDef);
            HPCCRemoteFileWriter filePartWriter = new HPCCRemoteFileWriter(outFilePart, recordDef, recordAccessor, CompressionAlgorithm.NONE);

            tasks[taskIndex] = new Runnable()
            {
                SplitEntryMapping mapping = srcFileToOutPartsMapping[taskIndex];
                HPCCRemoteFileWriter fileWriter = filePartWriter;

                public void run()
                {
                    try
                    {
                        int numIncomingParts = (mapping.endingSrcFile+1) - mapping.startingSrcFile;
                        BinaryRecordReader[] fileReaders = new BinaryRecordReader[numIncomingParts];
                        BufferedInputStream[] inputStreams = new BufferedInputStream[numIncomingParts];

                        for (int j = 0; j < numIncomingParts; j++)
                        {
                            String srcFile = srcFiles[mapping.startingSrcFile + j];
                            inputStreams[j] = new BufferedInputStream(new FileInputStream(srcFile));

                            if (j == 0)
                            {
                                SplitEntry startingSplit = splitTables[mapping.startingSrcFile].splits.get(mapping.splitEntryStart);
                                fileReaders[j] = new BinaryRecordReader(inputStreams[j], startingSplit.splitStart);
                            }
                            else
                            {
                                fileReaders[j] = new BinaryRecordReader(inputStreams[j]);
                            }

                            fileReaders[j].initialize(new HPCCRecordBuilder(recordDef));
                        }

                        for (int j = 0; j < fileReaders.length; j++)
                        {
                            BinaryRecordReader fileReader = fileReaders[j];
                            long splitEnd = Long.MAX_VALUE;
                            if (j == (fileReaders.length-1))
                            {
                                SplitEntry endingSplit = splitTables[mapping.endingSrcFile].splits.get(mapping.splitEntryEnd-1);
                                splitEnd = endingSplit.splitEnd;
                            }

                            while (fileReader.hasNext() && fileReader.getStreamPosAfterLastRecord() < splitEnd)
                            {
                                HPCCRecord record = (HPCCRecord) fileReader.getNext();
                                fileWriter.writeRecord(record);
                                context.recordsWritten.incrementAndGet();
                                context.recordsRead.incrementAndGet();
                            }

                            context.bytesRead.addAndGet(fileReader.getStreamPosAfterLastRecord());
                            inputStreams[j].close();
                        }
                        fileWriter.close();
                        context.bytesWritten.addAndGet(fileWriter.getBytesWritten());
                    }
                    catch (Exception e)
                    {
                        context.addError("Error while writing file taskIndex: " + taskIndex + " - " + e.getMessage());
                        return;
                    }
                }
            };
        }

        return tasks;
    }

    private static void performRead(String[] args, TaskContext context)
    {
        Options options = getReadOptions();
        CommandLineParser parser = new DefaultParser();

        CommandLine cmd = null;
        try
        {
            cmd = parser.parse(options, args);
        }
        catch (ParseException e)
        {
            System.out.println("Error parsing commandline options:\n" + e.getMessage());
            return;
        }

        String connString = cmd.getOptionValue("url");
        String user = cmd.getOptionValue("user");
        String pass = cmd.getOptionValue("pass");

        String outputPath = cmd.getOptionValue("out",".");

        int numThreads = NUM_DEFAULT_THREADS;
        String numThreadsStr = cmd.getOptionValue("num_threads", "" + numThreads);
        try
        {
            numThreads = Integer.parseInt(numThreadsStr);
        }
        catch(Exception e)
        {
            System.out.println("Invalid option value for num_threads: "
                              + numThreadsStr + ", must be an integer. Defaulting to: " + NUM_DEFAULT_THREADS + " threads.");
        }

        String formatStr = cmd.getOptionValue("format");
        if (formatStr == null)
        {
            formatStr = "THOR";
        }

        FileFormat format = FileFormat.THOR;
        switch (formatStr.toUpperCase())
        {
            case "THOR":
                format = FileFormat.THOR;
                break;
            case "PARQUET":
                format = FileFormat.PARQUET;
                break;
            default:
                System.out.println("Error unsupported format specified: " + format);
                return;
        }

        String[] datasets = cmd.getOptionValues("read");
        for (int i = 0; i < datasets.length; i++)
        {
            String datasetName = datasets[i];
            context.startOperation("Read " + datasetName);

            HPCCFile file = null;
            try
            {
                file = new HPCCFile(datasetName, connString, user, pass);
            }
            catch (Exception e)
            {
                System.out.println("Error while attempting to open file: '" + datasetName + "': " + e.getMessage());
                return;
            }

            DataPartition[] fileParts = null;
            FieldDef recordDef = null;
            try
            {
                fileParts = file.getFileParts();
                recordDef = file.getRecordDefinition();
            }
            catch (Exception e)
            {
                System.out.println("Error while retrieving file parts for: '" + datasetName + "': " + e.getMessage());
                return;
            }

            // Figure out the format string needed based on the number of file parts
            String lenStr = "" + fileParts.length;
            String fileNumFormat = "%0" + lenStr.length() + "d";
            String fileExt = getFormatExtension(format);

            File outDir = new File(outputPath);
            if (!outDir.exists())
            {
                outDir.mkdirs();
            }

            SplitTable[] splitTables = new SplitTable[fileParts.length];
            String[] outFilePaths = new String[fileParts.length];
            for (int j = 0; j < fileParts.length; j++)
            {
                String fileName = file.getFileName().replace(":","_");
                outFilePaths[j] = outputPath + File.separator + fileName + "._" + String.format(fileNumFormat, j+1) + "_of_" + fileParts.length + fileExt;

                splitTables[j] = new SplitTable(DEFAULT_SPLIT_TABLE_SIZE);
            }

            Runnable[] tasks = null;
            try
            {
                switch (format)
                {
                    case THOR:
                        tasks = createReadToThorTasks(fileParts, splitTables, outFilePaths, recordDef, context);
                        break;
                    case PARQUET:
                    default:
                        throw new Exception("Error unsupported format specified: " + format);
                };
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to create read tasks for file: '" + datasetName + "': " + e.getMessage());
                return;
            }

            try
            {
                executeTasks(tasks, numThreads);
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to execute read tasks for file: '" + datasetName + "': " + e.getMessage());
                return;
            }

            if (context.hasError())
            {
                return;
            }

            try
            {
                String fileName = file.getFileName().replace(":","_");
                String filePath = outputPath + File.separator + fileName + ".meta";
                FileOutputStream metaFile = new FileOutputStream(filePath);

                String metaStr = RecordDefinitionTranslator.toJsonRecord(file.getRecordDefinition()).toString();
                metaFile.write(metaStr.getBytes());
                metaFile.close();
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to write meta-data for file: '" + datasetName + "': " + e.getMessage());
                return;
            }

            try
            {
                String fileName = file.getFileName().replace(":","_");
                String filePath = outputPath + File.separator + fileName + ".split";
                FileOutputStream splitFileOut = new FileOutputStream(filePath);

                SplitFile splitFile = new SplitFile(splitTables);
                splitFile.save(splitFileOut);
                splitFileOut.close();
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to write split table file for dataset: '" + datasetName + "': " + e.getMessage());
                return;
            }

            context.endOperation();
        }
    }

    private static void performCopy(String[] args, TaskContext context)
    {
        Options options = getCopyOptions();
        CommandLineParser parser = new DefaultParser();

        CommandLine cmd = null;
        try
        {
            cmd = parser.parse(options, args);
        }
        catch (ParseException e)
        {
            System.out.println("Error parsing commandline options:\n" + e.getMessage());
            return;
        }

        int numThreads = NUM_DEFAULT_THREADS;
        String numThreadsStr = cmd.getOptionValue("num_threads", "" + numThreads);
        try
        {
            numThreads = Integer.parseInt(numThreadsStr);
        }
        catch(Exception e)
        {
            System.out.println("Invalid option value for num_threads: "
                              + numThreadsStr + ", must be an integer. Defaulting to: " + NUM_DEFAULT_THREADS + " threads.");
        }

        String user = cmd.getOptionValue("user");
        String pass = cmd.getOptionValue("pass");

        String destClusterName = cmd.getOptionValue("dest_cluster");

        String srcURL = cmd.getOptionValue("url");
        String destURL = cmd.getOptionValue("dest_url");
        if (destURL == null)
        {
            destURL = srcURL;
        }

        Connection destConn = null;
        try
        {
            destConn = new Connection(destURL);
        }
        catch (Exception e)
        {
            System.out.println("Error while attempting to construct connection: " + e.getMessage());
            return;
        }

        destConn.setCredentials(user, pass);
        Platform platform = Platform.get(destConn);
        HPCCWsClient wsclient = null;

        try
        {
            wsclient = platform.checkOutHPCCWsClient();
        }
        catch (Exception e)
        {
            System.out.println("Error while attempting to connect to platform: " + e.getMessage());
            return;
        }

        HPCCWsDFUClient dfuClient = wsclient.getWsDFUClient();

        String[] copyPairs = cmd.getOptionValues("copy");
        if ((copyPairs.length % 2) != 0)
        {
            System.out.println("Error copy operation must specify both a source and destination file pairs separated by a space.");
            return;
        }

        for (int i = 0; i < copyPairs.length; i+=2)
        {
            String srcFile = copyPairs[i];
            String destFile = copyPairs[i+1];

            context.startOperation("Copy " + srcFile + " -> " + destFile);

            HPCCFile file = null;
            try
            {
                file = new HPCCFile(srcFile, srcURL, user, pass);
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to open file: '" + srcFile + "': " + e.getMessage());
                return;
            }

            DataPartition[] srcFileParts = null;
            try
            {
                srcFileParts = file.getFileParts();
            }
            catch (HpccFileException e)
            {
                context.addError("Error while retrieving file parts for: '" + srcFile + "': " + e.getMessage());
            }

            boolean shouldRedistribute = true;
            if (!shouldRedistribute)
            {
                int expirySecs = 300;
                DFUCreateFileWrapper createResult = null;
                String eclRecordDefn = null;
                try
                {
                    eclRecordDefn = RecordDefinitionTranslator.toECLRecord(file.getRecordDefinition());
                    createResult = dfuClient.createFile(destFile, destClusterName, eclRecordDefn,
                                expirySecs, false, DFUFileTypeWrapper.Flat, "");
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to start file creation for: '" + destFile + "': " + e.getMessage());
                    return;
                }

                Runnable[] tasks = null;
                try
                {
                    tasks = createNonRedistributingCopyTasks(file, createResult, context);
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to create copy tasks for file: '" + srcFile + "': " + e.getMessage());
                    return;
                }

                try
                {
                    executeTasks(tasks, numThreads);
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to execute copy tasks for file: '" + srcFile + "': " + e.getMessage());
                    return;
                }

                if (context.hasError())
                {
                    return;
                }

                try
                {
                    long bytesWritten = context.bytesWritten.get();
                    long recordsWritten = context.recordsWritten.get();
                    dfuClient.publishFile(createResult.getFileID(), eclRecordDefn, recordsWritten, bytesWritten, true);
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to publish file: '" + destFile + "': " + e.getMessage());
                    return;
                }
            }
            else
            {
                String readArgs[] = {"-read", srcFile, "-url", srcURL,
                                "-format", "thor", "-user", user, "-pass", pass,
                                "-out", "tmp-read"};

                performRead(readArgs, context);

                String writeArgs[] = {"-write", "tmp-read" + File.separator +  srcFile.replace(':', '_') + "*" +  " " + destFile,
                                "-url", srcURL, "-dest_url", destURL,
                                "-dest_cluster", destClusterName,
                                "-user", user, "-pass", pass };

                performWrite(writeArgs, context);
            }

            context.endOperation();
        }
    }

    private static void performWrite(String[] args, TaskContext context)
    {
        Options options = getWriteOptions();
        CommandLineParser parser = new DefaultParser();

        CommandLine cmd = null;
        try
        {
            cmd = parser.parse(options, args);
        }
        catch (ParseException e)
        {
            System.out.println("Error parsing commandline options:\n" + e.getMessage());
            return;
        }

        int numThreads = NUM_DEFAULT_THREADS;
        String numThreadsStr = cmd.getOptionValue("num_threads", "" + numThreads);
        try
        {
            numThreads = Integer.parseInt(numThreadsStr);
        }
        catch(Exception e)
        {
            System.out.println("Invalid option value for num_threads: "
                              + numThreadsStr + ", must be an integer. Defaulting to: " + NUM_DEFAULT_THREADS + " threads.");
        }

        String user = cmd.getOptionValue("user");
        String pass = cmd.getOptionValue("pass");

        String destClusterName = cmd.getOptionValue("dest_cluster");

        String srcURL = cmd.getOptionValue("url");
        String destURL = cmd.getOptionValue("dest_url");
        if (destURL == null)
        {
            destURL = srcURL;
        }

        Connection destConn = null;
        try
        {
            destConn = new Connection(destURL);
        }
        catch (Exception e)
        {
            System.out.println("Error while attempting to construct connection: " + e.getMessage());
            return;
        }

        destConn.setCredentials(user, pass);
        Platform platform = Platform.get(destConn);
        HPCCWsClient wsclient = null;

        try
        {
            wsclient = platform.checkOutHPCCWsClient();
        }
        catch (Exception e)
        {
            System.out.println("Error while attempting to connect to platform: " + e.getMessage());
            return;
        }

        HPCCWsDFUClient dfuClient = wsclient.getWsDFUClient();

        String[] writePairs = cmd.getOptionValues("write");
        if ((writePairs.length % 2) != 0)
        {
            System.out.println("Error write operation must specify both a source and destination file pairs separated by a space.");
            return;
        }

        for (int pairIdx = 0; pairIdx < writePairs.length; pairIdx += 2)
        {
            String srcFile = writePairs[pairIdx];
            String destFile = writePairs[pairIdx+1];

            context.startOperation("Write " + srcFile + " -> " + destFile);

            SplitTable[] splitTables = null;
            String[] srcFiles = null;
            FileFormat format = FileFormat.THOR;
            FieldDef recordDefinition = null;
            try
            {
                srcFiles = findFilesMatching(srcFile);
                format = getFormat(srcFiles);
                recordDefinition = getRecordDefinition(srcFiles, format);
                splitTables = getSplitTables(srcFiles, format);
                srcFiles = filterFilesByFormat(srcFiles, format);
            }
            catch (Exception e)
            {
                context.addError("Error while constructing source file list: " + e.getMessage());
                return;
            }

            Arrays.sort(srcFiles);
            if (srcFiles.length == 0)
            {
                context.addError("Error no files matching: " + srcFile);
                return;
            }

            boolean needToCreateSplitTable = (splitTables == null || splitTables.length == 0) && format == FileFormat.THOR;
            if (needToCreateSplitTable)
            {
                Runnable[] tasks = null;
                try
                {
                    splitTables = new SplitTable[srcFiles.length];
                    tasks = createThorSplitTableTasks(srcFiles, splitTables, recordDefinition, context);
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to create split table creation tasks for file: '" + srcFile + "': " + e.getMessage());
                    return;
                }

                try
                {
                    executeTasks(tasks, numThreads);
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to execute create split table creation tasks for file: '" + srcFile + "': " + e.getMessage());
                    return;
                }
            }

            if (needToCreateSplitTable)
            {
                try
                {
                    String fileName = srcFiles[0].substring(0,srcFiles[0].lastIndexOf('.'));
                    String filePath = fileName + ".split";
                    FileOutputStream splitFileOut = new FileOutputStream(filePath);

                    SplitFile splitFile = new SplitFile(splitTables);
                    splitFile.save(splitFileOut);
                    splitFileOut.close();
                }
                catch (Exception e)
                {
                    context.addError("Error while attempting to write split table file for dataset: '" + srcFile + "': " + e.getMessage());
                    return;
                }
            }

            int expirySecs = 300;
            DFUCreateFileWrapper createResult = null;
            String eclRecordDefn = null;
            try
            {
                eclRecordDefn = RecordDefinitionTranslator.toECLRecord(recordDefinition);
                createResult = dfuClient.createFile(destFile, destClusterName, eclRecordDefn,
                            expirySecs, false, DFUFileTypeWrapper.Flat, "");
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to start file creation for: '" + destFile + "': " + e.getMessage());
                return;
            }

            Runnable[] tasks = null;
            try
            {
                tasks = createWriteTasks(srcFiles, splitTables, recordDefinition, format, createResult, context);
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to create write tasks for file: '" + srcFile + "': " + e.getMessage());
                return;
            }

            try
            {
                executeTasks(tasks, numThreads);
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to execute write tasks for file: '" + srcFile + "': " + e.getMessage());
                return;
            }

            if (context.hasError())
            {
                return;
            }

            try
            {
                long bytesWritten = context.bytesWritten.get();
                long recordsWritten = context.recordsWritten.get();
                dfuClient.publishFile(createResult.getFileID(), eclRecordDefn, recordsWritten, bytesWritten, true);
            }
            catch (Exception e)
            {
                context.addError("Error while attempting to publish file: '" + destFile + "': " + e.getMessage());
                return;
            }

            context.endOperation();
        }
    }

    /**
     * Executes an operation based on the provided args.
     * @param args Operation args
     * @return JSONArray
     */
    public static JSONArray run(String[] args)
    {
        Options options = getTopLevelOptions();
        CommandLineParser parser = new DefaultParser();

        CommandLine cmd = null;
        try
        {
            boolean stopAtNonOption = false;

            // Stop at non-option doesn't seem to work 1.5, so we are only taking the first arg to prevent unknown option exception
            String[] truncatedArgs = new String[1];
            truncatedArgs[0] = args[0];
            cmd = parser.parse(options, truncatedArgs, stopAtNonOption);
        }
        catch (ParseException e)
        {
            System.out.println("Error parsing commandline options:\n" + e.getMessage());
            return new JSONArray();
        }

        TaskContext context = new TaskContext();
        if (cmd.hasOption("read"))
        {
            performRead(args, context);
        }
        else if (cmd.hasOption("copy"))
        {
            performCopy(args, context);
        }
        else if (cmd.hasOption("write"))
        {
            performWrite(args, context);
        }

        // If we are still in the middle of an operation there was a failure
        if (context.hasOperation())
        {
            boolean succeded = false;
            context.endOperation(succeded);
        }

        return context.generateResultsMessage();
    }

    public static void main(String[] args)
    {
        JSONArray results = run(args);

        System.out.println("Results:\n--------------------------------------------------\n");
        System.out.println(results.toString(2));

        return;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy