All downloads are free. Search and download functionality uses the official Maven repository.

com.marklogic.hub.step.impl.FileCollector Maven / Gradle / Ivy

There is a newer version: 6.1.1
Show newest version
package com.marklogic.hub.step.impl;

import com.marklogic.client.ext.helper.LoggingObject;
import com.marklogic.hub.HubClientConfig;
import com.marklogic.hub.util.DiskQueue;
import org.apache.commons.io.FilenameUtils;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Stream;

public class FileCollector extends LoggingObject {

    private final String inputFormat;
    private final Set textExts = new HashSet<>(Collections.singletonList("txt"));
    private final Set jsonExts = new HashSet<>(Collections.singletonList("json"));
    private final Set csvExts = new HashSet<>(Arrays.asList("txt", "csv", "tsv", "psv"));
    private final Set xmlExts = new HashSet<>(Arrays.asList("xml", "xhtml", "html"));
    private final Map> fileFormats;
    private final HubClientConfig hubClientConfig;

    public FileCollector(String inputFormat) {
        this(inputFormat, null);
    }

    public FileCollector(String inputFormat, HubClientConfig hubClientConfig) {
        this.inputFormat = inputFormat.toLowerCase();

        fileFormats = new HashMap<>();
        fileFormats.put("text", textExts);
        fileFormats.put("json", jsonExts);
        fileFormats.put("csv", csvExts);
        fileFormats.put("xml", xmlExts);
        this.hubClientConfig = hubClientConfig;
    }

    public DiskQueue run(Path dirPath) {
        if (!(Files.exists(dirPath)) || !(Files.isDirectory(dirPath))) {
            throw new RuntimeException("The path doesn't exist or is not a directory: " + dirPath);
        }

        DiskQueue results;
        try {
            results = new DiskQueue<>(hubClientConfig);

            if (logger.isInfoEnabled()) {
                logger.info("Finding files in directory: " + dirPath);
            }

            try (Stream files = Files.find(dirPath,
                Integer.MAX_VALUE,
                (filePath, fileAttr) -> fileAttr.isRegularFile())) {
                files.forEach(path -> {
                    File file = path.toFile();
                    if (acceptFile(file.getName())) {
                        results.add(file.getAbsolutePath());
                    }
                });
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return results;
    }

    protected boolean acceptFile(String filename) {
        if (filename == null) {
            return false;
        }

        final String fileExtension = FilenameUtils.getExtension(filename).toLowerCase();

        if (fileExtension.trim().isEmpty() && ("json".equals(inputFormat) || "xml".equals(inputFormat))) {
            return true;
        }

        if (fileFormats.containsKey(inputFormat) && fileFormats.get(inputFormat).contains(fileExtension)) {
            return true;
        }

        return "binary".equals(inputFormat)
            && !csvExts.contains(fileExtension)
            && !jsonExts.contains(fileExtension)
            && !xmlExts.contains(fileExtension);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy