com.marklogic.client.datamovement.JacksonCSVSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of marklogic-client-api Show documentation
The official MarkLogic Java client API.
The newest version!
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */

package com.marklogic.client.datamovement;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.UUID;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;
import com.marklogic.client.MarkLogicIOException;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.io.JacksonHandle;

/**
 * The JacksonCSVSplitter class uses the Jackson CSV parser without attempting to abstract it capabilities.
 * The application can override defaults by configuring the Jackson ObjectReader and CsvSchema including parsing TSV
 */
public class JacksonCSVSplitter implements Splitter {
    private CsvSchema csvSchema = null;
    private CsvMapper csvMapper;
    private long count = 0;
    private ArrayNode headers = null;

    /**
     * The CsvMapper configured for the current instance.
     * @return the CsvMapper for the current instance.
     */
    public CsvMapper getCsvMapper() {
        return csvMapper;
    }

    /**
     * Used to set the CsvSchema for the current instance.
     * @param schema is the CsvSchema passed in.
     * @return an instance of JacksonCSVSplitter with CsvSchema set to the parameter.
     */
    public JacksonCSVSplitter withCsvSchema(CsvSchema schema) {
        this.csvSchema = schema;
        return this;
    }

    /**
     * Used to set the CsvMapper for the current instance.
     * @param mapper is the CsvMapper passed in.
     * @return an instance of JacksonCSVSplitter with CsvMapper set to the parameter.
     */
    public JacksonCSVSplitter withCsvMapper(CsvMapper mapper) {
        this.csvMapper = mapper;
        return this;
    }

    /**
     * The CsvSchema configured for the current instance.
     * @return the CsvSchema for the current instance.
     */
    public CsvSchema getCsvSchema() {
        return csvSchema;
    }

    private CsvMapper configureCsvMapper() {
        if(csvMapper == null) {
        csvMapper = new CsvMapper()
                .configure(CsvParser.Feature.ALLOW_TRAILING_COMMA, true)
                .configure(CsvParser.Feature.FAIL_ON_MISSING_COLUMNS, false)
                .configure(CsvParser.Feature.IGNORE_TRAILING_UNMAPPABLE, false)
                .configure(CsvParser.Feature.INSERT_NULLS_FOR_MISSING_COLUMNS, false)
                .configure(CsvParser.Feature.SKIP_EMPTY_LINES, true)
                .configure(CsvParser.Feature.TRIM_SPACES, true)
                .configure(CsvParser.Feature.WRAP_AS_ARRAY, false)
                .configure(CsvParser.Feature.IGNORE_TRAILING_UNMAPPABLE, true);
        }
        return csvMapper;
    }

    /**
     * Takes the input stream and converts it into a stream of JacksonHandle by setting the schema
     *  and wrapping the JsonNode into JacksonHandle.
     * @param input the input stream passed in.
     * @return a stream of JacksonHandle.
     * @throws IOException if the input cannot be split
     */
    @Override
    public Stream split(InputStream input) throws IOException {
        if(input == null) {
            throw new IllegalArgumentException("InputSteam cannot be null.");
        }
        return configureInput(configureObjReader().readValues(input));
    }

    /**
     * Takes the input stream and converts it into a stream of JacksonHandle by setting the schema
     *  and wrapping the JsonNode into JacksonHandle.
     * @param input the Reader stream passed in.
     * @return a stream of JacksonHandle.
     * @throws IOException if the input cannot be split
     */
    public Stream split(Reader input) throws IOException {
        if(input == null) {
            throw new IllegalArgumentException("Input cannot be null.");
        }
        Iterator nodeItr = configureObjReader().readValues(input);
        return configureInput(nodeItr);
    }


    /**
     * Takes the input stream and converts it into a stream of DocumentWriteOperation by setting the schema
     * and wrapping the JsonNode into DocumentWriteOperation.
     * @param input is the incoming input stream.
     * @return a stream of DocumentWriteOperation.
     * @throws Exception if the input cannot be split
     */
    @Override
    public Stream splitWriteOperations(InputStream input) throws Exception {
        return splitWriteOperations(input, null);
    }

    /**
     * Takes the input stream and the input name, then converts the input into a stream of DocumentWriteOperation
     * by setting the schema and wrapping the JsonNode into DocumentWriteOperation.
     * @param input is the incoming input stream.
     * @param splitFilename the name of the input stream, including name and extension. It is used to generate URLs for
     *                  split files.The splitFilename could either be provided here or in user-defined UriMaker.
     * @return a stream of DocumentWriteOperation.
     * @throws Exception if the input cannot be split
     */
    @Override
    public Stream splitWriteOperations(InputStream input, String splitFilename) throws Exception {
        if (input == null) {
            throw new IllegalArgumentException("Input cannot be null");
        }

        if (getUriMaker() == null) {
            JacksonCSVSplitter.UriMakerImpl uriMaker = new UriMakerImpl();
            setUriMaker(uriMaker);
        }

        if (splitFilename != null) {
            getUriMaker().setSplitFilename(splitFilename);
        }

        Iterator nodeItr = configureObjReader().readValues(input);
        return configureInputDocumentWriteOperation(nodeItr);
    }

    /**
     * Takes the input Reader and converts it into a stream of DocumentWriteOperation by setting the schema
     * and wrapping the JsonNode into DocumentWriteOperation.
     * @param input is the incoming input Reader.
     * @return a stream of DocumentWriteOperation.
     * @throws Exception if the input cannot be split
     */
    public Stream splitWriteOperations(Reader input) throws Exception {
        return splitWriteOperations(input, null);
    }

    /**
     * Takes the input Reader and the input name, then converts the input Reader into a stream of DocumentWriteOperation
     * by setting the schema and wrapping the JsonNode into DocumentWriteOperation.
     * @param input is the incoming input Reader.
     * @param splitFilename the name of the input Reader, including name and extension. It is used to generate URLs for
     *                  split files.The splitFilename could either be provided here or in user-defined UriMaker.
     * @return a stream of DocumentWriteOperation.
     * @throws Exception if the input cannot be split
     */
    public Stream splitWriteOperations(Reader input, String splitFilename) throws Exception {
        if (input == null) {
            throw new IllegalArgumentException("Input cannot be null");
        }

        if (getUriMaker() == null) {
            JacksonCSVSplitter.UriMakerImpl uriMaker = new UriMakerImpl();
            setUriMaker(uriMaker);
        }

        if (splitFilename != null) {
            getUriMaker().setSplitFilename(splitFilename);
        }

        //for case file.csv, to generate uris with extension "json"
        //for default UriMaker only, not custom UriMaker
        if (getUriMaker() instanceof JacksonCSVSplitter.UriMakerImpl) {
            ((UriMakerImpl) getUriMaker()).setExtension("json");
        }

        Iterator nodeItr = configureObjReader().readValues(input);
        return configureInputDocumentWriteOperation(nodeItr);
    }

    /**
     * The number of JsonNodes found so far.
     * @return the number of JsonNodes found in the input stream.
     */
    @Override
    public long getCount() {
        return this.count;
    }

    /**
     * The headers of the csv file.
     * @return the headers found in the csv file.
     */
    public ArrayNode getHeaders() {
        return this.headers;
    }

    private void incrementCount() {
        this.count++;
    }

    private ObjectReader configureObjReader() {
        this.count=0;
        CsvSchema firstLineSchema = getCsvSchema()!=null? getCsvSchema():CsvSchema.emptySchema().withHeader();
        CsvMapper csvMapper = getCsvMapper()!=null ? getCsvMapper() : configureCsvMapper();
        ObjectReader objectReader = csvMapper.readerFor(JsonNode.class);

        return objectReader.with(firstLineSchema);
    }

    private JacksonHandle wrapJacksonHandle(JsonNode content) {
        incrementCount();
        return new JacksonHandle(content);
    }

    private DocumentWriteOperation wrapDocumentWriteOperation(JsonNode content) {
        JacksonHandle handle = wrapJacksonHandle(content);
        String uri = uriMaker.makeUri(count, handle);

        return new DocumentWriteOperationImpl(
                DocumentWriteOperation.OperationType.DOCUMENT_WRITE,
                uri,
                null,
                handle
        );
    }

    private PeekingIterator configureSplitObj(Iterator nodeItr){
        if (nodeItr == null || !nodeItr.hasNext()) {
            throw new MarkLogicIOException("No header found.");
        }
        PeekingIterator peekingIterator = new PeekingIterator(nodeItr);

        Iterator headerValue = peekingIterator.getFirst().fieldNames();
        this.headers = new ObjectMapper().createArrayNode();
        while (headerValue.hasNext()) {
            headers.add(headerValue.next());
        }

        return peekingIterator;
    }

    private Stream configureInput(Iterator nodeItr) {

        if(getCsvSchema() == null) {
            PeekingIterator peekingIterator = configureSplitObj(nodeItr);
            return StreamSupport.stream(Spliterators.spliteratorUnknownSize(peekingIterator, Spliterator.ORDERED), false).map(this::wrapJacksonHandle);
        }
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(nodeItr, Spliterator.ORDERED), false).map(this::wrapJacksonHandle);
    }

    private Stream configureInputDocumentWriteOperation(Iterator nodeItr) {
        if(getCsvSchema() == null) {
            PeekingIterator peekingIterator = configureSplitObj(nodeItr);
            return StreamSupport.stream(Spliterators.spliteratorUnknownSize(peekingIterator, Spliterator.ORDERED), false).map(this::wrapDocumentWriteOperation);
        }
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(nodeItr, Spliterator.ORDERED), false).map(this::wrapDocumentWriteOperation);
    }

    private JacksonCSVSplitter.UriMaker uriMaker;

    /**
     * Get the UriMaker of the splitter
     * @return the UriMaker of the splitter
     */
    public JacksonCSVSplitter.UriMaker getUriMaker() {
        return this.uriMaker;
    }

    /**
     * Set the UriMaker to the splitter
     * @param uriMaker the uriMaker to generate URI of each split file.
     */
    public void setUriMaker(JacksonCSVSplitter.UriMaker uriMaker) {
        this.uriMaker = uriMaker;
    }

    /**
     * UriMaker which generates URI for each split file
     */
    public interface UriMaker extends Splitter.UriMaker {
        /**
         * Generates URI for each split
         * @param num the count of each split
         * @param handle the handle which contains the content of each split. It could be utilized to make a meaningful
         *               document URI.
         * @return the generated URI of current split
         */
        String makeUri(long num, JacksonHandle handle);
    }

    private static class UriMakerImpl extends com.marklogic.client.datamovement.impl.UriMakerImpl implements UriMaker {
        @Override
        public String makeUri(long num, JacksonHandle handle) {
            StringBuilder uri = new StringBuilder();

            if (getInputAfter() != null && getInputAfter().length() != 0) {
                uri.append(getInputAfter());
            }

            if (getSplitFilename() != null && getSplitFilename().length() != 0) {
                uri.append(getName());
            }

            if (uri.length() == 0) {
                uri.append("/");
            }

            uri.append(num).append("_").append(UUID.randomUUID()).append(".json");
            return uri.toString();
        }
    }
}