All downloads are free. Search and download functionality uses the official Maven repository.

org.embulk.exec.SamplingParserPlugin Maven / Gradle / Ivy

package org.embulk.exec;

import static java.util.Locale.ENGLISH;
import static org.embulk.spi.util.Inputs.each;

import com.google.common.base.Preconditions;
import java.text.NumberFormat;
import java.util.List;
import org.embulk.config.Config;
import org.embulk.config.ConfigDefault;
import org.embulk.config.ConfigSource;
import org.embulk.config.Task;
import org.embulk.config.TaskReport;
import org.embulk.config.TaskSource;
import org.embulk.spi.Buffer;
import org.embulk.spi.BufferImpl;
import org.embulk.spi.Exec;
import org.embulk.spi.FileInput;
import org.embulk.spi.FileInputRunner;
import org.embulk.spi.InputPlugin;
import org.embulk.spi.Page;
import org.embulk.spi.PageOutput;
import org.embulk.spi.ParserPlugin;
import org.embulk.spi.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * Used by FileInputRunner.guess
 */
/**
 * Parser plugin that does not parse anything: it copies the leading bytes of a file
 * input into a {@link Buffer} and reports the outcome by throwing.
 *
 * <p>{@link #run} always ends by throwing either {@link SampledNoticeError} (carrying
 * the sample) or {@link NotEnoughSampleError} (carrying the number of bytes read).
 * {@link #runFileInputSampling} drives a {@link FileInputRunner} with this parser and
 * converts those signals into a return value or a {@code NoSampleException}.
 *
 * <p>Used by FileInputRunner.guess
 */
public class SamplingParserPlugin implements ParserPlugin {
    /**
     * Samples the given file input using the default sample-buffer configuration.
     *
     * @param runner runner wrapping the file input plugin to sample from
     * @param inputConfig the {@code in:} configuration; it is deep-copied, not modified
     * @return the sampled bytes
     */
    public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig) {
        return runFileInputSampling(runner, inputConfig, Exec.newConfigSource());
    }

    /**
     * Samples the given file input.
     *
     * <p>Every task is tried in order. The first task that yields at least the minimum
     * number of bytes wins. If none does, the task that yielded the most bytes is re-run
     * with the {@code force} flag set so that its (short) sample is accepted.
     *
     * @param runner runner wrapping the file input plugin to sample from
     * @param inputConfig the {@code in:} configuration; it is deep-copied, not modified
     * @param sampleBufferConfig configuration loaded as {@link SampleBufferTask}
     * @return the sampled bytes
     * @throws NoSampleException if there are no tasks, or no task produced any bytes
     */
    public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig, ConfigSource sampleBufferConfig) {
        final SampleBufferTask sampleBufferTask = loadSampleBufferTask(sampleBufferConfig);

        // override in.parser.type so that FileInputRunner creates SamplingParserPlugin
        ConfigSource samplingInputConfig = inputConfig.deepCopy();
        samplingInputConfig.getNestedOrSetEmpty("parser")
                .set("type", "system_sampling")
                .set("sample_buffer_bytes", sampleBufferTask.getSampleBufferBytes());
        samplingInputConfig.set("decoders", null);

        try {
            runner.transaction(samplingInputConfig, new InputPlugin.Control() {
                    // NOTE(review): raw List return type. The file imports TaskReport without
                    // using it, so this was presumably List<TaskReport> -- confirm against
                    // InputPlugin.Control before tightening.
                    @Override
                    public List run(TaskSource taskSource, Schema schema, int taskCount) {
                        if (taskCount == 0) {
                            throw new NoSampleException("No input files to read sample data");
                        }

                        // First pass: a task with a large enough sample escapes this loop via
                        // SampledNoticeError. Otherwise remember the task that read the most bytes.
                        int maxSize = -1;
                        int maxSizeTaskIndex = -1;
                        for (int taskIndex = 0; taskIndex < taskCount; taskIndex++) {
                            try {
                                runner.run(taskSource, schema, taskIndex, new RejectingPageOutput());
                            } catch (NotEnoughSampleError ex) {
                                if (maxSize < ex.getSize()) {
                                    maxSize = ex.getSize();
                                    maxSizeTaskIndex = taskIndex;
                                }
                            }
                        }
                        if (maxSize <= 0) {
                            throw new NoSampleException("All input files are empty");
                        }

                        // Second pass: re-run the best task with "force" set so that
                        // SamplingParserPlugin.run skips its minimum-size check.
                        taskSource.getNested("ParserTaskSource").set("force", true);
                        try {
                            runner.run(taskSource, schema, maxSizeTaskIndex, new RejectingPageOutput());
                        } catch (NotEnoughSampleError ex) {
                            throw new NoSampleException("All input files are smaller than minimum sampling size");
                        }
                        // Reached only if the forced run returned without throwing.
                        throw new NoSampleException("All input files are smaller than minimum sampling size");
                    }
                });
            throw new AssertionError("SamplingParserPlugin must throw SampledNoticeError");
        } catch (SampledNoticeError error) {
            return error.getSample();
        }
    }

    /**
     * Page sink used while sampling. Receiving a page means the input plugin produced
     * records instead of raw file bytes, so {@link #add} rejects it.
     */
    private static final class RejectingPageOutput implements PageOutput {
        @Override
        public void add(Page page) {
            throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration");  // TODO exception class
        }

        @Override
        public void finish() {}

        @Override
        public void close() {}
    }

    /** Thrown by {@link #run} to deliver the collected sample to the caller. */
    public static class SampledNoticeError extends Error {
        private final Buffer sample;

        public SampledNoticeError(Buffer sample) {
            this.sample = sample;
        }

        /** Returns the sampled bytes. */
        public Buffer getSample() {
            return sample;
        }
    }

    /** Thrown by {@link #run} when fewer bytes than the minimum sample size were read. */
    public static class NotEnoughSampleError extends Error {
        private final int size;

        public NotEnoughSampleError(int size) {
            this.size = size;
        }

        /** Returns the number of bytes that were actually read. */
        public int getSize() {
            return size;
        }
    }

    private static final Logger logger = LoggerFactory.getLogger(SamplingParserPlugin.class);

    private final NumberFormat numberFormat = NumberFormat.getNumberInstance(ENGLISH);
    private final int minSampleBufferBytes;

    /** Task type of this parser; currently identical to {@link SampleBufferTask}. */
    public interface PluginTask extends Task, SampleBufferTask {}

    /** Configuration of the sample buffer size. */
    public interface SampleBufferTask extends Task {
        /** Maximum number of bytes to read as a sample. */
        @Config("sample_buffer_bytes")
        @ConfigDefault("32768") // 32 * 1024
        public int getSampleBufferBytes();
    }

    public SamplingParserPlugin() {
        this.minSampleBufferBytes = 40;  // empty gzip file is 33 bytes. // TODO get sample size from system config
    }

    /**
     * Validates the configured sample size and hands the task to {@code control} with a
     * {@code null} schema.
     *
     * @throws IllegalArgumentException if {@code sample_buffer_bytes} is not larger than
     *     the minimum sample size
     */
    @Override
    public void transaction(ConfigSource config, ParserPlugin.Control control) {
        final PluginTask task = loadPluginTask(config);
        Preconditions.checkArgument(minSampleBufferBytes < task.getSampleBufferBytes(), "minSampleBufferBytes must be smaller than sample_buffer_bytes");

        logger.info("Try to read {} bytes from input source", numberFormat.format(task.getSampleBufferBytes()));
        control.run(task.dump(), null);
    }

    /**
     * Reads a sample from {@code input} and always terminates by throwing.
     *
     * @throws NotEnoughSampleError if the sample is shorter than the minimum size and the
     *     {@code force} flag is not set on {@code taskSource}
     * @throws SampledNoticeError in every other case, carrying the sample
     */
    @Override
    public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) {
        final PluginTask task = loadPluginTaskFromTaskSource(taskSource);
        Buffer buffer = readSample(input, task.getSampleBufferBytes());
        if (!taskSource.get(boolean.class, "force", false)) {
            if (buffer.limit() < minSampleBufferBytes) {
                throw new NotEnoughSampleError(buffer.limit());
            }
        }
        throw new SampledNoticeError(buffer);
    }

    /**
     * Reads up to {@code sampleBufferBytes} bytes from the next file of {@code fileInput}
     * into a newly allocated buffer.
     *
     * @param fileInput input to read from
     * @param sampleBufferBytes size of the buffer to allocate and fill
     * @return the buffer holding the sample
     */
    public static Buffer readSample(FileInput fileInput, int sampleBufferBytes) {
        return readSample(fileInput, BufferImpl.allocate(sampleBufferBytes), 0, sampleBufferBytes);
    }

    /**
     * Copies bytes from the next file of {@code fileInput} into {@code sample}, starting
     * at {@code offset}, until {@code sampleBufferBytes} is reached, {@code sample} is
     * full, or the file ends.
     *
     * <p>If {@code fileInput} has no next file, {@code sample} is returned untouched.
     * Otherwise its limit is set to the final write offset, even when reading fails.
     *
     * @param fileInput input to read from
     * @param sample destination buffer
     * @param offset position in {@code sample} at which to start writing
     * @param sampleBufferBytes write offset at which reading stops
     * @return {@code sample}
     */
    public static Buffer readSample(FileInput fileInput, Buffer sample, int offset, int sampleBufferBytes) {
        if (!fileInput.nextFile()) {
            // no input files
            return sample;
        }

        try {
            for (Buffer buffer : each(fileInput)) {
                try {
                    int size = Math.min(buffer.limit(), sample.capacity() - offset);
                    sample.setBytes(offset, buffer, 0, size);
                    offset += size;
                } finally {
                    // Release the chunk even if the copy above throws.
                    buffer.release();
                }
                // Stop once the requested amount is reached, or once the sample buffer is
                // full: nothing more can be copied, so reading on would only discard input.
                if (offset >= sampleBufferBytes || offset >= sample.capacity()) {
                    break;
                }
            }
        } finally {
            sample.limit(offset);
        }
        return sample;
    }

    @SuppressWarnings("deprecation") // https://github.com/embulk/embulk/issues/1301
    private static SampleBufferTask loadSampleBufferTask(final ConfigSource config) {
        return config.loadConfig(SampleBufferTask.class);
    }

    @SuppressWarnings("deprecation") // https://github.com/embulk/embulk/issues/1301
    private static PluginTask loadPluginTask(final ConfigSource config) {
        return config.loadConfig(PluginTask.class);
    }

    @SuppressWarnings("deprecation") // https://github.com/embulk/embulk/issues/1301
    private static PluginTask loadPluginTaskFromTaskSource(final TaskSource taskSource) {
        return taskSource.loadTask(PluginTask.class);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy