org.icij.extract.tasks.LoadQueueTask Maven / Gradle / Ivy
package org.icij.extract.tasks;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.icij.extract.document.DocumentFactory;
import org.icij.extract.json.DocumentQueueDeserializer;
import org.icij.extract.queue.DocumentQueue;
import org.icij.extract.queue.DocumentQueueFactory;
import org.icij.task.DefaultTask;
import org.icij.task.annotation.Option;
import org.icij.task.annotation.OptionsClass;
import org.icij.task.annotation.Task;
import java.io.*;
import java.nio.file.Paths;
/**
* A command that loads a {@link DocumentQueue} from JSON.
*
*
*/
@Task("Load a queue from a JSON or CSV dump file. If no source path is given then the input is read from standard " +
"input.")
@OptionsClass(DocumentQueueFactory.class)
@Option(name = "format", description = "The dump file format. Defaults to JSON.", parameter = "csv|json")
@Option(name = "pathField", description = "The name of CSV field to parse the path from.", parameter = "name")
public class LoadQueueTask extends DefaultTask {
@Override
public Void call() throws Exception {
final DocumentFactory factory = new DocumentFactory().configure(options);
try (final InputStream input = new CloseShieldInputStream(System.in);
final DocumentQueue queue = new DocumentQueueFactory(options)
.withDocumentFactory(factory)
.createShared()) {
load(factory, queue, input);
}
return null;
}
@Override
public Void call(final String[] arguments) throws Exception {
final DocumentFactory factory = new DocumentFactory().configure(options);
try (final DocumentQueue queue = new DocumentQueueFactory(options)
.withDocumentFactory(factory)
.createShared()) {
for (String argument : arguments) {
load(factory, queue, argument);
}
} catch (FileNotFoundException e) {
throw new RuntimeException("Unable to open dump file for reading.", e);
}
return null;
}
/**
* Load a dump file from the given path into a queue.
*
* @param queue the queue to load the dump into
* @param path the path to load the dump from
* @throws IOException if the dump could not be loaded
*/
private void load(final DocumentFactory factory, final DocumentQueue queue, final String path) throws IOException {
try (final InputStream input = new BufferedInputStream(new FileInputStream(path))) {
load(factory, queue, input);
}
}
/**
* Load a dump file from the given path into a queue.
*
* @param queue the queue to load the dump into
* @param input the input stream to load the dump from
* @throws IOException if the dump could not be loaded
*/
private void load(final DocumentFactory factory, final DocumentQueue queue, final InputStream input) throws
IOException {
final String format = options.get("format").value().orElse("json");
if (format.toLowerCase().equals("csv")) {
loadFromCSV(factory, queue, input);
} else {
loadFromJSON(factory, queue, input);
}
}
/**
* Load a dump file from the given input stream into a queue.
*
* @param queue the queue to load the dump into
* @param input the input stream to load the dump from
* @throws IOException if the dump could not be loaded
*/
private void loadFromCSV(final DocumentFactory factory, final DocumentQueue queue, final InputStream input) throws
IOException {
final String pathField = options.get("pathField").value().orElse("path");
for (CSVRecord record : CSVFormat.RFC4180.withHeader().parse(new InputStreamReader(input))) {
queue.add(Paths.get(record.get(pathField)));
}
}
/**
* Load a JSON dump file from the given input stream into a queue.
*
* @param queue the queue to load the dump into
* @param input the input stream to load the dump from
* @throws IOException if the dump could not be loaded
*/
private void loadFromJSON(final DocumentFactory factory, final DocumentQueue queue, final InputStream input)
throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final SimpleModule module = new SimpleModule();
module.addDeserializer(DocumentQueue.class, new DocumentQueueDeserializer(factory, queue));
mapper.registerModule(module);
try (final JsonParser jsonParser = new JsonFactory().setCodec(mapper).createParser(input)) {
jsonParser.readValueAs(DocumentQueue.class);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy