
// org.icij.extract.tasks.CopyTask (Maven / Gradle / Ivy)
package org.icij.extract.tasks;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.icij.extract.IndexType;
import org.icij.extract.solr.SolrCopyConsumer;
import org.icij.extract.solr.SolrMachine;
import org.icij.extract.solr.SolrMachineConsumer;
import org.icij.extract.solr.SolrMachineProducer;
import org.icij.spewer.http.PinnedHttpClientBuilder;
import org.icij.task.MonitorableTask;
import org.icij.task.annotation.Option;
import org.icij.task.annotation.Task;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
/**
 * Task that copies index fields from one field to another, or back onto the same field to force
 * reindexing.
 *
 * <p>Every command-line argument is one field mapping. An argument of the form
 * {@code source:destination} is split on the first colon; an argument without a colon is recorded
 * with a {@code null} destination. How a {@code null} destination or a wildcard is interpreted is
 * decided by the consumer and producer this task delegates to, not by this class.
 *
 * <p>Only the Solr index type is implemented.
 */
@Task("Copy Solr fields from one field to another, or back to the same field to force reindexing.\n\n" +
        "Both literal mappings and wildcards are supported, for example \"field_a:field_b\" and \"field_*\".")
@Option(name = "indexType", description = "Specify the index type. For now, the only valid value is " +
        "\"solr\" (the default).", parameter = "type")
@Option(name = "address", description = "Index core API endpoint address.", code = "s", parameter = "url")
@Option(name = "serverCertificate", description = "The index server's public certificate, used for " +
        "certificate pinning. Supported formats are PEM, DER, PKCS #12 and JKS.", parameter = "path")
@Option(name = "verifyHost", description = "Verify the index server's public certificate against the " +
        "specified host. Use the wildcard \"*\" to disable verification.", parameter = "hostname")
@Option(name = "commit", description = "Perform a commit when done.", code = "c")
@Option(name = "softCommit", description = "Performs a soft commit. Makes index changes visible while " +
        "neither fsync-ing index files nor writing a new index descriptor. This could lead to data loss if Solr is " +
        "terminated unexpectedly.")
@Option(name = "idField", description = "Index field for an automatically generated identifier. The ID " +
        "for the same file is guaranteed not to change if the path doesn't change. Defaults to \"id\".", code = "i",
        parameter = "name")
@Option(name = "filter", description = "Filter for documents to copy.", code = "f", parameter = "query")
@Option(name = "jobs", description = "The number of documents to process at a time. Defaults to the number" +
        " of available processors", parameter = "number")
// NOTE(review): the superclass is written as a raw type here. If MonitorableTask is generic, its
// type argument was probably lost when this listing was extracted - confirm against upstream.
public class CopyTask extends MonitorableTask {

    /**
     * The default number of jobs to run: the processor count reported by the JVM when this class
     * is initialised.
     */
    private static final int DEFAULT_JOBS = Runtime.getRuntime().availableProcessors();

    /**
     * Parses the field mappings and runs the copy against the configured index.
     *
     * @param mappings field mappings, each either {@code source:destination} or a bare field name
     * @return the value returned by the underlying copy machine
     * @throws IllegalArgumentException if {@code mappings} is {@code null} or empty
     * @throws IllegalStateException if the configured index type is not Solr
     * @throws Exception if the copy itself fails
     */
    @Override
    public Long call(final String[] mappings) throws Exception {
        if (null == mappings || 0 == mappings.length) {
            throw new IllegalArgumentException("You must pass the field mappings on the command line.");
        }

        final int jobs = options.get("jobs").parse().asInteger().orElse(DEFAULT_JOBS);
        final IndexType indexType = options.get("indexType").parse().asEnum(IndexType::parse).orElse(IndexType.SOLR);

        // Keys are source fields; values are destination fields, or null when the argument had no colon.
        final Map<String, String> map = new HashMap<>();
        for (final String mapping : mappings) {
            // Limit of 2: only the first colon separates source from destination.
            final String[] fields = mapping.split(":", 2);
            map.put(fields[0], fields.length > 1 ? fields[1] : null);
        }

        if (IndexType.SOLR != indexType) {
            throw new IllegalStateException("Not implemented.");
        }

        return copySolr(map, jobs);
    }

    /**
     * Runs the task without arguments. This delegates with {@code null} mappings, which
     * {@link #call(String[])} rejects.
     *
     * @return never returns normally when delegation reaches {@link #call(String[])}
     * @throws IllegalArgumentException because no field mappings were supplied
     */
    @Override
    public Long call() throws Exception {
        return call(null);
    }

    /**
     * Copy the fields of a Solr index.
     *
     * <p>Builds an HTTP client from the {@code verifyHost} and {@code serverCertificate} options,
     * connects to the {@code address} option (falling back to {@code http://127.0.0.1:8983/solr/}),
     * runs the copy machine, and then optionally commits. Both clients are closed on exit.
     *
     * @param map source field to destination field mappings; a value may be {@code null}
     * @param jobs value handed to both the producer and the machine as their job count
     * @return the value returned by {@code SolrMachine.call()}
     * @throws RuntimeException wrapping any {@link SolrServerException} or {@link IOException}
     * @throws Exception any other failure raised while copying
     */
    private Long copySolr(final Map<String, String> map, final int jobs) throws Exception {
        try (
                final CloseableHttpClient httpClient = PinnedHttpClientBuilder.createWithDefaults()
                        .setVerifyHostname(options.get("verifyHost").value().orElse(null))
                        .pinCertificate(options.get("serverCertificate").value().orElse(null))
                        .build();
                final SolrClient client = new HttpSolrClient.Builder(
                        options.get("address").value().orElse("http://127.0.0.1:8983/solr/"))
                        .withHttpClient(httpClient)
                        .build()
        ) {
            final SolrMachineConsumer consumer = new SolrCopyConsumer(client, map);
            final SolrMachineProducer producer = new SolrMachineProducer(client, map.keySet(), jobs);
            final SolrMachine machine = new SolrMachine(consumer, producer, jobs);

            consumer.setNotifiable(monitor);
            producer.setNotifiable(monitor);

            // Only override the ID field when the option was given; otherwise both sides keep their own default.
            final Optional<String> idField = options.get("idField").value();
            if (idField.isPresent()) {
                final String name = idField.get();
                consumer.setIdField(name);
                producer.setIdField(name);
            }

            options.get("filter").value().ifPresent(producer::setFilter);

            final Long copied = machine.call();
            machine.terminate();

            // softCommit wins when both flags are on. The third commit argument selects a soft commit.
            if (options.get("softCommit").parse().isOn()) {
                client.commit(true, true, true);
            } else if (options.get("commit").parse().isOn()) {
                client.commit(true, true, false);
            }

            return copied;
        } catch (SolrServerException e) {
            throw new RuntimeException("Unable to copy.", e);
        } catch (IOException e) {
            throw new RuntimeException("Unable to copy because of an error while communicating with Solr.", e);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy