All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.icij.extract.solr.SolrRehashConsumer Maven / Gradle / Ivy

There is a newer version: 7.4.0
Show newest version
package org.icij.extract.solr;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.icij.spewer.FieldNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.bind.DatatypeConverter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * A consumer that recalculates ID hashes of documents after a simple
 * regular expression replacement on the path.
 *
 * This is useful when you want to change the paths of documents that
 * have already been added to Solr.
 *
 * For every consumed document the value of the path field is rewritten with the configured
 * pattern and replacement, the rewritten path is encoded with the output encoding and digested
 * with the ID algorithm, and the hex-encoded digest becomes the new ID. When the new ID differs
 * from the current one, a copy of the document carrying the new ID, path and parent path is
 * added and the document with the old ID is deleted.
 */
public class SolrRehashConsumer extends SolrMachineConsumer {

	private static final Logger logger = LoggerFactory.getLogger(SolrRehashConsumer.class);

	private final SolrClient client;
	private final String idAlgorithm;
	private Pattern pattern = null;
	private String replacement = "";
	private Charset outputEncoding = StandardCharsets.UTF_8;
	private String pathField = FieldNames.DEFAULT_PATH_FIELD;

	/**
	 * Creates a consumer that writes rehashed documents through the given client.
	 *
	 * @param client the client used to add the rehashed document and delete the old one
	 * @param idAlgorithm the algorithm name passed to {@link MessageDigest#getInstance(String)}
	 */
	public SolrRehashConsumer(final SolrClient client, final String idAlgorithm) {
		super();
		this.client = client;
		this.idAlgorithm = idAlgorithm;
	}

	/**
	 * Sets the charset used to turn the rewritten path into bytes before hashing.
	 * Defaults to UTF-8.
	 *
	 * @param outputEncoding the charset to encode paths with
	 */
	public void setOutputEncoding(final Charset outputEncoding) {
		this.outputEncoding = outputEncoding;
	}

	/**
	 * Sets the name of the field that the path is read from and written back to.
	 * Defaults to {@link FieldNames#DEFAULT_PATH_FIELD}.
	 *
	 * @param pathField the name of the path field
	 */
	public void setPathField(final String pathField) {
		this.pathField = pathField;
	}

	/**
	 * Sets the regular expression whose matches in the path are replaced.
	 * Until a pattern is set, paths are left unchanged.
	 *
	 * @param pattern the regular expression, compiled with {@link Pattern#compile(String)}
	 */
	public void setPattern(final String pattern) {
		this.pattern = Pattern.compile(pattern);
	}

	/**
	 * Sets the replacement applied to every match of the pattern. Defaults to the empty string.
	 * Passing {@code null} disables the replacement, leaving paths unchanged.
	 *
	 * @param replacement the replacement string, as understood by
	 * {@link java.util.regex.Matcher#replaceAll(String)}
	 */
	public void setReplacement(final String replacement) {
		this.replacement = replacement;
	}

	/**
	 * Rewrites the path of the given document, recalculates its ID from the rewritten path and,
	 * if the ID changed, adds the document under the new ID and deletes the old one.
	 *
	 * @param input the document to rehash
	 * @throws SolrServerException if adding the new document or deleting the old one fails
	 * @throws IOException if communication through the client fails
	 * @throws NoSuchAlgorithmException if the configured ID algorithm is not available
	 * @throws NullPointerException if the document has no value for the path field or the ID field
	 */
	@Override
	protected void consume(final SolrDocument input) throws SolrServerException, IOException, NoSuchAlgorithmException {
		final String inputPath = (String) input.getFieldValue(pathField);

		// Fail with a message naming the missing field instead of a bare NullPointerException
		// thrown from inside the matcher or the digest call.
		if (null == inputPath) {
			throw new NullPointerException(String.format("The document has no value for the path field \"%s\".",
					pathField));
		}

		final String inputId = (String) input.getFieldValue(idField);

		if (null == inputId) {
			throw new NullPointerException(String.format("The document at path \"%s\" has no value for the ID " +
					"field \"%s\".", inputPath, idField));
		}

		final String outputPath = rewritePath(inputPath);
		final String outputId = hashPath(outputPath);

		// If the hash hasn't changed, skip.
		// Skip by comparing the hash values and not the paths because the algorithm might have been changed.
		if (inputId.equals(outputId)) {
			return;
		}

		// Only needed when the document is actually rewritten, so it is computed after the skip check.
		final String outputPathParent = Objects.toString(Paths.get(outputPath).getParent(), "");
		final SolrInputDocument output = new SolrInputDocument();

		// Start from a full copy of the input; the fields that change are overwritten below.
		for (String name: input.getFieldNames()) {
			output.addField(name, input.getFieldValue(name));
		}

		output.setField("_version_", "-1"); // The document must not exist.
		output.setField(idField, outputId);
		output.setField(pathField, outputPath);
		output.setField(FieldNames.DEFAULT_PARENT_PATH_FIELD, outputPathParent);

		logger.info("Replacing path \"{}\" with \"{}\" and rehashing ID from \"{}\" to \"{}\".",
				inputPath, outputPath, inputId, outputId);

		// Add first so that the old document is only removed once the new one has been accepted.
		client.add(output);
		client.deleteById(inputId);
	}

	/**
	 * Applies the configured pattern and replacement to a path.
	 *
	 * @param path the original path
	 * @return the rewritten path, or the original path if no pattern or no replacement is set
	 */
	private String rewritePath(final String path) {
		if (null == pattern || null == replacement) {
			return path;
		}

		return pattern.matcher(path).replaceAll(replacement);
	}

	/**
	 * Calculates the document ID for a path.
	 *
	 * @param path the path to hash
	 * @return the hex-encoded digest of the path's bytes in the output encoding
	 * @throws NoSuchAlgorithmException if the configured ID algorithm is not available
	 */
	private String hashPath(final String path) throws NoSuchAlgorithmException {
		final byte[] digest = MessageDigest.getInstance(idAlgorithm).digest(path.getBytes(outputEncoding));

		return DatatypeConverter.printHexBinary(digest);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy