// org.apache.solr.update.processor.CloneFieldUpdateProcessorFactory Maven / Gradle / Ivy
// Show all versions of solr-core Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Clones the values found in any matching <code>source</code> field into a configured
* <code>dest</code> field.
*
* <p>The <code>source</code> field(s) can be configured as either:
*
* <ul>
*   <li>One or more <code>&lt;str&gt;</code>
*   <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code>
*   <li>A <code>&lt;lst&gt;</code> containing {@link FieldMutatingUpdateProcessorFactory
*       FieldMutatingUpdateProcessorFactory style selector arguments}
* </ul>
*
* <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code> containing the literal
* name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a regex
* <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
* is used, the pattern will be matched against all fields matched by the source selector, and the
* replacement string (including any capture groups specified from the pattern) will be evaluated
* using {@link Matcher#replaceAll(String)} to generate the literal name of the destination field.
*
* <p>If the resolved <code>dest</code> field already exists in the document, then the values from
* the <code>source</code> fields will be added to it. The "boost" value associated with the
* <code>dest</code> will not be changed, and any boost specified on the <code>source</code> fields
* will be ignored. (If the <code>dest</code> field did not exist prior to this processor, the
* newly created <code>dest</code> field will have the default boost of 1.0)
*
* <p>In the example below:
*
* <ul>
*   <li>The <code>category</code> field will be cloned into the <code>category_s</code> field
*   <li>Both the <code>authors</code> and <code>editors</code> fields will be cloned into the
*       <code>contributors</code> field
*   <li>Any field with a name ending in <code>_price</code> -- except for <code>list_price</code>
*       -- will be cloned into the <code>all_prices</code> field
*   <li>Any field name beginning with feat and ending in s (i.e. feats or features) will be cloned
*       into a field prefixed with key_ and not ending in s. (i.e. key_feat or key_feature)
* </ul>
*
*
*
*
* <pre class="prettyprint">
* &lt;updateRequestProcessorChain name="multiple-clones"&gt;
*   &lt;processor class="solr.CloneFieldUpdateProcessorFactory"&gt;
*     &lt;str name="source"&gt;category&lt;/str&gt;
*     &lt;str name="dest"&gt;category_s&lt;/str&gt;
*   &lt;/processor&gt;
*   &lt;processor class="solr.CloneFieldUpdateProcessorFactory"&gt;
*     &lt;arr name="source"&gt;
*       &lt;str&gt;authors&lt;/str&gt;
*       &lt;str&gt;editors&lt;/str&gt;
*     &lt;/arr&gt;
*     &lt;str name="dest"&gt;contributors&lt;/str&gt;
*   &lt;/processor&gt;
*   &lt;processor class="solr.CloneFieldUpdateProcessorFactory"&gt;
*     &lt;lst name="source"&gt;
*       &lt;str name="fieldRegex"&gt;.*_price$&lt;/str&gt;
*       &lt;lst name="exclude"&gt;
*         &lt;str name="fieldName"&gt;list_price&lt;/str&gt;
*       &lt;/lst&gt;
*     &lt;/lst&gt;
*     &lt;str name="dest"&gt;all_prices&lt;/str&gt;
*   &lt;/processor&gt;
*   &lt;processor class="solr.processor.CloneFieldUpdateProcessorFactory"&gt;
*     &lt;lst name="source"&gt;
*       &lt;str name="fieldRegex"&gt;^feat(.*)s$&lt;/str&gt;
*     &lt;/lst&gt;
*     &lt;lst name="dest"&gt;
*       &lt;str name="pattern"&gt;^feat(.*)s$&lt;/str&gt;
*       &lt;str name="replacement"&gt;key_feat$1&lt;/str&gt;
*     &lt;/lst&gt;
*   &lt;/processor&gt;
* &lt;/updateRequestProcessorChain&gt;
* </pre>
*
* <p>In common case situations where you wish to use a single regular expression as both a
* <code>fieldRegex</code> selector and a destination <code>pattern</code>, a "short hand" syntax
* is supported for convenience: The <code>pattern</code> and <code>replacement</code> may be
* specified at the top level, omitting <code>source</code> and <code>dest</code> declarations
* completely, and the <code>pattern</code> will be used to construct an equivalent
* <code>source</code> selector internally.
*
* <p>For example, both of the following configurations are equivalent:
*
* <pre class="prettyprint">
* &lt;!-- full syntax --&gt;
* &lt;processor class="solr.processor.CloneFieldUpdateProcessorFactory"&gt;
*   &lt;lst name="source"&gt;
*     &lt;str name="fieldRegex"&gt;^feat(.*)s$&lt;/str&gt;
*   &lt;/lst&gt;
*   &lt;lst name="dest"&gt;
*     &lt;str name="pattern"&gt;^feat(.*)s$&lt;/str&gt;
*     &lt;str name="replacement"&gt;key_feat$1&lt;/str&gt;
*   &lt;/lst&gt;
* &lt;/processor&gt;
*
* &lt;!-- syntactic sugar syntax --&gt;
* &lt;processor class="solr.processor.CloneFieldUpdateProcessorFactory"&gt;
*   &lt;str name="pattern"&gt;^feat(.*)s$&lt;/str&gt;
*   &lt;str name="replacement"&gt;key_feat$1&lt;/str&gt;
* &lt;/processor&gt;
* </pre>
*
* <p>When cloning multiple fields (or a single multivalued field) into a single valued field, one
* of the {@link FieldValueSubsetUpdateProcessorFactory} implementations configured after the
* <code>CloneFieldUpdateProcessorFactory</code> can be useful to reduce the list of values down
* to a single value.
*
* @see FieldValueSubsetUpdateProcessorFactory
* @since 4.0.0
*/
public class CloneFieldUpdateProcessorFactory extends UpdateRequestProcessorFactory
implements SolrCoreAware {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

/** Name of the init param that selects the field(s) to clone from. */
public static final String SOURCE_PARAM = "source";

/** Name of the init param that identifies the field to clone into. */
public static final String DEST_PARAM = "dest";

/** Name of the init param holding the regex used to derive the destination field name. */
public static final String PATTERN_PARAM = "pattern";

/** Name of the init param holding the replacement string evaluated against the pattern. */
public static final String REPLACEMENT_PARAM = "replacement";

/** Selector criteria for the source fields; replaced or filled in by the init helpers. */
private SelectorParams srcInclusions = new SelectorParams();

/**
 * Selector criteria parsed from each "exclude" child of the source config. Typed (rather than raw)
 * so that it can be iterated as {@code SelectorParams} when the selector is built.
 */
private Collection<SelectorParams> srcExclusions = new ArrayList<>();

/** The effective source selector; stays null until {@code inform(SolrCore)} builds it. */
private FieldNameSelector srcSelector = null;

/**
 * If pattern is null, then this is a literal field name. If pattern is non-null then this is a
 * replacement string that may contain meta-characters (ie: capture group identifiers)
 *
 * @see #pattern
 */
private String dest = null;

/**
 * Compiled destination pattern, or null when {@link #dest} is a literal field name.
 *
 * @see #dest
 */
private Pattern pattern = null;
/**
 * Returns the source field selector.
 *
 * @return the selector, never null
 * @throws SolrException (SERVER_ERROR) if the selector has not been initialized yet
 */
@SuppressWarnings("WeakerAccess")
protected final FieldNameSelector getSourceSelector() {
  if (srcSelector == null) {
    // the selector is only assigned by inform(SolrCore), so null means it has not run
    throw new SolrException(
        SERVER_ERROR, "selector was never initialized, " + " inform(SolrCore) never called???");
  }
  return srcSelector;
}
/**
 * Parses the init params, delegating to the helper that matches the syntax in use.
 *
 * <p>The helpers remove the params they consume from {@code args}; anything still present
 * afterwards is reported as an unexpected init param.
 *
 * @param args the init params of this factory
 * @throws SolrException (SERVER_ERROR) if neither a source + dest pair nor a pattern +
 *     replacement pair is present, or if unexpected params remain
 */
@Override
public void init(NamedList args) {
  // Loose, presence-only detection of the syntax in use; the helpers do the strict validation.
  final boolean fullSyntax =
      args.indexOf(SOURCE_PARAM, 0) >= 0 && args.indexOf(DEST_PARAM, 0) >= 0;

  if (fullSyntax) {
    initSourceSelectorSyntax(args);
  } else {
    final boolean shortHand =
        args.indexOf(PATTERN_PARAM, 0) >= 0 && args.indexOf(REPLACEMENT_PARAM, 0) >= 0;
    if (!shortHand) {
      throw new SolrException(
          SERVER_ERROR,
          "A combination of either '"
              + SOURCE_PARAM
              + "' + '"
              + DEST_PARAM
              + "', or '"
              + REPLACEMENT_PARAM
              + "' + '"
              + PATTERN_PARAM
              + "' init params are mandatory");
    }
    initSimpleRegexReplacement(args);
  }

  // whatever the helper did not consume is not something we know how to handle
  if (args.size() > 0) {
    throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
  }

  super.init(args);
}
/**
 * Init helper for the "short hand" syntax, where {@code pattern} and {@code replacement} are the
 * only top level init params and the pattern doubles as the source field selector.
 *
 * <p>Called by {@code init} when "source" and "dest" are not both present; this method itself
 * rejects configs in which either one of them is present. On success the pattern and replacement
 * have been removed from {@code args}, and {@code dest}, {@code pattern} and {@code srcInclusions}
 * have been set.
 *
 * @param args the init params; the consumed params are removed
 * @throws SolrException (SERVER_ERROR) if the short hand syntax is mixed with the full syntax or
 *     with any other param, if either value is missing or not a string, or if the pattern is not
 *     a valid regex
 */
private void initSimpleRegexReplacement(NamedList args) {
  // The syntactic sugar for the case where there is only one regex pattern for source and the
  // same pattern is used for the destination pattern...
  //
  //  pattern != null && replacement != null
  //
  // ...as top level elements, with no other config options specified

  // if we got here we know we had pattern and replacement, now check for the other two so that
  // we can give a better message than "unexpected"
  if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0)) {
    throw new SolrException(
        SERVER_ERROR,
        "Short hand syntax must not be mixed with full syntax. Found "
            + PATTERN_PARAM
            + " and "
            + REPLACEMENT_PARAM
            + " but also found "
            + SOURCE_PARAM
            + " or "
            + DEST_PARAM);
  }

  assert args.indexOf(SOURCE_PARAM, 0) < 0;

  Object patt = args.remove(PATTERN_PARAM);
  Object replacement = args.remove(REPLACEMENT_PARAM);

  if (null == patt || null == replacement) {
    throw new SolrException(
        SERVER_ERROR,
        "Init params '"
            + PATTERN_PARAM
            + "' and '"
            + REPLACEMENT_PARAM
            + "' are both mandatory if '"
            + SOURCE_PARAM
            + "' and '"
            + DEST_PARAM
            + "' are not both specified");
  }

  // the short hand syntax may not be combined with anything else at the top level
  if (0 != args.size()) {
    throw new SolrException(
        SERVER_ERROR,
        "Init params '"
            + REPLACEMENT_PARAM
            + "' and '"
            + PATTERN_PARAM
            + "' must be children of '"
            + DEST_PARAM
            + "' to be combined with other options.");
  }

  if (!(replacement instanceof String)) {
    throw new SolrException(
        SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
  }
  if (!(patt instanceof String)) {
    throw new SolrException(
        SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
  }

  dest = replacement.toString();
  try {
    this.pattern = Pattern.compile(patt.toString());
  } catch (PatternSyntaxException pe) {
    throw new SolrException(
        SERVER_ERROR,
        "Init param " + PATTERN_PARAM + " is not a valid regex pattern: " + patt,
        pe);
  }

  // the same compiled pattern is used to select the source fields
  srcInclusions = new SelectorParams();
  srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
}
/**
 * Init helper for the full syntax, where both "source" and "dest" are specified.
 *
 * <p>"source" may be one or more strings, or a single list of selector options (optionally with
 * "exclude" children). "dest" may be a string naming the destination field, or a list containing
 * a "pattern" and a "replacement". Should only be called when both "source" and "dest" are known
 * to be present; both are removed from {@code args}.
 *
 * @param args the init params; the consumed params are removed
 * @throws SolrException (SERVER_ERROR) if the full syntax is mixed with the short hand syntax, or
 *     if "source" or "dest" is malformed
 */
private void initSourceSelectorSyntax(NamedList args) {
  // Full and complete syntax where source and dest are mandatory.
  //
  // source may be a single string or a selector.
  // dest may be a single string or list containing pattern and replacement
  //
  //   source != null && dest != null

  // if we got here we know we had source and dest, now check for the other two so that we can
  // give a better message than "unexpected"
  if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
    throw new SolrException(
        SERVER_ERROR,
        "Short hand syntax must not be mixed with full syntax. Found "
            + SOURCE_PARAM
            + " and "
            + DEST_PARAM
            + " but also found "
            + PATTERN_PARAM
            + " or "
            + REPLACEMENT_PARAM);
  }

  Object d = args.remove(DEST_PARAM);
  assert null != d;

  List<?> sources = args.getAll(SOURCE_PARAM);
  assert null != sources;

  if (1 == sources.size()) {
    if (sources.get(0) instanceof NamedList) {
      // nested set of selector options
      NamedList<?> selectorConfig = (NamedList<?>) args.remove(SOURCE_PARAM);

      // NOTE(review): parseSelectorParams is presumably expected to remove the options it
      // recognizes, since leftovers are reported as unexpected below -- confirm
      srcInclusions = parseSelectorParams(selectorConfig);

      List<?> excList = selectorConfig.getAll("exclude");

      for (Object excObj : excList) {
        if (null == excObj) {
          throw new SolrException(
              SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child 'exclude' can not be null");
        }
        if (!(excObj instanceof NamedList)) {
          throw new SolrException(
              SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child 'exclude' must be <lst/>");
        }
        NamedList<?> exc = (NamedList<?>) excObj;
        srcExclusions.add(parseSelectorParams(exc));
        if (0 < exc.size()) {
          // report the leftover entry of the exclude list itself, not of its parent
          throw new SolrException(
              SERVER_ERROR,
              "Init param '"
                  + SOURCE_PARAM
                  + "' has unexpected 'exclude' sub-param(s): '"
                  + exc.getName(0)
                  + "'");
        }
        // call once per instance
        selectorConfig.remove("exclude");
      }

      if (0 < selectorConfig.size()) {
        throw new SolrException(
            SERVER_ERROR,
            "Init param '"
                + SOURCE_PARAM
                + "' contains unexpected child param(s): '"
                + selectorConfig.getName(0)
                + "'");
      }
      // consume from the named list so it doesn't interfere with subsequent processing
      sources.remove(0);
    }
  }

  if (1 <= sources.size()) {
    // source better be one or more strings
    srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs(SOURCE_PARAM));
  }

  if (srcInclusions == null) {
    throw new SolrException(
        SERVER_ERROR,
        "Init params do not specify anything to clone, please supply either "
            + SOURCE_PARAM
            + " and "
            + DEST_PARAM
            + " or "
            + PATTERN_PARAM
            + " and "
            + REPLACEMENT_PARAM
            + ". See javadocs "
            + "for CloneFieldUpdateProcessorFactory for further details.");
  }

  if (d instanceof NamedList) {
    // dest is a list: it must contain exactly a pattern and a replacement, both strings
    NamedList<?> destList = (NamedList<?>) d;

    Object patt = destList.remove(PATTERN_PARAM);
    Object replacement = destList.remove(REPLACEMENT_PARAM);

    if (null == patt || null == replacement) {
      throw new SolrException(
          SERVER_ERROR,
          "Init param '"
              + DEST_PARAM
              + "' children '"
              + PATTERN_PARAM
              + "' and '"
              + REPLACEMENT_PARAM
              + "' are both mandatory and can not be null");
    }
    if (!(patt instanceof String && replacement instanceof String)) {
      throw new SolrException(
          SERVER_ERROR,
          "Init param '"
              + DEST_PARAM
              + "' children '"
              + PATTERN_PARAM
              + "' and '"
              + REPLACEMENT_PARAM
              + "' must both be strings (i.e. <str>)");
    }
    if (0 != destList.size()) {
      throw new SolrException(
          SERVER_ERROR,
          "Init param '"
              + DEST_PARAM
              + "' has unexpected children: '"
              + destList.getName(0)
              + "'");
    }

    try {
      this.pattern = Pattern.compile(patt.toString());
    } catch (PatternSyntaxException pe) {
      throw new SolrException(
          SERVER_ERROR,
          "Init param '"
              + DEST_PARAM
              + "' child '"
              + PATTERN_PARAM
              + "' is not a valid regex pattern: "
              + patt,
          pe);
    }
    dest = replacement.toString();

  } else if (d instanceof String) {
    // dest is the literal name of the destination field; pattern stays null
    dest = d.toString();
  } else {
    throw new SolrException(
        SERVER_ERROR,
        "Init param '"
            + DEST_PARAM
            + "' must either be a string "
            + "(i.e. <str>) or a list (i.e. <lst>) containing '"
            + PATTERN_PARAM
            + "' and '"
            + REPLACEMENT_PARAM
            + "'");
  }
}
/**
 * Builds the effective source selector: a selector for the configured inclusions, wrapped once
 * per configured exclusion.
 *
 * @param core the core whose resource loader is used to create the selectors
 */
@Override
public void inform(final SolrCore core) {
  // start from the inclusion criteria...
  srcSelector =
      FieldMutatingUpdateProcessor.createFieldNameSelector(
          core.getResourceLoader(),
          core,
          srcInclusions,
          FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);

  // ...then layer each exclusion on top of what has been built so far
  for (SelectorParams exclusion : srcExclusions) {
    final FieldNameSelector current = srcSelector;
    final FieldNameSelector exclusionSelector =
        FieldMutatingUpdateProcessor.createFieldNameSelector(
            core.getResourceLoader(),
            core,
            exclusion,
            FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
    srcSelector = FieldMutatingUpdateProcessor.wrap(current, exclusionSelector);
  }
}
@Override
public final UpdateRequestProcessor getInstance(
SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
final FieldNameSelector srcSelector = getSourceSelector();
return new UpdateRequestProcessor(next) {
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
final SolrInputDocument doc = cmd.getSolrInputDocument();
// destination may be regex replace string, which can cause multiple output fields.
Map destMap = new HashMap<>();
// preserve initial values and boost (if any)
for (final String fname : doc.getFieldNames()) {
if (!srcSelector.shouldMutate(fname)) continue;
Collection