All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.update.processor.CloneFieldUpdateProcessorFactory Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.update.processor;

import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Clones the values found in any matching source field into 
 * a configured dest field.
 * 

* The source field(s) can be configured as either: *

*
    *
  • One or more <str>
  • *
  • An <arr> of <str>
  • *
  • A <lst> containing {@link FieldMutatingUpdateProcessorFactory FieldMutatingUpdateProcessorFactory style selector arguments}
  • *
* *

The dest field can be a single <str> * containing the literal name of a destination field, or it may be a <lst> specifying a * regex pattern and a replacement string. If the pattern + replacement option * is used the pattern will be matched against all fields matched by the source selector, and the replacement * string (including any capture groups specified from the pattern) will be evaluated a using * {@link Matcher#replaceAll(String)} to generate the literal name of the destination field. *

* *

If the resolved dest field already exists in the document, then the * values from the source fields will be added to it. The * "boost" value associated with the dest will not be changed, * and any boost specified on the source fields will be ignored. * (If the dest field did not exist prior to this processor, the * newly created dest field will have the default boost of 1.0) *

*

* In the example below: *

*
    *
  • The category field will be cloned into the category_s field
  • *
  • Both the authors and editors fields will be cloned into the * contributors field *
  • *
  • Any field with a name ending in _price -- except for * list_price -- will be cloned into the all_prices *
  • *
  • Any field name beginning with feat and ending in s (i.e. feats or features) * will be cloned into a field prefixed with key_ and not ending in s. (i.e. key_feat or key_feature) *
  • *
* * *
 *   <updateRequestProcessorChain name="multiple-clones">
 *     <processor class="solr.CloneFieldUpdateProcessorFactory">
 *       <str name="source">category</str>
 *       <str name="dest">category_s</str>
 *     </processor>
 *     <processor class="solr.CloneFieldUpdateProcessorFactory">
 *       <arr name="source">
 *         <str>authors</str>
 *         <str>editors</str>
 *       </arr>
 *       <str name="dest">contributors</str>
 *     </processor>
 *     <processor class="solr.CloneFieldUpdateProcessorFactory">
 *       <lst name="source">
 *         <str name="fieldRegex">.*_price$</str>
 *         <lst name="exclude">
 *           <str name="fieldName">list_price</str>
 *         </lst>
 *       </lst>
 *       <str name="dest">all_prices</str>
 *     </processor>
 *     <processor class="solr.processor.CloneFieldUpdateProcessorFactory">
 *       <lst name="source">
 *         <str name="fieldRegex">^feat(.*)s$</str>
 *       </lst>
 *       <lst name="dest">
 *         <str name="pattern">^feat(.*)s$</str>
 *         <str name="replacement">key_feat$1</str>
 *       </str>
 *     </processor>
 *   </updateRequestProcessorChain>
 * 
* *

* In common case situations where you wish to use a single regular expression as both a * fieldRegex selector and a destination pattern, a "short hand" syntax * is support for convinience: The pattern and replacement may be specified * at the top level, omitting source and dest declarations completely, and * the pattern will be used to construct an equivalent source selector internally. *

*

* For example, both of the following configurations are equivalent: *

*
 * <!-- full syntax -->
 * <processor class="solr.processor.CloneFieldUpdateProcessorFactory">
 *   <lst name="source">
 *     <str name="fieldRegex"^gt;$feat(.*)s$</str>
 *   </lst>
 *   <lst name="dest">
 *     <str name="pattern">^feat(.*)s$</str>
 *     <str name="replacement">key_feat$1</str>
 *   </str>
 * </processor>
 * 
 * <!-- syntactic sugar syntax -->
 * <processor class="solr.processor.CloneFieldUpdateProcessorFactory">
 *   <str name="pattern">^feat(.*)s$</str>
 *   <str name="replacement">key_feat$1</str>
 * </processor>
 * 
* *

* When cloning multiple fields (or a single multivalued field) into a single valued field, one of the * {@link FieldValueSubsetUpdateProcessorFactory} implementations configured after the * CloneFieldUpdateProcessorFactory can be useful to reduce the list of values down to a * single value. *

* * @see FieldValueSubsetUpdateProcessorFactory * @since 4.0.0 */ public class CloneFieldUpdateProcessorFactory extends UpdateRequestProcessorFactory implements SolrCoreAware { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); public static final String SOURCE_PARAM = "source"; public static final String DEST_PARAM = "dest"; public static final String PATTERN_PARAM = "pattern"; public static final String REPLACEMENT_PARAM = "replacement"; private SelectorParams srcInclusions = new SelectorParams(); private Collection srcExclusions = new ArrayList<>(); private FieldNameSelector srcSelector = null; /** * If pattern is null, this this is a literal field name. If pattern is non-null then this * is a replacement string that may contain meta-characters (ie: capture group identifiers) * @see #pattern */ private String dest = null; /** @see #dest */ private Pattern pattern = null; @SuppressWarnings("WeakerAccess") protected final FieldNameSelector getSourceSelector() { if (null != srcSelector) return srcSelector; throw new SolrException(SERVER_ERROR, "selector was never initialized, "+ " inform(SolrCore) never called???"); } @SuppressWarnings("unchecked") @Override public void init(@SuppressWarnings({"rawtypes"})NamedList args) { // high level (loose) check for which type of config we have. // // individual init methods do more strict syntax checking if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0) ) { initSourceSelectorSyntax(args); } else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) { initSimpleRegexReplacement(args); } else { throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+ DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" + PATTERN_PARAM + "' init params are mandatory"); } if (0 < args.size()) { throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'"); } super.init(args); } /** * init helper method that should only be called when we know for certain that both the * "source" and "dest" init params do not exist. */ @SuppressWarnings("unchecked") private void initSimpleRegexReplacement(@SuppressWarnings({"rawtypes"})NamedList args) { // The syntactic sugar for the case where there is only one regex pattern for source and the same pattern // is used for the destination pattern... // // pattern != null && replacement != null // // ...as top level elements, with no other config options specified // if we got here we know we had pattern and replacement, now check for the other two so that we can give a better // message than "unexpected" if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) { throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM); } assert args.indexOf(SOURCE_PARAM, 0) < 0; Object patt = args.remove(PATTERN_PARAM); Object replacement = args.remove(REPLACEMENT_PARAM); if (null == patt || null == replacement) { throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+ DEST_PARAM + "' are not both specified"); } if (0 != args.size()) { throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" + PATTERN_PARAM + "' must be children of '" + DEST_PARAM + "' to be combined with other options."); } if (!(replacement instanceof String)) { throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. )"); } if (!(patt instanceof String)) { throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. )"); } dest = replacement.toString(); try { this.pattern = Pattern.compile(patt.toString()); } catch (PatternSyntaxException pe) { throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM + " is not a valid regex pattern: " + patt, pe); } srcInclusions = new SelectorParams(); srcInclusions.fieldRegex = Collections.singletonList(this.pattern); } /** * init helper method that should only be called when we know for certain that both the * "source" and "dest" init params do exist. */ @SuppressWarnings("unchecked") private void initSourceSelectorSyntax(@SuppressWarnings({"rawtypes"})NamedList args) { // Full and complete syntax where source and dest are mandatory. // // source may be a single string or a selector. // dest may be a single string or list containing pattern and replacement // // source != null && dest != null // if we got here we know we had source and dest, now check for the other two so that we can give a better // message than "unexpected" if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) { throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " + SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM); } Object d = args.remove(DEST_PARAM); assert null != d; List sources = args.getAll(SOURCE_PARAM); assert null != sources; if (1 == sources.size()) { if (sources.get(0) instanceof NamedList) { // nested set of selector options @SuppressWarnings({"rawtypes"}) NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM); srcInclusions = parseSelectorParams(selectorConfig); List excList = selectorConfig.getAll("exclude"); for (Object excObj : excList) { if (null == excObj) { throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child 'exclude' can not be null"); } if (!(excObj instanceof NamedList)) { throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child 'exclude' must be "); } @SuppressWarnings({"rawtypes"}) NamedList exc = (NamedList) excObj; srcExclusions.add(parseSelectorParams(exc)); if (0 < exc.size()) { throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' has unexpected 'exclude' sub-param(s): '" + selectorConfig.getName(0) + "'"); } // call once per instance selectorConfig.remove("exclude"); } if (0 < selectorConfig.size()) { throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' contains unexpected child param(s): '" + selectorConfig.getName(0) + "'"); } // consume from the named list so it doesn't interfere with subsequent processing sources.remove(0); } } if (1 <= sources.size()) { // source better be one or more strings srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source")); } if (srcInclusions == null) { throw new SolrException(SERVER_ERROR, "Init params do not specify anything to clone, please supply either " + SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs" + "for CloneFieldUpdateProcessorFactory for further details."); } if (d instanceof NamedList) { @SuppressWarnings({"rawtypes"}) NamedList destList = (NamedList) d; Object patt = destList.remove(PATTERN_PARAM); Object replacement = destList.remove(REPLACEMENT_PARAM); if (null == patt || null == replacement) { throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" + PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "' are both mandatoryand can not be null"); } if (! (patt instanceof String && replacement instanceof String)) { throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" + PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "' must both be strings (i.e. )"); } if (0 != destList.size()) { throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '" + destList.getName(0) + "'"); } try { this.pattern = Pattern.compile(patt.toString()); } catch (PatternSyntaxException pe) { throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM + " is not a valid regex pattern: " + patt, pe); } dest = replacement.toString(); } else if (d instanceof String) { dest = d.toString(); } else { throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " + "(i.e. ) or a list (i.e. ) containing '" + PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM); } } @Override public void inform(final SolrCore core) { srcSelector = FieldMutatingUpdateProcessor.createFieldNameSelector (core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS); for (SelectorParams exc : srcExclusions) { srcSelector = FieldMutatingUpdateProcessor.wrap (srcSelector, FieldMutatingUpdateProcessor.createFieldNameSelector (core.getResourceLoader(), core, exc, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS)); } } @Override public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { final FieldNameSelector srcSelector = getSourceSelector(); return new UpdateRequestProcessor(next) { @Override public void processAdd(AddUpdateCommand cmd) throws IOException { final SolrInputDocument doc = cmd.getSolrInputDocument(); // destination may be regex replace string, which can cause multiple output fields. Map destMap = new HashMap<>(); // preserve initial values and boost (if any) for (final String fname : doc.getFieldNames()) { if (! srcSelector.shouldMutate(fname)) continue; Collection srcFieldValues = doc.getFieldValues(fname); if(srcFieldValues == null || srcFieldValues.isEmpty()) continue; String resolvedDest = dest; if (pattern != null) { Matcher matcher = pattern.matcher(fname); if (matcher.find()) { resolvedDest = matcher.replaceAll(dest); } else { if (log.isDebugEnabled()) { log.debug("CloneFieldUpdateProcessor.srcSelector.shouldMutate('{}') returned true, but replacement pattern did not match, field skipped." , fname); } continue; } } SolrInputField destField; if (doc.containsKey(resolvedDest)) { destField = doc.getField(resolvedDest); } else { SolrInputField targetField = destMap.get(resolvedDest); if (targetField == null) { destField = new SolrInputField(resolvedDest); } else { destField = targetField; } } for (Object val : srcFieldValues) { destField.addValue(val); } // put it in map to avoid concurrent modification... destMap.put(resolvedDest, destField); } for (Map.Entry entry : destMap.entrySet()) { doc.put(entry.getKey(), entry.getValue()); } super.processAdd(cmd); } }; } /** macro */ private static SelectorParams parseSelectorParams(@SuppressWarnings({"rawtypes"})NamedList args) { return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args); } }