All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.update.processor.RegexpBoostProcessor Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.update.processor;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A processor which will match content of "inputField" against regular expressions found in
 * "boostFilename", and if it matches will return the corresponding boost value from the file and
 * output this to "boostField" as a double value. If more than one pattern matches, the boosts from
 * each are multiplied.
 *
 * 

A typical use case may be to match a URL against patterns to boost or deboost web documents * based on the URL itself: * *

 * # Format of each line: <pattern><TAB><boost>
 * # Example:
 * https?://my.domain.com/temp.*  0.2
 * 
* *

Both inputField, boostField and boostFilename are mandatory parameters. */ public class RegexpBoostProcessor extends UpdateRequestProcessor { protected static final String INPUT_FIELD_PARAM = "inputField"; protected static final String BOOST_FIELD_PARAM = "boostField"; protected static final String BOOST_FILENAME_PARAM = "boostFilename"; private static final String DEFAULT_INPUT_FIELDNAME = "url"; private static final String DEFAULT_BOOST_FIELDNAME = "urlboost"; private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private boolean enabled = true; private String inputFieldname = DEFAULT_INPUT_FIELDNAME; private String boostFieldname = DEFAULT_BOOST_FIELDNAME; private String boostFilename; private List boostEntries = new ArrayList<>(); private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries"; RegexpBoostProcessor( SolrParams parameters, SolrQueryRequest request, SolrQueryResponse response, UpdateRequestProcessor nextProcessor, final Map sharedObjectCache) { super(nextProcessor); this.initParameters(parameters); if (this.boostFilename == null) { log.warn("Null boost filename. Disabling processor."); setEnabled(false); } if (!isEnabled()) { return; } try { synchronized (sharedObjectCache) { @SuppressWarnings({"unchecked"}) List cachedBoostEntries = (List) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY); if (cachedBoostEntries == null) { log.debug("No pre-cached boost entry list found, initializing new"); InputStream is = request.getCore().getResourceLoader().openResource(boostFilename); cachedBoostEntries = initBoostEntries(is); sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries); } else { if (log.isDebugEnabled()) { log.debug("Using cached boost entry list with {} elements", cachedBoostEntries.size()); } } this.boostEntries = cachedBoostEntries; } } catch (IOException ioe) { log.warn( "IOException while initializing boost entries from file {}", this.boostFilename, ioe); } } private void initParameters(SolrParams parameters) { if (parameters != null) { this.setEnabled(parameters.getBool("enabled", true)); this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME); this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME); this.boostFilename = parameters.get(BOOST_FILENAME_PARAM); } } private List initBoostEntries(InputStream is) throws IOException { List newBoostEntries = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { String line; while ((line = reader.readLine()) != null) { // Remove comments line = line.replaceAll("\\s+#.*$", ""); line = line.replaceAll("^#.*$", ""); // Skip empty lines or comment lines if (line.trim().length() == 0) { continue; } String[] fields = line.split("\\s+"); if (fields.length == 2) { String regexp = fields[0]; String boost = fields[1]; newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost))); log.debug("Read regexp {} with boost {}", regexp, boost); } else { log.warn( "Malformed config input line: {} (expected 2 fields, got {} fields). Skipping entry.", line, fields.length); } } } return newBoostEntries; } @Override public void processAdd(AddUpdateCommand command) throws IOException { if (isEnabled()) { processBoost(command); } super.processAdd(command); } public void processBoost(AddUpdateCommand command) { SolrInputDocument document = command.getSolrInputDocument(); if (document.containsKey(inputFieldname)) { String value = (String) document.getFieldValue(inputFieldname); double boost = 1.0f; for (BoostEntry boostEntry : boostEntries) { if (boostEntry.getPattern().matcher(value).matches()) { if (log.isDebugEnabled()) { log.debug("Pattern match {} for {}", boostEntry.getPattern().pattern(), value); } boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000; } } document.setField(boostFieldname, boost); if (log.isDebugEnabled()) { log.debug("Value {}, applied to field {}", boost, boostFieldname); } } } public boolean isEnabled() { return enabled; } public void setEnabled(boolean enabled) { this.enabled = enabled; } private static class BoostEntry { private Pattern pattern; private double boost; public BoostEntry(Pattern pattern, double d) { this.pattern = pattern; this.boost = d; } public Pattern getPattern() { return pattern; } public double getBoost() { return boost; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy