org.apache.solr.update.processor.DocExpirationUpdateProcessorFactory Maven / Gradle / Ivy
Show all versions of solr-core Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import static org.apache.solr.common.SolrException.ErrorCode.BAD_REQUEST;
import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.SolrNamedThreadFactory;
import org.apache.solr.core.CloseHook;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.security.PKIAuthenticationPlugin;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.DeleteUpdateCommand;
import org.apache.solr.util.DateMathParser;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Update Processor Factory for managing automatic "expiration" of documents.
*
* The DocExpirationUpdateProcessorFactory
provides two features related to the
* "expiration" of documents which can be used individually, or in combination:
*
*
* - Computing expiration field values for documents from a "time to live" (TTL)
*
- Periodically delete documents from the index based on an expiration field
*
*
* Documents with expiration field values computed from a TTL can be be excluded from searchers
* using simple date based filters relative to NOW
, or completely removed from the
* index using the periodic delete function of this factory. Alternatively, the periodic delete
* function of this factory can be used to remove any document with an expiration value - even if
* that expiration was explicitly set with-out leveraging the TTL feature of this factory.
*
*
The following configuration options are supported:
*
*
* expirationFieldName
- The name of the expiration field to use in any
* operations (mandatory).
* ttlFieldName
- Name of a field this process should look for in each document
* processed, defaulting to _ttl_
. If the specified field name exists in a
* document, the document field value will be parsed as a {@linkplain DateMathParser Date Math
* Expression} relative to NOW
and the result will be added to the document using
* the expirationFieldName
. Use <null name="ttlFieldName"/>
to
* disable this feature.
* ttlParamName
- Name of an update request param this process should look for in
* each request when processing document additions, defaulting to _ttl_
. If the
* specified param name exists in an update request, the param value will be parsed as a
* {@linkplain DateMathParser Date Math Expression} relative to NOW
and the
* result will be used as a default for any document included in that request that does not
* already have a value in the field specified by ttlFieldName
. Use
* <null name="ttlParamName"/>
to disable this feature.
* autoDeletePeriodSeconds
- Optional numeric value indicating how often this
* factory should trigger a delete to remove documents. If this option is used, and specifies
* a non-negative numeric value, a background thread will be created that will execute
* recurring deleteByQuery
commands using the specified period. The delete query
* will remove all documents with an expirationFieldName
up to NOW
.
* autoDeleteChainName
- Optional name of an updateRequestProcessorChain
*
to use when executing automatic deletes. If not specified, or <null/>
*
, the default updateRequestProcessorChain
for this collection is used.
* This option is ignored unless autoDeletePeriodSeconds
is configured and is
* non-negative.
*
*
* For example: The configuration below will cause any document with a field named _ttl_
*
to have a Date field named _expire_at_
computed for it when added -- but no
* automatic deletion will happen.
*
*
* <processor class="solr.processor.DocExpirationUpdateProcessorFactory">
* <str name="expirationFieldName">_expire_at_</str>
* </processor>
*
* Alternatively, in this configuration deletes will occur automatically against the
* _expire_at_
field every 5 minutes - but this processor will not automatically populate the
* _expire_at_
using any sort of TTL expression. Only documents that were added with an
* explicit _expire_at_
field value will ever be deleted.
*
*
* <processor class="solr.processor.DocExpirationUpdateProcessorFactory">
* <null name="ttlFieldName"/>
* <null name="ttlParamName"/>
* <int name="autoDeletePeriodSeconds">300</int>
* <str name="expirationFieldName">_expire_at_</str>
* </processor>
*
* This last example shows the combination of both features using a custom ttlFieldName
*
: Documents with a my_ttl
field will have an _expire_at_
field
* computed, and deletes will be triggered every 5 minutes to remove documents whose
* _expire_at_
field value is in the past.
*
*
* <processor class="solr.processor.DocExpirationUpdateProcessorFactory">
* <int name="autoDeletePeriodSeconds">300</int>
* <str name="ttlFieldName">my_ttl</str>
* <null name="ttlParamName"/>
* <str name="expirationFieldName">_expire_at_</str>
* </processor>
*
* @since 4.8.0
*/
public final class DocExpirationUpdateProcessorFactory extends UpdateRequestProcessorFactory
implements SolrCoreAware {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String DEF_TTL_KEY = "_ttl_";
private static final String EXP_FIELD_NAME_CONF = "expirationFieldName";
private static final String TTL_FIELD_NAME_CONF = "ttlFieldName";
private static final String TTL_PARAM_NAME_CONF = "ttlParamName";
private static final String DEL_CHAIN_NAME_CONF = "autoDeleteChainName";
private static final String DEL_PERIOD_SEC_CONF = "autoDeletePeriodSeconds";
private SolrCore core;
private ScheduledThreadPoolExecutor executor;
private String expireField = null;
private String ttlField = null;
private String ttlParam = null;
private String deleteChainName = null;
private long deletePeriodSeconds = -1L;
private SolrException confErr(final String msg) {
return confErr(msg, null);
}
private SolrException confErr(final String msg, SolrException root) {
return new SolrException(SERVER_ERROR, this.getClass().getSimpleName() + ": " + msg, root);
}
private String removeArgStr(
final NamedList args, final String arg, final String def, final String errMsg) {
if (args.indexOf(arg, 0) < 0) return def;
Object tmp = args.remove(arg);
if (null == tmp) return null;
if (tmp instanceof String) return tmp.toString();
throw confErr(arg + " " + errMsg);
}
@Override
public void init(NamedList args) {
deleteChainName =
removeArgStr(
args, DEL_CHAIN_NAME_CONF, null, "must be a or for default chain");
ttlField =
removeArgStr(
args, TTL_FIELD_NAME_CONF, DEF_TTL_KEY, "must be a or to disable");
ttlParam =
removeArgStr(
args, TTL_PARAM_NAME_CONF, DEF_TTL_KEY, "must be a or to disable");
expireField = removeArgStr(args, EXP_FIELD_NAME_CONF, null, "must be a ");
if (null == expireField) {
throw confErr(EXP_FIELD_NAME_CONF + " must be configured");
}
Object tmp = args.remove(DEL_PERIOD_SEC_CONF);
if (null != tmp) {
if (!(tmp instanceof Number)) {
throw confErr(DEL_PERIOD_SEC_CONF + " must be an or ");
}
deletePeriodSeconds = ((Number) tmp).longValue();
}
super.init(args);
}
@Override
public void inform(SolrCore core) {
this.core = core;
if (null == core.getLatestSchema().getFieldTypeNoEx(expireField)) {
// TODO: check for managed schema and auto-add as a date field?
throw confErr(EXP_FIELD_NAME_CONF + " does not exist in schema: " + expireField);
}
if (0 < deletePeriodSeconds) {
// validate that we have a chain we can work with
try {
Object ignored = core.getUpdateProcessingChain(deleteChainName);
} catch (SolrException e) {
throw confErr(DEL_CHAIN_NAME_CONF + " does not exist: " + deleteChainName, e);
}
// schedule recurring deletion
initDeleteExpiredDocsScheduler(core);
}
}
private void initDeleteExpiredDocsScheduler(SolrCore core) {
executor =
new ScheduledThreadPoolExecutor(
1,
new SolrNamedThreadFactory("autoExpireDocs"),
new RejectedExecutionHandler() {
@Override
public void rejectedExecution(Runnable r, ThreadPoolExecutor e) {
log.warn("Skipping execution of '{}' using '{}'", r, e);
}
});
core.addCloseHook(
new CloseHook() {
@Override
public void postClose(SolrCore core) {
// update handler is gone, terminate anything that's left.
if (executor.isTerminating()) {
log.info("Waiting for close of DocExpiration Executor");
ExecutorUtil.shutdownAndAwaitTermination(executor);
}
}
@Override
public void preClose(SolrCore core) {
log.info("Triggering Graceful close of DocExpiration Executor");
executor.shutdown();
}
});
executor.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);
executor.setContinueExistingPeriodicTasksAfterShutdownPolicy(false);
// we don't want this firing right away, since the core may not be ready
final long initialDelay = deletePeriodSeconds;
// TODO: should we make initialDelay configurable
// TODO: should we make initialDelay some fraction of the period?
executor.scheduleAtFixedRate(
new DeleteExpiredDocsRunnable(this),
deletePeriodSeconds,
deletePeriodSeconds,
TimeUnit.SECONDS);
}
@Override
public UpdateRequestProcessor getInstance(
SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
String defaultTtl = (null == ttlParam) ? null : req.getParams().get(ttlParam);
if (null == ttlField && null == defaultTtl) {
// nothing to do, shortcircut ourselves out of the chain.
return next;
} else {
return new TTLUpdateProcessor(defaultTtl, expireField, ttlField, next);
}
}
private static final class TTLUpdateProcessor extends UpdateRequestProcessor {
final String defaultTtl;
final String expireField;
final String ttlField;
public TTLUpdateProcessor(
final String defaultTtl,
final String expireField,
final String ttlField,
final UpdateRequestProcessor next) {
super(next);
this.defaultTtl = defaultTtl;
this.expireField = expireField;
this.ttlField = ttlField;
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
final SolrInputDocument doc = cmd.getSolrInputDocument();
final String math =
doc.containsKey(ttlField) ? doc.getFieldValue(ttlField).toString() : defaultTtl;
if (null != math) {
try {
final DateMathParser dmp = new DateMathParser();
// TODO: should we try to accept things like "1DAY" as well as "+1DAY" ?
// How?
// 'startsWith("+")' is a bad idea because it would cause problems with
// things like "/DAY+1YEAR"
// Maybe catch ParseException and retry with "+" prepended?
doc.addField(expireField, dmp.parseMath(math));
} catch (ParseException pe) {
throw new SolrException(BAD_REQUEST, "Can't parse ttl as date math: " + math, pe);
}
}
super.processAdd(cmd);
}
}
/**
* Runnable that uses the deleteChainName
configured for this factory to execute a
* delete by query (using the configured expireField
) followed by a soft commit to
* re-open searchers (if needed)
*
* This logic is all wrapped up in a new SolrRequestInfo context with some logging to help make
* it obvious this background activity is happening.
*
*
In cloud mode, this runner only triggers deletes if {@link #iAmInChargeOfPeriodicDeletes} is
* true. (logging is minimal in this situation)
*
* @see #iAmInChargeOfPeriodicDeletes
*/
private static final class DeleteExpiredDocsRunnable implements Runnable {
final DocExpirationUpdateProcessorFactory factory;
final SolrCore core;
final String deleteChainName;
final String expireField;
public DeleteExpiredDocsRunnable(final DocExpirationUpdateProcessorFactory factory) {
this.factory = factory;
this.core = factory.core;
this.deleteChainName = factory.deleteChainName;
this.expireField = factory.expireField;
}
@Override
public void run() {
// setup the request context early so the logging (including any from
// shouldWeDoPeriodicDelete() ) includes the core context info
final LocalSolrQueryRequest req =
new LocalSolrQueryRequest(factory.core, Collections.emptyMap());
try {
// HACK: to indicate to PKI that this is a server initiated request for the purposes
// of distributed requet/credential forwarding...
req.setUserPrincipalName(PKIAuthenticationPlugin.NODE_IS_USER);
final SolrQueryResponse rsp = new SolrQueryResponse();
rsp.addResponseHeader(new SimpleOrderedMap<>(1));
SolrRequestInfo.setRequestInfo(new SolrRequestInfo(req, rsp));
try {
if (!factory.iAmInChargeOfPeriodicDeletes()) {
// No-Op
return;
}
if (log.isInfoEnabled()) {
log.info("Beginning periodic deletion of expired docs on core: {}", core.getName());
}
UpdateRequestProcessorChain chain = core.getUpdateProcessingChain(deleteChainName);
UpdateRequestProcessor proc = chain.createProcessor(req, rsp);
if (null == proc) {
log.warn(
"No active processors, skipping automatic deletion of expired docs using chain: {}",
deleteChainName);
return;
}
try {
DeleteUpdateCommand del = new DeleteUpdateCommand(req);
del.setQuery(
"{!cache=false}"
+ expireField
+ ":[* TO "
+ SolrRequestInfo.getRequestInfo().getNOW().toInstant()
+ "]");
proc.processDelete(del);
// TODO: should this be more configurable?
// TODO: in particular: should hard commit be optional?
CommitUpdateCommand commit = new CommitUpdateCommand(req, false);
commit.softCommit = true;
commit.openSearcher = true;
proc.processCommit(commit);
} finally {
try {
proc.finish();
} finally {
proc.close();
}
}
if (log.isInfoEnabled()) {
log.info("Finished periodic deletion of expired docs on core: {}", core.getName());
}
} catch (IOException ioe) {
log.error("IOException in periodic deletion of expired docs: ", ioe);
// DO NOT RETHROW: ScheduledExecutor will suppress subsequent executions
} catch (RuntimeException re) {
log.error("Runtime error in periodic deletion of expired docs: ", re);
// DO NOT RETHROW: ScheduledExecutor will suppress subsequent executions
} finally {
SolrRequestInfo.clearRequestInfo();
}
} finally {
req.close();
}
}
}
/**
* Helper method that returns true if the Runnable managed by this factory should be responsible
* of doing periodical deletes.
*
* In simple standalone installations this method always returns true, but in cloud mode it
* will be true if and only if we are currently the leader of the (active) slice with the first
* name (lexicographically).
*
*
If this method returns false, it may have also logged a message letting the user know why we
* aren't attempting period deletion (but it will attempt to not log this excessively)
*/
private boolean iAmInChargeOfPeriodicDeletes() {
ZkController zk = core.getCoreContainer().getZkController();
if (null == zk) return true;
// This is a lot simpler then doing our own "leader" election across all replicas
// of all shards since:
// a) we already have a per shard leader
// b) shard names must be unique
// c) ClusterState is already being "watched" by ZkController, no additional zk hits
// d) there might be multiple instances of this factory (in multiple chains) per
// collection, so picking an ephemeral node name for our election would be tricky
CloudDescriptor desc = core.getCoreDescriptor().getCloudDescriptor();
String col = desc.getCollectionName();
DocCollection docCollection = zk.getClusterState().getCollection(col);
if (docCollection.getActiveSlicesArr().length == 0) {
log.error("Collection {} has no active Slices?", col);
return false;
}
List slices = new ArrayList<>(Arrays.asList(docCollection.getActiveSlicesArr()));
slices.sort(COMPARE_SLICES_BY_NAME);
Replica firstSliceLeader = slices.get(0).getLeader();
if (null == firstSliceLeader) {
log.warn("Slice in charge of periodic deletes for {} does not currently have a leader", col);
return false;
}
String leaderInCharge = firstSliceLeader.getName();
String myCoreNodeName = desc.getCoreNodeName();
boolean inChargeOfDeletesRightNow = leaderInCharge.equals(myCoreNodeName);
if (previouslyInChargeOfDeletes && !inChargeOfDeletesRightNow) {
// don't spam the logs constantly, just log when we know that we're not the guy
// (the first time -- or anytime we were, but no longer are)
log.info(
"Not currently in charge of periodic deletes for this collection, {}",
"will not trigger delete or log again until this changes");
}
previouslyInChargeOfDeletes = inChargeOfDeletesRightNow;
return inChargeOfDeletesRightNow;
}
/**
* @see #iAmInChargeOfPeriodicDeletes
*/
private volatile boolean previouslyInChargeOfDeletes = true;
private static final Comparator COMPARE_SLICES_BY_NAME =
(a, b) -> a.getName().compareTo(b.getName());
}