/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.engine;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LiveIndexWriterConfig;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.lucene.LoggerInfoStream;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.index.ElasticsearchDirectoryReader;
import org.elasticsearch.common.lucene.index.ElasticsearchLeafReader;
import org.elasticsearch.common.lucene.uid.Versions;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.common.util.concurrent.KeyedLock;
import org.elasticsearch.common.util.concurrent.ReleasableLock;
import org.elasticsearch.index.deletionpolicy.SnapshotIndexCommit;
import org.elasticsearch.index.indexing.ShardIndexingService;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.merge.MergeStats;
import org.elasticsearch.index.merge.OnGoingMerge;
import org.elasticsearch.index.search.nested.IncludeNestedDocsQuery;
import org.elasticsearch.index.shard.ElasticsearchMergePolicy;
import org.elasticsearch.index.shard.MergeSchedulerConfig;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.shard.TranslogRecoveryPerformer;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.index.translog.TranslogConfig;
import org.elasticsearch.index.translog.TranslogCorruptedException;
import org.elasticsearch.indices.IndicesWarmer;
import org.elasticsearch.threadpool.ThreadPool;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
 * The default {@link Engine} implementation, backed by a Lucene {@link IndexWriter}
 * and a {@link Translog}. It resolves document versions through an in-memory
 * {@link LiveVersionMap} before falling back to the index, and coordinates
 * refreshes, flushes, merges and translog recovery for a single shard.
 */
public class InternalEngine extends Engine {
/**
* When we last pruned expired tombstones from versionMap.deletes:
*/
private volatile long lastDeleteVersionPruneTimeMSec;
private final ShardIndexingService indexingService;
@Nullable
private final IndicesWarmer warmer;
private final Translog translog;
private final ElasticsearchConcurrentMergeScheduler mergeScheduler;
private final IndexWriter indexWriter;
private final SearcherFactory searcherFactory;
private final SearcherManager searcherManager;
private final Lock flushLock = new ReentrantLock();
private final ReentrantLock optimizeLock = new ReentrantLock();
// Maps a uid (in the form of a BytesRef) to its version information.
// We use the hashed variant since we iterate over it and check removals and additions against existing keys.
private final LiveVersionMap versionMap;
private final KeyedLock<BytesRef> keyedLock = new KeyedLock<>();
private final AtomicBoolean versionMapRefreshPending = new AtomicBoolean();
private volatile SegmentInfos lastCommittedSegmentInfos;
private final IndexThrottle throttle;
public InternalEngine(EngineConfig engineConfig, boolean skipInitialTranslogRecovery) throws EngineException {
super(engineConfig);
this.versionMap = new LiveVersionMap();
store.incRef();
IndexWriter writer = null;
Translog translog = null;
SearcherManager manager = null;
EngineMergeScheduler scheduler = null;
boolean success = false;
try {
this.lastDeleteVersionPruneTimeMSec = engineConfig.getThreadPool().estimatedTimeInMillis();
this.indexingService = engineConfig.getIndexingService();
this.warmer = engineConfig.getWarmer();
mergeScheduler = scheduler = new EngineMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings(),
engineConfig.getMergeSchedulerConfig());
throttle = new IndexThrottle();
this.searcherFactory = new SearchFactory(logger, isClosed, engineConfig);
final Translog.TranslogGeneration translogGeneration;
try {
final boolean create = engineConfig.isCreate();
writer = createWriter(create);
indexWriter = writer;
translog = openTranslog(engineConfig, writer, create || skipInitialTranslogRecovery || engineConfig.forceNewTranslog());
translogGeneration = translog.getGeneration();
assert translogGeneration != null;
} catch (IOException | TranslogCorruptedException e) {
throw new EngineCreationFailureException(shardId, "failed to create engine", e);
} catch (AssertionError e) {
// IndexWriter throws AssertionError on init, if asserts are enabled, if any files don't exist, but tests that
// randomly throw FNFE/NSFE can also hit this:
if (ExceptionsHelper.stackTrace(e).contains("org.apache.lucene.index.IndexWriter.filesExist")) {
throw new EngineCreationFailureException(shardId, "failed to create engine", e);
} else {
throw e;
}
}
this.translog = translog;
manager = createSearcherManager();
this.searcherManager = manager;
this.versionMap.setManager(searcherManager);
try {
if (skipInitialTranslogRecovery) {
// make sure we point at the latest translog from now on..
commitIndexWriter(writer, translog, lastCommittedSegmentInfos.getUserData().get(SYNC_COMMIT_ID));
} else {
recoverFromTranslog(engineConfig, translogGeneration);
}
} catch (IOException | EngineException ex) {
throw new EngineCreationFailureException(shardId, "failed to recover from translog", ex);
}
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(writer, translog, manager, scheduler);
versionMap.clear();
if (isClosed.get() == false) {
// on failure we need to dec the store reference
store.decRef();
}
}
}
logger.trace("created new InternalEngine");
}
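    // Construction sketch (hypothetical caller; in practice IndexShard owns this
    // lifecycle): the constructor opens the IndexWriter, opens or creates the
    // translog, builds the SearcherManager, and then either replays the translog
    // or recommits so the commit points at the latest generation:
    //
    //   EngineConfig config = ...;                          // supplied by the shard
    //   Engine engine = new InternalEngine(config, false);  // false => replay the translog
    //   engine.refresh("post_recovery");                    // make replayed ops searchable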
private Translog openTranslog(EngineConfig engineConfig, IndexWriter writer, boolean createNew) throws IOException {
final Translog.TranslogGeneration generation = loadTranslogIdFromCommit(writer);
final TranslogConfig translogConfig = engineConfig.getTranslogConfig();
if (createNew == false) {
// We expect that this shard already exists, so it must already have an existing translog else something is badly wrong!
if (generation == null) {
throw new IllegalStateException("no translog generation present in commit data but translog is expected to exist");
}
translogConfig.setTranslogGeneration(generation);
if (generation != null && generation.translogUUID == null) {
// only upgrade on pre-2.0 indices...
Translog.upgradeLegacyTranslog(logger, translogConfig);
}
}
final Translog translog = new Translog(translogConfig);
if (generation == null || generation.translogUUID == null) {
if (generation == null) {
logger.debug("no translog ID present in the current generation - creating one");
} else if (generation.translogUUID == null) {
logger.debug("upgraded translog to pre 2.0 format, associating translog with index - writing translog UUID");
}
boolean success = false;
try {
commitIndexWriter(writer, translog);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(translog);
}
}
}
return translog;
}
@Override
public Translog getTranslog() {
ensureOpen();
return translog;
}
protected void recoverFromTranslog(EngineConfig engineConfig, Translog.TranslogGeneration translogGeneration) throws IOException {
int opsRecovered = 0;
final TranslogRecoveryPerformer handler = engineConfig.getTranslogRecoveryPerformer();
try (Translog.Snapshot snapshot = translog.newSnapshot()) {
opsRecovered = handler.recoveryFromSnapshot(this, snapshot);
} catch (Throwable e) {
throw new EngineException(shardId, "failed to recover from translog", e);
}
// flush if we recovered something or if we have references to older translogs
// note: if opsRecovered == 0 and we have older translogs it means they are corrupted or 0 length.
if (opsRecovered > 0) {
logger.trace("flushing post recovery from translog. ops recovered [{}]. committed translog id [{}]. current id [{}]",
opsRecovered, translogGeneration == null ? null : translogGeneration.translogFileGeneration, translog
.currentFileGeneration());
flush(true, true);
} else if (translog.isCurrent(translogGeneration) == false) {
commitIndexWriter(indexWriter, translog, lastCommittedSegmentInfos.getUserData().get(Engine.SYNC_COMMIT_ID));
}
}
/**
* Reads the translog generation stored in the IndexWriter commit data. Handles both the legacy
* {@code translog_id} key and the 2.0+ generation/UUID keys; returns {@code null} if the commit
* carries no translog information.
*/
@Nullable
private Translog.TranslogGeneration loadTranslogIdFromCommit(IndexWriter writer) throws IOException {
// a commit on a freshly opened writer succeeds even if no changes were made to it;
// we rely on that to find the translog id key in the commit data
final Map<String, String> commitUserData = writer.getCommitData();
if (commitUserData.containsKey("translog_id")) {
assert commitUserData.containsKey(Translog.TRANSLOG_UUID_KEY) == false : "legacy commit contains translog UUID";
return new Translog.TranslogGeneration(null, Long.parseLong(commitUserData.get("translog_id")));
} else if (commitUserData.containsKey(Translog.TRANSLOG_GENERATION_KEY)) {
if (commitUserData.containsKey(Translog.TRANSLOG_UUID_KEY) == false) {
throw new IllegalStateException("commit doesn't contain translog UUID");
}
final String translogUUID = commitUserData.get(Translog.TRANSLOG_UUID_KEY);
final long translogGen = Long.parseLong(commitUserData.get(Translog.TRANSLOG_GENERATION_KEY));
return new Translog.TranslogGeneration(translogUUID, translogGen);
}
return null;
}
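    // For illustration, the two commit-data shapes distinguished above (values are
    // made up, and key strings assume the usual constants defined in Translog):
    //   legacy (pre-2.0): { "translog_id" -> "42" }
    //       -> new TranslogGeneration(null, 42)
    //   2.0+:             { "translog_generation" -> "7", "translog_uuid" -> "..." }
    //       -> new TranslogGeneration("<uuid>", 7)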
private SearcherManager createSearcherManager() throws EngineException {
boolean success = false;
SearcherManager searcherManager = null;
try {
try {
final DirectoryReader directoryReader = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(indexWriter, true), shardId);
searcherManager = new SearcherManager(directoryReader, searcherFactory);
lastCommittedSegmentInfos = readLastCommittedSegmentInfos(searcherManager, store);
success = true;
return searcherManager;
} catch (IOException e) {
maybeFailEngine("start", e);
try {
indexWriter.rollback();
} catch (IOException e1) { // iw is closed below
e.addSuppressed(e1);
}
throw new EngineCreationFailureException(shardId, "failed to open reader on writer", e);
}
} finally {
if (success == false) { // release everything we created on a failure
IOUtils.closeWhileHandlingException(searcherManager, indexWriter);
}
}
}
private void updateIndexWriterSettings() {
try {
final LiveIndexWriterConfig iwc = indexWriter.getConfig();
iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac());
iwc.setUseCompoundFile(engineConfig.isCompoundOnFlush());
} catch (AlreadyClosedException ex) {
// ignore
}
}
@Override
public GetResult get(Get get) throws EngineException {
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
if (get.realtime()) {
VersionValue versionValue = versionMap.getUnderLock(get.uid().bytes());
if (versionValue != null) {
if (versionValue.delete()) {
return GetResult.NOT_EXISTS;
}
if (get.versionType().isVersionConflictForReads(versionValue.version(), get.version())) {
Uid uid = Uid.createUid(get.uid().text());
throw new VersionConflictEngineException(shardId, uid.type(), uid.id(), versionValue.version(), get.version());
}
Translog.Operation op = translog.read(versionValue.translogLocation());
if (op != null) {
return new GetResult(true, versionValue.version(), op.getSource());
}
}
}
// no version, get the version from the index, we know that we refresh on flush
return getFromSearcher(get);
}
}
@Override
public void create(Create create) throws EngineException {
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
if (create.origin() == Operation.Origin.RECOVERY) {
// Don't throttle recovery operations
innerCreate(create);
} else {
try (Releasable r = throttle.acquireThrottle()) {
innerCreate(create);
}
}
} catch (OutOfMemoryError | IllegalStateException | IOException t) {
maybeFailEngine("create", t);
throw new CreateFailedEngineException(shardId, create.type(), create.id(), t);
}
checkVersionMapRefresh();
}
private void innerCreate(Create create) throws IOException {
if (engineConfig.isOptimizeAutoGenerateId() && create.autoGeneratedId() && !create.canHaveDuplicates()) {
// We don't need to lock because this ID cannot be concurrently updated:
innerCreateNoLock(create, Versions.NOT_FOUND, null);
} else {
try (Releasable ignored = acquireLock(create.uid())) {
final long currentVersion;
final VersionValue versionValue;
versionValue = versionMap.getUnderLock(create.uid().bytes());
if (versionValue == null) {
currentVersion = loadCurrentVersionFromIndex(create.uid());
} else {
if (engineConfig.isEnableGcDeletes() && versionValue.delete() && (engineConfig.getThreadPool().estimatedTimeInMillis
() - versionValue.time()) > engineConfig.getGcDeletesInMillis()) {
currentVersion = Versions.NOT_FOUND; // deleted, and GC
} else {
currentVersion = versionValue.version();
}
}
innerCreateNoLock(create, currentVersion, versionValue);
}
}
}
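    // Version resolution order used above (the same pattern appears in innerIndex
    // and innerDelete):
    //   1. live versionMap entry                  -> currentVersion = versionValue.version()
    //   2. tombstone older than gcDeletesInMillis -> currentVersion = Versions.NOT_FOUND
    //   3. no versionMap entry                    -> currentVersion = loadCurrentVersionFromIndex(uid)
    // For example, with gcDeletesInMillis = 60_000 a uid deleted 90 seconds ago
    // resolves to NOT_FOUND, so a create for it succeeds as if the doc never existed.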
private void innerCreateNoLock(Create create, long currentVersion, VersionValue versionValue) throws IOException {
// same logic as index
long updatedVersion;
long expectedVersion = create.version();
if (create.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
if (create.origin() == Operation.Origin.RECOVERY) {
return;
} else {
throw new VersionConflictEngineException(shardId, create.type(), create.id(), currentVersion, expectedVersion);
}
}
updatedVersion = create.versionType().updateVersion(currentVersion, expectedVersion);
// if the doc exists
boolean doUpdate = false;
if ((versionValue != null && versionValue.delete() == false) || (versionValue == null && currentVersion != Versions.NOT_FOUND)) {
if (create.origin() == Operation.Origin.RECOVERY) {
return;
} else if (create.origin() == Operation.Origin.REPLICA) {
// #7142: the primary already determined it's OK to index this document, and we confirmed above that the version doesn't
// conflict, so we must also update here on the replica to remain consistent:
doUpdate = true;
} else if (create.origin() == Operation.Origin.PRIMARY && create.autoGeneratedId() && create.canHaveDuplicates() &&
currentVersion == 1 && create.version() == Versions.MATCH_ANY) {
/**
* If bulk index request fails due to a disconnect, unavailable shard etc. then the request is
* retried before it actually fails. However, the documents might already be indexed.
* For autogenerated ids this means that a version conflict will be reported in the bulk request
* although the document was indexed properly.
* To avoid this we have to make sure that the index request is treated as an update and set updatedVersion to 1.
* See also discussion on https://github.com/elasticsearch/elasticsearch/pull/9125
*/
doUpdate = true;
updatedVersion = 1;
} else {
// On primary, we throw DAEE if the _uid is already in the index with an older version:
assert create.origin() == Operation.Origin.PRIMARY;
throw new DocumentAlreadyExistsException(shardId, create.type(), create.id());
}
}
create.updateVersion(updatedVersion);
if (doUpdate) {
if (create.docs().size() > 1) {
indexWriter.updateDocuments(create.uid(), create.docs());
} else {
indexWriter.updateDocument(create.uid(), create.docs().get(0));
}
} else {
if (create.docs().size() > 1) {
indexWriter.addDocuments(create.docs());
} else {
indexWriter.addDocument(create.docs().get(0));
}
}
Translog.Location translogLocation = translog.add(new Translog.Create(create));
versionMap.putUnderLock(create.uid().bytes(), new VersionValue(updatedVersion, translogLocation));
create.setTranslogLocation(translogLocation);
indexingService.postCreateUnderLock(create);
}
@Override
public boolean index(Index index) throws EngineException {
final boolean created;
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
if (index.origin() == Operation.Origin.RECOVERY) {
// Don't throttle recovery operations
created = innerIndex(index);
} else {
try (Releasable r = throttle.acquireThrottle()) {
created = innerIndex(index);
}
}
} catch (OutOfMemoryError | IllegalStateException | IOException t) {
maybeFailEngine("index", t);
throw new IndexFailedEngineException(shardId, index.type(), index.id(), t);
}
checkVersionMapRefresh();
return created;
}
/**
* Forces a refresh if the versionMap is using too much RAM
*/
private void checkVersionMapRefresh() {
if (versionMap.ramBytesUsedForRefresh() > config().getVersionMapSize().bytes() && versionMapRefreshPending.getAndSet(true) ==
false) {
try {
if (isClosed.get()) {
// no point...
return;
}
// Now refresh to clear versionMap:
engineConfig.getThreadPool().executor(ThreadPool.Names.REFRESH).execute(new Runnable() {
@Override
public void run() {
try {
refresh("version_table_full");
} catch (EngineClosedException ex) {
// ignore
}
}
});
} catch (EsRejectedExecutionException ex) {
// that is fine too.. we might be shutting down
}
}
}
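    // Illustration of the trigger: suppose config().getVersionMapSize() resolves to
    // 10 MB and a refresh would currently free ~12 MB of version map entries. The
    // first thread to observe this wins the getAndSet on versionMapRefreshPending
    // and schedules a single async refresh; concurrent writers see the flag already
    // set and skip scheduling. (The 10 MB figure is made up; the real value comes
    // from the engine settings.)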
private boolean innerIndex(Index index) throws IOException {
try (Releasable ignored = acquireLock(index.uid())) {
final long currentVersion;
VersionValue versionValue = versionMap.getUnderLock(index.uid().bytes());
if (versionValue == null) {
currentVersion = loadCurrentVersionFromIndex(index.uid());
} else {
if (engineConfig.isEnableGcDeletes() && versionValue.delete() && (engineConfig.getThreadPool().estimatedTimeInMillis() -
versionValue.time()) > engineConfig.getGcDeletesInMillis()) {
currentVersion = Versions.NOT_FOUND; // deleted, and GC
} else {
currentVersion = versionValue.version();
}
}
long updatedVersion;
long expectedVersion = index.version();
if (index.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
if (index.origin() == Operation.Origin.RECOVERY) {
return false;
} else {
throw new VersionConflictEngineException(shardId, index.type(), index.id(), currentVersion, expectedVersion);
}
}
updatedVersion = index.versionType().updateVersion(currentVersion, expectedVersion);
final boolean created;
index.updateVersion(updatedVersion);
if (currentVersion == Versions.NOT_FOUND) {
// document does not exist, we can optimize for create
created = true;
if (index.docs().size() > 1) {
indexWriter.addDocuments(index.docs());
} else {
indexWriter.addDocument(index.docs().get(0));
}
} else {
if (versionValue != null) {
created = versionValue.delete(); // we have a delete which is not GC'ed...
} else {
created = false;
}
if (index.docs().size() > 1) {
indexWriter.updateDocuments(index.uid(), index.docs());
} else {
indexWriter.updateDocument(index.uid(), index.docs().get(0));
}
}
Translog.Location translogLocation = translog.add(new Translog.Index(index));
versionMap.putUnderLock(index.uid().bytes(), new VersionValue(updatedVersion, translogLocation));
index.setTranslogLocation(translogLocation);
indexingService.postIndexUnderLock(index);
return created;
}
}
@Override
public void delete(Delete delete) throws EngineException {
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
// NOTE: we don't throttle this when merges fall behind because delete-by-id does not create new segments:
innerDelete(delete);
} catch (OutOfMemoryError | IllegalStateException | IOException t) {
maybeFailEngine("delete", t);
throw new DeleteFailedEngineException(shardId, delete, t);
}
maybePruneDeletedTombstones();
checkVersionMapRefresh();
}
private void maybePruneDeletedTombstones() {
// It's expensive to prune because we walk the deletes map acquiring dirtyLock for each uid so we only do it
// every 1/4 of gcDeletesInMillis:
if (engineConfig.isEnableGcDeletes() && engineConfig.getThreadPool().estimatedTimeInMillis() - lastDeleteVersionPruneTimeMSec >
engineConfig.getGcDeletesInMillis() * 0.25) {
pruneDeletedTombstones();
}
}
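    // Example: with gcDeletesInMillis = 60_000 (a typical index.gc_deletes value),
    // pruning runs at most once every ~15 seconds regardless of delete volume.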
private void innerDelete(Delete delete) throws IOException {
try (Releasable ignored = acquireLock(delete.uid())) {
final long currentVersion;
VersionValue versionValue = versionMap.getUnderLock(delete.uid().bytes());
if (versionValue == null) {
currentVersion = loadCurrentVersionFromIndex(delete.uid());
} else {
if (engineConfig.isEnableGcDeletes() && versionValue.delete() && (engineConfig.getThreadPool().estimatedTimeInMillis() -
versionValue.time()) > engineConfig.getGcDeletesInMillis()) {
currentVersion = Versions.NOT_FOUND; // deleted, and GC
} else {
currentVersion = versionValue.version();
}
}
long updatedVersion;
long expectedVersion = delete.version();
if (delete.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
if (delete.origin() == Operation.Origin.RECOVERY) {
return;
} else {
throw new VersionConflictEngineException(shardId, delete.type(), delete.id(), currentVersion, expectedVersion);
}
}
updatedVersion = delete.versionType().updateVersion(currentVersion, expectedVersion);
final boolean found;
if (currentVersion == Versions.NOT_FOUND) {
// doc does not exist and no prior deletes
found = false;
} else if (versionValue != null && versionValue.delete()) {
// a "delete on delete", in this case, we still increment the version, log it, and return that version
found = false;
} else {
// we deleted a currently existing document
indexWriter.deleteDocuments(delete.uid());
found = true;
}
delete.updateVersion(updatedVersion, found);
Translog.Location translogLocation = translog.add(new Translog.Delete(delete));
versionMap.putUnderLock(delete.uid().bytes(), new DeleteVersionValue(updatedVersion, engineConfig.getThreadPool()
.estimatedTimeInMillis(), translogLocation));
delete.setTranslogLocation(translogLocation);
indexingService.postDeleteUnderLock(delete);
}
}
/** @deprecated This was removed, but we keep this API so translog can replay any DBQs on upgrade. */
@Deprecated
@Override
public void delete(DeleteByQuery delete) throws EngineException {
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
if (delete.origin() == Operation.Origin.RECOVERY) {
// Don't throttle recovery operations
innerDelete(delete);
} else {
try (Releasable r = throttle.acquireThrottle()) {
innerDelete(delete);
}
}
}
}
private void innerDelete(DeleteByQuery delete) throws EngineException {
try {
Query query = delete.query();
if (delete.aliasFilter() != null) {
query = new BooleanQuery.Builder()
.add(query, Occur.MUST)
.add(delete.aliasFilter(), Occur.FILTER)
.build();
}
if (delete.nested()) {
query = new IncludeNestedDocsQuery(query, delete.parentFilter());
}
indexWriter.deleteDocuments(query);
translog.add(new Translog.DeleteByQuery(delete));
} catch (Throwable t) {
maybeFailEngine("delete_by_query", t);
throw new DeleteByQueryFailedEngineException(shardId, delete, t);
}
// TODO: This is heavy, since we refresh, but we must do this because we don't know which documents were in fact deleted (i.e., our
// versionMap isn't updated), so we must force a cutover to a new reader to "see" the deletions:
refresh("delete_by_query");
}
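    // Resulting query shape when both decorations apply, for illustration:
    //   +<user query> #<alias filter>    (MUST contributes to matching and scoring,
    //                                     FILTER restricts without scoring)
    // optionally wrapped in IncludeNestedDocsQuery so nested children are removed
    // together with their parent documents.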
@Override
public void refresh(String source) throws EngineException {
// we obtain a read lock here, since we don't want a flush to happen while we are refreshing
// since it flushes the index as well (though, in terms of concurrency, we are allowed to do it)
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
searcherManager.maybeRefreshBlocking();
} catch (AlreadyClosedException e) {
ensureOpen();
maybeFailEngine("refresh", e);
} catch (EngineClosedException e) {
throw e;
} catch (Throwable t) {
failEngine("refresh failed", t);
throw new RefreshFailedEngineException(shardId, t);
}
// TODO: maybe we should just put a scheduled job in threadPool?
// We check for pruning in each delete request, but we also prune here e.g. in case a delete burst comes in and then no more deletes
// for a long time:
maybePruneDeletedTombstones();
versionMapRefreshPending.set(false);
mergeScheduler.refreshConfig();
}
@Override
public SyncedFlushResult syncFlush(String syncId, CommitId expectedCommitId) throws EngineException {
// best effort attempt before we acquire locks
ensureOpen();
if (indexWriter.hasUncommittedChanges()) {
logger.trace("can't sync commit [{}]. have pending changes", syncId);
return SyncedFlushResult.PENDING_OPERATIONS;
}
if (expectedCommitId.idsEqual(lastCommittedSegmentInfos.getId()) == false) {
logger.trace("can't sync commit [{}]. current commit id is not equal to expected.", syncId);
return SyncedFlushResult.COMMIT_MISMATCH;
}
try (ReleasableLock lock = writeLock.acquire()) {
ensureOpen();
if (indexWriter.hasUncommittedChanges()) {
logger.trace("can't sync commit [{}]. have pending changes", syncId);
return SyncedFlushResult.PENDING_OPERATIONS;
}
if (expectedCommitId.idsEqual(lastCommittedSegmentInfos.getId()) == false) {
logger.trace("can't sync commit [{}]. current commit id is not equal to expected.", syncId);
return SyncedFlushResult.COMMIT_MISMATCH;
}
logger.trace("starting sync commit [{}]", syncId);
commitIndexWriter(indexWriter, translog, syncId);
logger.debug("successfully sync committed. sync id [{}].", syncId);
lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
return SyncedFlushResult.SUCCESS;
} catch (IOException ex) {
maybeFailEngine("sync commit", ex);
throw new EngineException(shardId, "failed to sync commit", ex);
}
}
@Override
public CommitId flush() throws EngineException {
return flush(false, false);
}
@Override
public CommitId flush(boolean force, boolean waitIfOngoing) throws EngineException {
ensureOpen();
final byte[] newCommitId;
/*
* Unfortunately the lock order is important here. We have to acquire the readlock first otherwise
* if we are flushing at the end of the recovery while holding the write lock we can deadlock if:
* Thread 1: flushes via API and gets the flush lock but blocks on the readlock since Thread 2 has the writeLock
* Thread 2: flushes at the end of the recovery holding the writeLock and blocks on the flushLock owned by Thread 1
*/
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
if (flushLock.tryLock() == false) {
// if we can't get the lock right away we block if needed otherwise barf
if (waitIfOngoing) {
logger.trace("waiting for in-flight flush to finish");
flushLock.lock();
logger.trace("acquired flush lock after blocking");
} else {
throw new FlushNotAllowedEngineException(shardId, "already flushing...");
}
} else {
logger.trace("acquired flush lock immediately");
}
try {
if (indexWriter.hasUncommittedChanges() || force) {
try {
translog.prepareCommit();
logger.trace("starting commit for flush; commitTranslog=true");
commitIndexWriter(indexWriter, translog);
logger.trace("finished commit for flush");
// we need to refresh in order to clear older version values
refresh("version_table_flush");
// after refresh documents can be retrieved from the index so we can now commit the translog
translog.commit();
} catch (Throwable e) {
throw new FlushFailedEngineException(shardId, e);
}
}
/*
* we have to inc-ref the store here since if the engine is closed by a tragic event
* we don't acquire the write lock and wait until we have exclusive access. This might also
* dec the store reference which can essentially close the store and unless we can inc the reference
* we can't use it.
*/
store.incRef();
try {
// reread the last committed segment infos
lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
} catch (Throwable e) {
if (isClosed.get() == false) {
logger.warn("failed to read latest segment infos on flush", e);
if (Lucene.isCorruptionException(e)) {
throw new FlushFailedEngineException(shardId, e);
}
}
} finally {
store.decRef();
}
newCommitId = lastCommittedSegmentInfos.getId();
} catch (FlushFailedEngineException ex) {
maybeFailEngine("flush", ex);
throw ex;
} finally {
flushLock.unlock();
}
}
// We don't have to do this here; we do it defensively to make sure that even if wall clock time is misbehaving
// (e.g., moves backwards) we will at least still sometimes prune deleted tombstones:
if (engineConfig.isEnableGcDeletes()) {
pruneDeletedTombstones();
}
return new CommitId(newCommitId);
}
private void pruneDeletedTombstones() {
long timeMSec = engineConfig.getThreadPool().estimatedTimeInMillis();
// TODO: not good that we reach into LiveVersionMap here; can we move this inside VersionMap instead? problem is the dirtyLock...
// we only need to prune the deletes map; the current/old version maps are cleared on refresh:
for (Map.Entry<BytesRef, VersionValue> entry : versionMap.getAllTombstones()) {
BytesRef uid = entry.getKey();
// TODO: can we do this without taking the lock on each value? Maybe batch uids into a set and acquire the lock once per set?
try (Releasable ignored = acquireLock(uid)) {
// Must re-get it here, vs using entry.getValue(), in case the uid was indexed/deleted since we pulled the iterator:
VersionValue versionValue = versionMap.getTombstoneUnderLock(uid);
if (versionValue != null) {
if (timeMSec - versionValue.time() > engineConfig.getGcDeletesInMillis()) {
versionMap.removeTombstoneUnderLock(uid);
}
}
}
}
lastDeleteVersionPruneTimeMSec = timeMSec;
}
@Override
public void forceMerge(final boolean flush, int maxNumSegments, boolean onlyExpungeDeletes,
final boolean upgrade, final boolean upgradeOnlyAncientSegments) throws EngineException,
EngineClosedException, IOException {
/*
* We do NOT acquire the readlock here since we are waiting on the merges to finish
* that's fine since the IW.rollback should stop all the threads and trigger an IOException
* causing us to fail the forceMerge
*
* The way we implement upgrades is a bit hackish in the sense that we set an instance
* variable and that this setting will thus apply to the next forced merge that will be run.
* This is ok because (1) this is the only place we call forceMerge, (2) we have a single
* thread for optimize, and the 'optimizeLock' guarding this code, and (3) ConcurrentMergeScheduler
* syncs calls to findForcedMerges.
*/
assert indexWriter.getConfig().getMergePolicy() instanceof ElasticsearchMergePolicy : "MergePolicy is " + indexWriter.getConfig()
.getMergePolicy().getClass().getName();
ElasticsearchMergePolicy mp = (ElasticsearchMergePolicy) indexWriter.getConfig().getMergePolicy();
optimizeLock.lock();
try {
ensureOpen();
if (upgrade) {
logger.info("starting segment upgrade upgradeOnlyAncientSegments={}", upgradeOnlyAncientSegments);
mp.setUpgradeInProgress(true, upgradeOnlyAncientSegments);
}
store.incRef(); // increment the ref just to ensure nobody closes the store while we optimize
try {
if (onlyExpungeDeletes) {
assert upgrade == false;
indexWriter.forceMergeDeletes(true /* blocks and waits for merges*/);
} else if (maxNumSegments <= 0) {
assert upgrade == false;
indexWriter.maybeMerge();
} else {
indexWriter.forceMerge(maxNumSegments, true /* blocks and waits for merges*/);
}
if (flush) {
if (tryRenewSyncCommit() == false) {
flush(false, true);
}
}
if (upgrade) {
logger.info("finished segment upgrade");
}
} finally {
store.decRef();
}
} catch (Throwable t) {
maybeFailEngine("force merge", t);
throw t;
} finally {
try {
mp.setUpgradeInProgress(false, false); // reset it just to make sure we reset it in a case of an error
} finally {
optimizeLock.unlock();
}
}
}
@Override
public SnapshotIndexCommit snapshotIndex(final boolean flushFirst) throws EngineException {
// we have to flush outside of the readlock otherwise we might have a problem upgrading
// to the write lock when we fail the engine in this operation
if (flushFirst) {
logger.trace("start flush for snapshot");
flush(false, true);
logger.trace("finish flush for snapshot");
}
try (ReleasableLock lock = readLock.acquire()) {
ensureOpen();
logger.trace("pulling snapshot");
return deletionPolicy.snapshot();
} catch (IOException e) {
throw new SnapshotFailedEngineException(shardId, e);
}
}
@Override
protected boolean maybeFailEngine(String source, Throwable t) {
boolean shouldFail = super.maybeFailEngine(source, t);
if (shouldFail) {
return true;
}
// Check for AlreadyClosedException
if (t instanceof AlreadyClosedException) {
// if we are already closed due to some tragic exception
// we need to fail the engine. it might have already been failed before
// but we are double-checking it's failed and closed
if (indexWriter.isOpen() == false && indexWriter.getTragicException() != null) {
failEngine("already closed by tragic event on the index writer", indexWriter.getTragicException());
} else if (translog.isOpen() == false && translog.getTragicException() != null) {
failEngine("already closed by tragic event on the translog", translog.getTragicException());
}
return true;
} else if (t != null &&
((indexWriter.isOpen() == false && indexWriter.getTragicException() == t)
|| (translog.isOpen() == false && translog.getTragicException() == t))) {
// this is spot on - we are handling the tragic event exception here, so we have to fail the engine
// right away
failEngine(source, t);
return true;
}
return false;
}
@Override
protected SegmentInfos getLastCommittedSegmentInfos() {
return lastCommittedSegmentInfos;
}
@Override
protected final void writerSegmentStats(SegmentsStats stats) {
stats.addVersionMapMemoryInBytes(versionMap.ramBytesUsed());
stats.addIndexWriterMemoryInBytes(indexWriter.ramBytesUsed());
stats.addIndexWriterMaxMemoryInBytes((long) (indexWriter.getConfig().getRAMBufferSizeMB() * 1024 * 1024));
}
@Override
public long indexWriterRAMBytesUsed() {
return indexWriter.ramBytesUsed();
}
@Override
public List<Segment> segments(boolean verbose) {
try (ReleasableLock lock = readLock.acquire()) {
Segment[] segmentsArr = getSegmentInfo(lastCommittedSegmentInfos, verbose);
// fill in the merges flag
Set<OnGoingMerge> onGoingMerges = mergeScheduler.onGoingMerges();
for (OnGoingMerge onGoingMerge : onGoingMerges) {
for (SegmentCommitInfo segmentInfoPerCommit : onGoingMerge.getMergedSegments()) {
for (Segment segment : segmentsArr) {
if (segment.getName().equals(segmentInfoPerCommit.info.name)) {
segment.mergeId = onGoingMerge.getId();
break;
}
}
}
}
return Arrays.asList(segmentsArr);
}
}
/**
* Closes the engine without acquiring the write lock. This should only be
* called while the write lock is held, or in a disaster condition, i.e. if the
* engine has already failed.
*/
@Override
protected final void closeNoLock(String reason) {
if (isClosed.compareAndSet(false, true)) {
assert rwl.isWriteLockedByCurrentThread() || failEngineLock.isHeldByCurrentThread() : "Either the write lock must be held or " +
"the engine must currently be failing itself";
try {
this.versionMap.clear();
try {
IOUtils.close(searcherManager);
} catch (Throwable t) {
logger.warn("Failed to close SearcherManager", t);
}
try {
IOUtils.close(translog);
} catch (Throwable t) {
logger.warn("Failed to close translog", t);
}
// no need to commit in this case! we snapshot before we close the shard, so the translog and everything else is sync'ed
logger.trace("rollback indexWriter");
try {
indexWriter.rollback();
} catch (AlreadyClosedException e) {
// ignore
}
logger.trace("rollback indexWriter done");
} catch (Throwable e) {
logger.warn("failed to rollback writer on close", e);
} finally {
store.decRef();
logger.debug("engine closed [{}]", reason);
}
}
}
@Override
public boolean hasUncommittedChanges() {
return indexWriter.hasUncommittedChanges();
}
@Override
protected SearcherManager getSearcherManager() {
return searcherManager;
}
private Releasable acquireLock(BytesRef uid) {
return keyedLock.acquire(uid);
}
private Releasable acquireLock(Term uid) {
return acquireLock(uid.bytes());
}
private long loadCurrentVersionFromIndex(Term uid) throws IOException {
try (final Searcher searcher = acquireSearcher("load_version", false)) {
return Versions.loadVersion(searcher.reader(), uid);
}
}
private IndexWriter createWriter(boolean create) throws IOException {
try {
final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
iwc.setCommitOnClose(false); // we by default don't commit on close
iwc.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND);
iwc.setIndexDeletionPolicy(deletionPolicy);
// under tests.verbose, Lucene's test framework installs a stdout InfoStream; use it so our output aligns with Lucene's, otherwise route to the ES logger
boolean verbose = false;
try {
verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
} catch (Throwable ignore) {
}
iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
iwc.setMergeScheduler(mergeScheduler);
MergePolicy mergePolicy = config().getMergePolicy();
// Give us the opportunity to upgrade old segments while performing
// background merges
mergePolicy = new ElasticsearchMergePolicy(mergePolicy);
iwc.setMergePolicy(mergePolicy);
iwc.setSimilarity(engineConfig.getSimilarity());
iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac());
iwc.setCodec(engineConfig.getCodec());
/* We set this timeout to a highish value to work around
 * the default poll interval of the Lucene lock, which is
 * 1000ms. We might need to poll multiple times here, but
 * with a 1s poll interval the wait loop only runs a few
 * times before the write lock timeout expires. */
iwc.setWriteLockTimeout(5000);
iwc.setUseCompoundFile(this.engineConfig.isCompoundOnFlush());
// Warm-up hook for newly-merged segments. Warming up segments here is better since it will be performed at the end
// of the merge operation and won't slow down _refresh
iwc.setMergedSegmentWarmer(new IndexReaderWarmer() {
@Override
public void warm(LeafReader reader) throws IOException {
try {
LeafReader esLeafReader = new ElasticsearchLeafReader(reader, shardId);
assert isMergedSegment(esLeafReader);
if (warmer != null) {
final Engine.Searcher searcher = new Searcher("warmer", searcherFactory.newSearcher(esLeafReader, null));
final IndicesWarmer.WarmerContext context = new IndicesWarmer.WarmerContext(shardId, searcher);
warmer.warmNewReaders(context);
}
} catch (Throwable t) {
// Don't fail a merge if the warm-up failed
if (isClosed.get() == false) {
logger.warn("Warm-up failed", t);
}
if (t instanceof Error) {
// assertion/out-of-memory error, don't ignore those
throw (Error) t;
}
}
}
});
return new IndexWriter(store.directory(), iwc);
} catch (LockObtainFailedException ex) {
boolean isLocked = IndexWriter.isLocked(store.directory());
logger.warn("Could not lock IndexWriter isLocked [{}]", ex, isLocked);
throw ex;
}
}
/** Extended SearcherFactory that warms the segments if needed when acquiring a new searcher */
final static class SearchFactory extends EngineSearcherFactory {
private final IndicesWarmer warmer;
private final ShardId shardId;
private final ESLogger logger;
private final AtomicBoolean isEngineClosed;
SearchFactory(ESLogger logger, AtomicBoolean isEngineClosed, EngineConfig engineConfig) {
super(engineConfig);
warmer = engineConfig.getWarmer();
shardId = engineConfig.getShardId();
this.logger = logger;
this.isEngineClosed = isEngineClosed;
}
@Override
public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) throws IOException {
IndexSearcher searcher = super.newSearcher(reader, previousReader);
if (reader instanceof LeafReader && isMergedSegment((LeafReader) reader)) {
// we call newSearcher from the IndexReaderWarmer which warms segments during merging
// in that case the reader is a LeafReader and all we need to do is to build a new Searcher
// and return it since it does its own warming for that particular reader.
return searcher;
}
if (warmer != null) {
// we need to pass a custom searcher that does not release anything on Engine.Searcher release;
// we will release explicitly
IndexSearcher newSearcher = null;
boolean closeNewSearcher = false;
try {
if (previousReader == null) {
// we are starting up - no writer active so we can't acquire a searcher.
newSearcher = searcher;
} else {
// figure out the newSearcher, with only the new readers that are relevant for us
List<IndexReader> readers = new ArrayList<>();
for (LeafReaderContext newReaderContext : reader.leaves()) {
if (isMergedSegment(newReaderContext.reader())) {
// merged segments are already handled by IndexWriterConfig.setMergedSegmentWarmer
continue;
}
boolean found = false;
for (LeafReaderContext currentReaderContext : previousReader.leaves()) {
if (currentReaderContext.reader().getCoreCacheKey().equals(newReaderContext.reader().getCoreCacheKey())) {
found = true;
break;
}
}
if (!found) {
readers.add(newReaderContext.reader());
}
}
if (!readers.isEmpty()) {
// we don't want to close the inner readers, just increase ref on them
IndexReader newReader = new MultiReader(readers.toArray(new IndexReader[readers.size()]), false);
newSearcher = super.newSearcher(newReader, null);
closeNewSearcher = true;
}
}
if (newSearcher != null) {
IndicesWarmer.WarmerContext context = new IndicesWarmer.WarmerContext(shardId, new Searcher("new_reader_warming",
newSearcher));
warmer.warmNewReaders(context);
}
assert searcher.getIndexReader() instanceof ElasticsearchDirectoryReader : "this class needs an " +
"ElasticsearchDirectoryReader but got: " + searcher.getIndexReader().getClass();
warmer.warmTopReader(new IndicesWarmer.WarmerContext(shardId, new Searcher("top_reader_warming", searcher)));
} catch (Throwable e) {
if (isEngineClosed.get() == false) {
logger.warn("failed to prepare/warm", e);
}
} finally {
// no need to release the fullSearcher, nothing really is done...
if (newSearcher != null && closeNewSearcher) {
IOUtils.closeWhileHandlingException(newSearcher.getIndexReader()); // ignore
}
}
}
return searcher;
}
}
public void activateThrottling() {
throttle.activate();
}
public void deactivateThrottling() {
throttle.deactivate();
}
long getGcDeletesInMillis() {
return engineConfig.getGcDeletesInMillis();
}
LiveIndexWriterConfig getCurrentIndexWriterConfig() {
return indexWriter.getConfig();
}
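    // Illustration of the merge throttle below: with getMaxMergeCount() == 4, the
    // fifth concurrent merge pushes numMergesInFlight to 5 (> 4) and activates
    // indexing throttling; once enough merges finish that the count drops below 4
    // again, throttling is deactivated. The isThrottling AtomicBoolean guarantees
    // each activate/deactivate transition fires exactly once per episode.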
private final class EngineMergeScheduler extends ElasticsearchConcurrentMergeScheduler {
private final AtomicInteger numMergesInFlight = new AtomicInteger(0);
private final AtomicBoolean isThrottling = new AtomicBoolean();
EngineMergeScheduler(ShardId shardId, Settings indexSettings, MergeSchedulerConfig config) {
super(shardId, indexSettings, config);
}
@Override
public synchronized void beforeMerge(OnGoingMerge merge) {
int maxNumMerges = mergeScheduler.getMaxMergeCount();
if (numMergesInFlight.incrementAndGet() > maxNumMerges) {
if (isThrottling.getAndSet(true) == false) {
logger.info("now throttling indexing: numMergesInFlight={}, maxNumMerges={}", numMergesInFlight, maxNumMerges);
indexingService.throttlingActivated();
activateThrottling();
}
}
}
@Override
public synchronized void afterMerge(OnGoingMerge merge) {
int maxNumMerges = mergeScheduler.getMaxMergeCount();
if (numMergesInFlight.decrementAndGet() < maxNumMerges) {
if (isThrottling.getAndSet(false)) {
logger.info("stop throttling indexing: numMergesInFlight={}, maxNumMerges={}", numMergesInFlight, maxNumMerges);
indexingService.throttlingDeactivated();
deactivateThrottling();
}
}
}
@Override
protected void handleMergeException(final Directory dir, final Throwable exc) {
logger.error("failed to merge", exc);
if (config().getMergeSchedulerConfig().isNotifyOnMergeFailure()) {
engineConfig.getThreadPool().generic().execute(new AbstractRunnable() {
@Override
public void onFailure(Throwable t) {
logger.debug("merge failure action rejected", t);
}
@Override
protected void doRun() throws Exception {
MergePolicy.MergeException e = new MergePolicy.MergeException(exc, dir);
failEngine("merge failed", e);
}
});
}
}
}
private void commitIndexWriter(IndexWriter writer, Translog translog, String syncId) throws IOException {
try {
Translog.TranslogGeneration translogGeneration = translog.getGeneration();
logger.trace("committing writer with translog id [{}] and sync id [{}] ", translogGeneration.translogFileGeneration, syncId);
Map<String, String> commitData = new HashMap<>(2);
commitData.put(Translog.TRANSLOG_GENERATION_KEY, Long.toString(translogGeneration.translogFileGeneration));
commitData.put(Translog.TRANSLOG_UUID_KEY, translogGeneration.translogUUID);
if (syncId != null) {
commitData.put(Engine.SYNC_COMMIT_ID, syncId);
}
indexWriter.setCommitData(commitData);
writer.commit();
} catch (Throwable ex) {
failEngine("lucene commit failed", ex);
throw ex;
}
}
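    // Example of the commit user data written above (all values illustrative):
    //   { "translog_generation" -> "12", "translog_uuid" -> "H9g...", "sync_id" -> "wFy..." }
    // The generation/UUID pair is what loadTranslogIdFromCommit() reads back on the
    // next startup; the sync id entry is only present after a synced flush.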
private void commitIndexWriter(IndexWriter writer, Translog translog) throws IOException {
commitIndexWriter(writer, translog, null);
}
public void onSettingsChanged() {
mergeScheduler.refreshConfig();
updateIndexWriterSettings();
// config().getVersionMapSize() may have changed:
checkVersionMapRefresh();
// config().isEnableGcDeletes() or config.getGcDeletesInMillis() may have changed:
maybePruneDeletedTombstones();
}
public MergeStats getMergeStats() {
return mergeScheduler.stats();
}
final boolean tryRenewSyncCommit() {
boolean renewed = false;
try (ReleasableLock lock = writeLock.acquire()) {
ensureOpen();
String syncId = lastCommittedSegmentInfos.getUserData().get(SYNC_COMMIT_ID);
if (syncId != null && translog.totalOperations() == 0 && indexWriter.hasUncommittedChanges()) {
logger.trace("start renewing sync commit [{}]", syncId);
commitIndexWriter(indexWriter, translog, syncId);
logger.debug("successfully sync committed. sync id [{}].", syncId);
lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
renewed = true;
}
} catch (IOException ex) {
maybeFailEngine("renew sync commit", ex);
throw new EngineException(shardId, "failed to renew sync commit", ex);
}
if (renewed) { // refresh outside of the write lock
refresh("renew sync commit");
}
return renewed;
}
}