/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.wali;
import static java.util.Objects.requireNonNull;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Pattern;
import org.apache.nifi.wali.SequentialAccessWriteAheadLog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* <p>
* This implementation provides as little Locking as possible in order to
* provide the highest throughput possible. However, this implementation is ONLY
* appropriate if it can be guaranteed that only a single thread will ever issue
* updates for a given Record at any one time.
* </p>
*
*
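* <p>
* A minimal usage sketch (the {@code MyRecord} type and its {@code MyRecordSerde}
* are hypothetical placeholders, shown for illustration only):
* </p>
*
* <pre>{@code
* final WriteAheadRepository<MyRecord> repo = new MinimalLockingWriteAheadLog<>(
*         Paths.get("wali-repo"), 4, new MyRecordSerde(), null);
* final Collection<MyRecord> recovered = repo.recoverRecords(); // must be called before update()
* repo.update(Collections.singletonList(new MyRecord("id-1")), false);
* repo.checkpoint();
* repo.shutdown();
* }</pre>
*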
* @param <T> the type of Record this WAL is for
*
* @deprecated This implementation is now deprecated in favor of {@link SequentialAccessWriteAheadLog}.
* This implementation, when given more than 1 partition, can have issues recovering after a sudden loss
* of power or an operating system crash.
*/
@Deprecated
public final class MinimalLockingWriteAheadLog<T> implements WriteAheadRepository<T> {
private final Path basePath;
private final Path partialPath;
private final Path snapshotPath;
private final SerDeFactory<T> serdeFactory;
private final SyncListener syncListener;
private final FileChannel lockChannel;
private final AtomicLong transactionIdGenerator = new AtomicLong(0L);
private final Partition<T>[] partitions;
private final AtomicLong partitionIndex = new AtomicLong(0L);
private final ConcurrentMap<Object, T> recordMap = new ConcurrentHashMap<>();
private final Map<Object, T> unmodifiableRecordMap = Collections.unmodifiableMap(recordMap);
private final Set<String> externalLocations = new CopyOnWriteArraySet<>();
private final Set<String> recoveredExternalLocations = new CopyOnWriteArraySet<>();
private final AtomicInteger numberBlackListedPartitions = new AtomicInteger(0);
private static final Logger logger = LoggerFactory.getLogger(MinimalLockingWriteAheadLog.class);
private final ReadWriteLock rwLock = new ReentrantReadWriteLock();
private final Lock readLock = rwLock.readLock(); // required to update a partition
private final Lock writeLock = rwLock.writeLock(); // required for checkpoint
private volatile boolean updated = false;
private volatile boolean recovered = false;
public MinimalLockingWriteAheadLog(final Path path, final int partitionCount, final SerDe<T> serde, final SyncListener syncListener) throws IOException {
this(new TreeSet<>(Collections.singleton(path)), partitionCount, new SingletonSerDeFactory<>(serde), syncListener);
}
public MinimalLockingWriteAheadLog(final Path path, final int partitionCount, final SerDeFactory<T> serdeFactory, final SyncListener syncListener) throws IOException {
this(new TreeSet<>(Collections.singleton(path)), partitionCount, serdeFactory, syncListener);
}
public MinimalLockingWriteAheadLog(final SortedSet<Path> paths, final int partitionCount, final SerDe<T> serde, final SyncListener syncListener) throws IOException {
this(paths, partitionCount, new SingletonSerDeFactory<>(serde), syncListener);
}
/**
*
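* Creates a new MinimalLockingWriteAheadLog. Partition directories are assigned to
* the given paths in round-robin fashion; for example (illustrative only), given two
* paths {@code /disk1} and {@code /disk2} and a partition count of 4, the partitions
* are created as {@code /disk1/partition-0}, {@code /disk2/partition-1},
* {@code /disk1/partition-2}, and {@code /disk2/partition-3}.
*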
* @param paths a sorted set of Paths to use for the partitions/journals and
* the snapshot. The snapshot will always be written to the first path
* specified.
* @param partitionCount the number of partitions/journals to use. For best
* performance, this should be close to the number of threads that are
* expected to update the repository simultaneously
* @param serdeFactory the factory for the serializer/deserializer for records
* @param syncListener the listener
* @throws IOException if unable to initialize due to IO issue
*/
@SuppressWarnings("unchecked")
public MinimalLockingWriteAheadLog(final SortedSet<Path> paths, final int partitionCount, final SerDeFactory<T> serdeFactory, final SyncListener syncListener) throws IOException {
this.syncListener = syncListener;
requireNonNull(paths);
requireNonNull(serdeFactory);
if (paths.isEmpty()) {
throw new IllegalArgumentException("Paths must be non-empty");
}
int resolvedPartitionCount = partitionCount;
int existingPartitions = 0;
for (final Path path : paths) {
if (!Files.exists(path)) {
Files.createDirectories(path);
}
final File file = path.toFile();
if (!file.isDirectory()) {
throw new IOException("Path given [" + path + "] is not a directory");
}
if (!file.canWrite()) {
throw new IOException("Path given [" + path + "] is not writable");
}
if (!file.canRead()) {
throw new IOException("Path given [" + path + "] is not readable");
}
if (!file.canExecute()) {
throw new IOException("Path given [" + path + "] is not executable");
}
final File[] children = file.listFiles();
if (children != null) {
for (final File child : children) {
if (child.isDirectory() && child.getName().startsWith("partition-")) {
existingPartitions++;
}
}
if (existingPartitions != 0 && existingPartitions != partitionCount) {
logger.warn("Constructing MinimalLockingWriteAheadLog with partitionCount={}, but the repository currently has "
+ "{} partitions; ignoring argument and proceeding with {} partitions",
new Object[]{partitionCount, existingPartitions, existingPartitions});
resolvedPartitionCount = existingPartitions;
}
}
}
this.basePath = paths.iterator().next();
this.partialPath = basePath.resolve("snapshot.partial");
this.snapshotPath = basePath.resolve("snapshot");
this.serdeFactory = serdeFactory;
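// Take an exclusive OS-level file lock on "wali.lock" so that two processes cannot
// open the same Write-Ahead Log directory at the same time.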
final Path lockPath = basePath.resolve("wali.lock");
lockChannel = new FileOutputStream(lockPath.toFile()).getChannel();
lockChannel.lock();
partitions = new Partition[resolvedPartitionCount];
Iterator<Path> pathIterator = paths.iterator();
for (int i = 0; i < resolvedPartitionCount; i++) {
// If we're out of paths, create a new iterator to start over.
if (!pathIterator.hasNext()) {
pathIterator = paths.iterator();
}
final Path partitionBasePath = pathIterator.next();
partitions[i] = new Partition<>(partitionBasePath.resolve("partition-" + i), serdeFactory, i, getVersion());
}
}
@Override
public int update(final Collection<T> records, final boolean forceSync) throws IOException {
if (!recovered) {
throw new IllegalStateException("Cannot update repository until record recovery has been performed");
}
if (records.isEmpty()) {
return -1;
}
updated = true;
readLock.lock();
try {
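// Pick a partition in round-robin fashion; if the chosen partition has been
// blacklisted by an earlier failure, tryClaim() returns false and we simply
// advance to the next partition in the rotation.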
while (true) {
final int numBlackListed = numberBlackListedPartitions.get();
if (numBlackListed >= partitions.length) {
throw new IOException("All Partitions have been blacklisted due to "
+ "failures when attempting to update. If the Write-Ahead Log is able to perform a checkpoint, "
+ "this issue may resolve itself. Otherwise, manual intervention will be required.");
}
final long partitionIdx = partitionIndex.getAndIncrement();
final int resolvedIdx = (int) (partitionIdx % partitions.length);
final Partition<T> partition = partitions[resolvedIdx];
if (partition.tryClaim()) {
try {
final long transactionId = transactionIdGenerator.getAndIncrement();
if (logger.isTraceEnabled()) {
for (final T record : records) {
logger.trace("Partition {} performing Transaction {}: {}", new Object[] {partition, transactionId, record});
}
}
try {
partition.update(records, transactionId, unmodifiableRecordMap, forceSync);
} catch (final Throwable t) {
partition.blackList();
numberBlackListedPartitions.incrementAndGet();
throw t;
}
if (forceSync && syncListener != null) {
syncListener.onSync(resolvedIdx);
}
} finally {
partition.releaseClaim();
}
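// The journal write succeeded; now apply the same updates to the in-memory record
// map: DELETE and SWAP_OUT remove the record (SWAP_OUT also remembers the external
// location), while SWAP_IN and all other update types (re)store it in the map.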
for (final T record : records) {
final UpdateType updateType = serdeFactory.getUpdateType(record);
final Object recordIdentifier = serdeFactory.getRecordIdentifier(record);
if (updateType == UpdateType.DELETE) {
recordMap.remove(recordIdentifier);
} else if (updateType == UpdateType.SWAP_OUT) {
final String newLocation = serdeFactory.getLocation(record);
if (newLocation == null) {
logger.error("Received Record (ID=" + recordIdentifier + ") with UpdateType of SWAP_OUT but "
+ "no indicator of where the Record is to be Swapped Out to; these records may be "
+ "lost when the repository is restored!");
} else {
recordMap.remove(recordIdentifier);
this.externalLocations.add(newLocation);
}
} else if (updateType == UpdateType.SWAP_IN) {
final String newLocation = serdeFactory.getLocation(record);
if (newLocation == null) {
logger.error("Received Record (ID=" + recordIdentifier + ") with UpdateType of SWAP_IN but no "
+ "indicator of where the Record is to be Swapped In from; these records may be duplicated "
+ "when the repository is restored!");
} else {
externalLocations.remove(newLocation);
}
recordMap.put(recordIdentifier, record);
} else {
recordMap.put(recordIdentifier, record);
}
}
return resolvedIdx;
}
}
} finally {
readLock.unlock();
}
}
@Override
public Collection<T> recoverRecords() throws IOException {
if (updated) {
throw new IllegalStateException("Cannot recover records after updating the repository; must call recoverRecords first");
}
final long recoverStart = System.nanoTime();
writeLock.lock();
try {
Long maxTransactionId = recoverFromSnapshot(recordMap);
recoverFromEdits(recordMap, maxTransactionId);
for (final Partition<T> partition : partitions) {
final long transId = partition.getMaxRecoveredTransactionId();
if (maxTransactionId == null || transId > maxTransactionId) {
maxTransactionId = transId;
}
}
this.transactionIdGenerator.set(maxTransactionId + 1);
this.externalLocations.addAll(recoveredExternalLocations);
logger.info("{} finished recovering records. Performing Checkpoint to ensure proper state of Partitions before updates", this);
} finally {
writeLock.unlock();
}
final long recoverNanos = System.nanoTime() - recoverStart;
final long recoveryMillis = TimeUnit.MILLISECONDS.convert(recoverNanos, TimeUnit.NANOSECONDS);
logger.info("Successfully recovered {} records in {} milliseconds", recordMap.size(), recoveryMillis);
checkpoint();
recovered = true;
return recordMap.values();
}
@Override
public Set<String> getRecoveredSwapLocations() throws IOException {
return recoveredExternalLocations;
}
private Long recoverFromSnapshot(final Map<Object, T> recordMap) throws IOException {
final boolean partialExists = Files.exists(partialPath);
final boolean snapshotExists = Files.exists(snapshotPath);
if (!partialExists && !snapshotExists) {
return null;
}
if (partialExists && snapshotExists) {
// both files exist -- assume we failed while checkpointing. Delete
// the partial file
Files.delete(partialPath);
} else if (partialExists) {
// partial exists but snapshot does not -- we must have completed
// creating the partial, deleted the snapshot
// but crashed before renaming the partial to the snapshot. Just
// rename partial to snapshot
Files.move(partialPath, snapshotPath);
}
if (Files.size(snapshotPath) == 0) {
logger.warn("{} Found 0-byte Snapshot file; skipping Snapshot file in recovery", this);
return null;
}
// at this point, we know the snapshotPath exists because if it didn't, then we either returned null
// or we renamed partialPath to snapshotPath. So just Recover from snapshotPath.
try (final DataInputStream dataIn = new DataInputStream(new BufferedInputStream(Files.newInputStream(snapshotPath, StandardOpenOption.READ)))) {
final String waliImplementationClass = dataIn.readUTF();
final int waliImplementationVersion = dataIn.readInt();
if (!waliImplementationClass.equals(MinimalLockingWriteAheadLog.class.getName())) {
throw new IOException("Write-Ahead Log located at " + snapshotPath + " was written using the "
+ waliImplementationClass + " class; cannot restore using " + getClass().getName());
}
if (waliImplementationVersion > getVersion()) {
throw new IOException("Write-Ahead Log located at " + snapshotPath + " was written using version "
+ waliImplementationVersion + " of the " + waliImplementationClass + " class; cannot restore using Version " + getVersion());
}
final String serdeEncoding = dataIn.readUTF(); // the serde class name, used below to create the appropriate SerDe
final int serdeVersion = dataIn.readInt();
final long maxTransactionId = dataIn.readLong();
final int numRecords = dataIn.readInt();
final SerDe<T> serde = serdeFactory.createSerDe(serdeEncoding);
serde.readHeader(dataIn);
for (int i = 0; i < numRecords; i++) {
final T record = serde.deserializeRecord(dataIn, serdeVersion);
if (record == null) {
throw new EOFException();
}
final UpdateType updateType = serde.getUpdateType(record);
if (updateType == UpdateType.DELETE) {
logger.warn("While recovering from snapshot, found record with type 'DELETE'; this record will not be restored");
continue;
}
logger.trace("Recovered from snapshot: {}", record);
recordMap.put(serde.getRecordIdentifier(record), record);
}
final int numSwapRecords = dataIn.readInt();
final Set<String> swapLocations = new HashSet<>();
for (int i = 0; i < numSwapRecords; i++) {
swapLocations.add(dataIn.readUTF());
}
this.recoveredExternalLocations.addAll(swapLocations);
logger.debug("{} restored {} Records and {} Swap Files from Snapshot, ending with Transaction ID {}",
new Object[]{this, numRecords, recoveredExternalLocations.size(), maxTransactionId});
return maxTransactionId;
}
}
/**
* Recovers records from the edit logs via the Partitions, applying transactions
* in ascending order of Transaction ID across all Partitions. Transactions whose
* IDs were already captured by the snapshot are skipped.
*
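* <p>
* As an illustrative example: if Partition-0 holds Transactions 3 and 5 while
* Partition-1 holds Transactions 4 and 6, the TreeMap-based merge below replays
* them in the order 3, 4, 5, 6, regardless of which Partition each was written to.
* </p>
*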
* @param modifiableRecordMap map
* @param maxTransactionIdRestored index of max restored transaction
* @throws IOException if unable to recover from edits
*/
private void recoverFromEdits(final Map<Object, T> modifiableRecordMap, final Long maxTransactionIdRestored) throws IOException {
final Map<Object, T> updateMap = new HashMap<>();
final Map<Object, T> unmodifiableRecordMap = Collections.unmodifiableMap(modifiableRecordMap);
final Map<Object, T> ignorableMap = new HashMap<>();
final Set<String> ignorableSwapLocations = new HashSet<>();
// populate a map of the next transaction id for each partition to the
// partition that has that next transaction id.
final SortedMap<Long, Partition<T>> transactionMap = new TreeMap<>();
for (final Partition<T> partition : partitions) {
Long transactionId;
boolean keepTransaction;
do {
transactionId = partition.getNextRecoverableTransactionId();
keepTransaction = transactionId == null || maxTransactionIdRestored == null || transactionId > maxTransactionIdRestored;
if (keepTransaction && transactionId != null) {
// map this transaction id to its partition so that we can
// start restoring transactions from this partition,
// starting at 'transactionId'
transactionMap.put(transactionId, partition);
} else if (transactionId != null) {
// skip the next transaction, because our snapshot already
// contained this transaction.
try {
partition.recoverNextTransaction(ignorableMap, updateMap, ignorableSwapLocations);
} catch (final EOFException e) {
logger.error("{} unexpectedly reached End of File while reading from {} for Transaction {}; "
+ "assuming crash and ignoring this transaction.",
new Object[]{this, partition, transactionId});
}
}
} while (!keepTransaction);
}
while (!transactionMap.isEmpty()) {
final Map.Entry<Long, Partition<T>> firstEntry = transactionMap.entrySet().iterator().next();
final Long firstTransactionId = firstEntry.getKey();
final Partition<T> nextPartition = firstEntry.getValue();
try {
updateMap.clear();
final Set<Object> idsRemoved = nextPartition.recoverNextTransaction(unmodifiableRecordMap, updateMap, recoveredExternalLocations);
modifiableRecordMap.putAll(updateMap);
for (final Object id : idsRemoved) {
modifiableRecordMap.remove(id);
}
} catch (final EOFException e) {
logger.error("{} unexpectedly reached End-of-File when reading from {} for Transaction ID {}; "
+ "assuming crash and ignoring this transaction",
new Object[]{this, nextPartition, firstTransactionId});
}
transactionMap.remove(firstTransactionId);
Long subsequentTransactionId = null;
try {
subsequentTransactionId = nextPartition.getNextRecoverableTransactionId();
} catch (final IOException e) {
logger.error("{} unexpectedly found End-of-File when reading from {} for Transaction ID {}; "
+ "assuming crash and ignoring this transaction",
new Object[]{this, nextPartition, firstTransactionId});
}
if (subsequentTransactionId != null) {
transactionMap.put(subsequentTransactionId, nextPartition);
}
}
for (final Partition<T> partition : partitions) {
partition.endRecovery();
}
}
@Override
public synchronized int checkpoint() throws IOException {
final Set<T> records;
final Set<String> swapLocations;
final long maxTransactionId;
final long startNanos = System.nanoTime();
FileOutputStream fileOut = null;
DataOutputStream dataOut = null;
long stopTheWorldNanos = -1L;
long stopTheWorldStart = -1L;
try {
final List<OutputStream> partitionStreams = new ArrayList<>();
writeLock.lock();
try {
stopTheWorldStart = System.nanoTime();
// stop the world while we make a copy of the records that must
// be checkpointed and rollover the partitions.
// We copy the records because serializing them is potentially
// very expensive, especially when we have hundreds
// of thousands or even millions of them. We don't want to
// prevent WALI from being used during this time.
// So the design is to copy all of the records, determine the
// last transaction ID that the records represent,
// and roll over the partitions to new write-ahead logs.
// Then, outside of the write lock, we will serialize the data
// to disk, and then remove the old Partition data.
records = new HashSet<>(recordMap.values());
maxTransactionId = transactionIdGenerator.get() - 1;
swapLocations = new HashSet<>(externalLocations);
for (final Partition<T> partition : partitions) {
try {
partitionStreams.add(partition.rollover());
} catch (final Throwable t) {
partition.blackList();
numberBlackListedPartitions.getAndIncrement();
throw t;
}
}
} finally {
writeLock.unlock();
}
stopTheWorldNanos = System.nanoTime() - stopTheWorldStart;
// Close all of the Partitions' Output Streams. We do this here, instead of in Partition.rollover()
// because we want to do this outside of the write lock. Because calling close() on FileOutputStream can
// be very expensive, as it has to flush the data to disk, we don't want to prevent other Process Sessions
// from getting committed. Since rollover() transitions the partition to write to a new file already, there
// is no reason that we need to close this FileOutputStream before releasing the write lock. Also, if any Exception
// does get thrown when calling close(), we don't need to blacklist the partition, as the stream that was getting
// closed is not the stream being written to for the partition anyway. We also catch any IOException and wait until
// after we've attempted to close all streams before we throw an Exception, to avoid resource leaks if one of them
// is unable to be closed (due to out of storage space, for instance).
IOException failure = null;
for (final OutputStream partitionStream : partitionStreams) {
try {
partitionStream.close();
} catch (final IOException e) {
failure = e;
}
}
if (failure != null) {
throw failure;
}
// notify the listener of the global sync. Note that the write lock has already been
// released at this point, so the repository may continue to be updated while the
// listener performs its tasks.
if (syncListener != null) {
syncListener.onGlobalSync();
}
final SerDe<T> serde = serdeFactory.createSerDe(null);
// perform checkpoint, writing to .partial file
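// Snapshot layout, as written below: implementation class name (UTF), WALI version (int),
// serde class name (UTF), serde version (int), max transaction ID (long), record count (int),
// serde header, each serialized record, swap-file count (int), and each swap location (UTF).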
fileOut = new FileOutputStream(partialPath.toFile());
dataOut = new DataOutputStream(new BufferedOutputStream(fileOut));
dataOut.writeUTF(MinimalLockingWriteAheadLog.class.getName());
dataOut.writeInt(getVersion());
dataOut.writeUTF(serde.getClass().getName());
dataOut.writeInt(serde.getVersion());
dataOut.writeLong(maxTransactionId);
dataOut.writeInt(records.size());
serde.writeHeader(dataOut);
for (final T record : records) {
logger.trace("Checkpointing {}", record);
serde.serializeRecord(record, dataOut);
}
dataOut.writeInt(swapLocations.size());
for (final String swapLocation : swapLocations) {
dataOut.writeUTF(swapLocation);
}
} finally {
if (dataOut != null) {
try {
try {
dataOut.flush();
fileOut.getFD().sync();
} finally {
dataOut.close();
}
} catch (final IOException e) {
logger.warn("Failed to close Data Stream due to {}", e.toString(), e);
}
}
}
// delete the snapshot, if it exists, and rename the .partial to
// snapshot
Files.deleteIfExists(snapshotPath);
Files.move(partialPath, snapshotPath);
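// This two-step replacement (delete the snapshot, then rename the .partial) is what the
// crash-window handling in recoverFromSnapshot() relies on: if we crash between the two
// steps, only the .partial exists and recovery renames it into place; if we crash before
// the delete, both files exist and recovery can safely discard the .partial, because the
// edit logs are not cleared until after the rename completes.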
// clear all of the edit logs
final long partitionStart = System.nanoTime();
for (final Partition<T> partition : partitions) {
// we can call clearOld without claiming the partition because it
// does not change the partition's state
// and the only member variable it touches cannot be modified, other
// than when #rollover() is called.
// And since this method is the only one that calls #rollover() and
// this method is synchronized,
// the value of that member variable will not change. And it's
// volatile, so we will get the correct value.
partition.clearOld();
}
final long partitionEnd = System.nanoTime();
numberBlackListedPartitions.set(0);
final long endNanos = System.nanoTime();
final long millis = TimeUnit.MILLISECONDS.convert(endNanos - startNanos, TimeUnit.NANOSECONDS);
final long partitionMillis = TimeUnit.MILLISECONDS.convert(partitionEnd - partitionStart, TimeUnit.NANOSECONDS);
final long stopTheWorldMillis = TimeUnit.NANOSECONDS.toMillis(stopTheWorldNanos);
logger.info("{} checkpointed with {} Records and {} Swap Files in {} milliseconds (Stop-the-world "
+ "time = {} milliseconds, Clear Edit Logs time = {} millis), max Transaction ID {}",
new Object[]{this, records.size(), swapLocations.size(), millis, stopTheWorldMillis, partitionMillis, maxTransactionId});
return records.size();
}
@Override
public void shutdown() throws IOException {
writeLock.lock();
try {
for (final Partition<T> partition : partitions) {
partition.close();
}
} finally {
writeLock.unlock();
lockChannel.close();
final File lockFile = new File(basePath.toFile(), "wali.lock");
lockFile.delete();
}
}
public int getVersion() {
return 1;
}
/**
* Represents a partition of this repository, which maps directly to a
* .journal file.
*
* All methods in this Partition, with the exceptions of {@link #tryClaim()}
* and {@link #releaseClaim()}, MUST be called while holding the claim
* (acquired via {@link #tryClaim()}).
*
* @param <S> the type of Record held in the partitions
*/
private static class Partition<S> {
public static final String JOURNAL_EXTENSION = ".journal";
private static final int NUL_BYTE = 0;
private static final Pattern JOURNAL_FILENAME_PATTERN = Pattern.compile("\\d+\\.journal");
private final SerDeFactory<S> serdeFactory;
private SerDe<S> serde;
private final Path editDirectory;
private final int writeAheadLogVersion;
private DataOutputStream dataOut = null;
private FileOutputStream fileOut = null;
private volatile boolean blackListed = false;
private volatile boolean closed = false;
private DataInputStream recoveryIn;
private int recoveryVersion;
private String currentJournalFilename = "";
private static final byte TRANSACTION_CONTINUE = 1;
private static final byte TRANSACTION_COMMIT = 2;
private final String description;
private final AtomicLong maxTransactionId = new AtomicLong(-1L);
private final Logger logger = LoggerFactory.getLogger(MinimalLockingWriteAheadLog.class);
private final Queue<Path> recoveryFiles;
public Partition(final Path path, final SerDeFactory<S> serdeFactory, final int partitionIndex, final int writeAheadLogVersion) throws IOException {
this.editDirectory = path;
this.serdeFactory = serdeFactory;
final File file = path.toFile();
if (!file.exists() && !file.mkdirs()) {
throw new IOException("Could not create directory " + file.getAbsolutePath());
}
this.recoveryFiles = new LinkedBlockingQueue<>();
for (final Path recoveryPath : getRecoveryPaths()) {
recoveryFiles.add(recoveryPath);
}
this.description = "Partition-" + partitionIndex;
this.writeAheadLogVersion = writeAheadLogVersion;
}
public boolean tryClaim() {
return !blackListed;
}
public void releaseClaim() {
}
public void close() {
this.closed = true;
// Note that here we are closing fileOut and NOT dataOut.
// This is very much intentional, not an oversight. This is done because of
// the way that the OutputStreams are structured. dataOut wraps a BufferedOutputStream,
// which then wraps the FileOutputStream. If we close 'dataOut', then this will call
// the flush() method of BufferedOutputStream. Under normal conditions, this is fine.
// However, there is a very important corner case to consider:
//
// If we are writing to the DataOutputStream in the update() method and that
// call to write() then results in the BufferedOutputStream calling flushBuffer() -
// or if we finish the call to update() and call flush() ourselves - it is possible
// that the internal buffer of the BufferedOutputStream can get partially written to
// to the FileOutputStream and then an IOException occurs. If this occurs, we have
// written a partial record to disk. This still is okay, as we have logic to handle
// the condition where we have a partial record and then an unexpected End-of-File.
// But if we then call close() on 'dataOut', this will call the flush() method of the
// underlying BufferedOutputStream. As a result, we will end up again writing the internal
// buffer of the BufferedOutputStream to the underlying file. At this point, we are left
// not with an unexpected/premature End-of-File but instead a bunch of seemingly random
// bytes that happened to be residing in that internal buffer, and this will result in
// a corrupt and unrecoverable Write-Ahead Log.
//
// Additionally, we are okay not ever calling close on the wrapping BufferedOutputStream and
// DataOutputStream because they don't actually hold any resources that need to be reclaimed,
// and after each update to the Write-Ahead Log, we call flush() ourselves to ensure that we don't
// leave arbitrary data in the BufferedOutputStream that hasn't been flushed to the underlying
// FileOutputStream.
final OutputStream out = fileOut;
if (out != null) {
try {
out.close();
} catch (final Exception e) {
}
}
this.dataOut = null;
this.fileOut = null;
}
public void blackList() {
blackListed = true;
logger.debug("Blacklisted {}", this);
}
/**
* Closes resources pointing to the current journal and begins writing
* to a new one
*
* @throws IOException if failure to rollover
*/
public OutputStream rollover() throws IOException {
// Note that here we are closing fileOut and NOT dataOut. See the note in the close()
// method to understand the logic behind this.
final OutputStream oldOutputStream = fileOut;
dataOut = null;
fileOut = null;
this.serde = serdeFactory.createSerDe(null);
final Path editPath = getNewEditPath();
final FileOutputStream fos = new FileOutputStream(editPath.toFile());
try {
final DataOutputStream outStream = new DataOutputStream(new BufferedOutputStream(fos));
outStream.writeUTF(MinimalLockingWriteAheadLog.class.getName());
outStream.writeInt(writeAheadLogVersion);
outStream.writeUTF(serde.getClass().getName());
outStream.writeInt(serde.getVersion());
serde.writeHeader(outStream);
outStream.flush();
dataOut = outStream;
fileOut = fos;
} catch (final IOException ioe) {
try {
oldOutputStream.close();
} catch (final IOException ioe2) {
ioe.addSuppressed(ioe2);
}
logger.error("Failed to create new journal for {} due to {}", new Object[] {this, ioe.toString()}, ioe);
try {
fos.close();
} catch (final IOException innerIOE) {
}
dataOut = null;
fileOut = null;
blackList();
throw ioe;
}
currentJournalFilename = editPath.toFile().getName();
blackListed = false;
return oldOutputStream;
}
private long getJournalIndex(final File file) {
final String filename = file.getName();
final int dotIndex = filename.indexOf(".");
final String number = filename.substring(0, dotIndex);
return Long.parseLong(number);
}
private Path getNewEditPath() {
final List<Path> recoveryPaths = getRecoveryPaths();
final long newIndex;
if (recoveryPaths == null || recoveryPaths.isEmpty()) {
newIndex = 1;
} else {
final long lastFileIndex = getJournalIndex(recoveryPaths.get(recoveryPaths.size() - 1).toFile());
newIndex = lastFileIndex + 1;
}
return editDirectory.resolve(newIndex + JOURNAL_EXTENSION);
}
private List<Path> getRecoveryPaths() {
final List<Path> paths = new ArrayList<>();
final File directory = editDirectory.toFile();
final File[] partitionFiles = directory.listFiles();
if (partitionFiles == null) {
return paths;
}
for (final File file : partitionFiles) {
// if file is a journal file but no data has yet been persisted, it may
// very well be a 0-byte file (the journal is not SYNC'ed to disk after
// a header is written out, so it may be lost). In this case, the journal
// is empty, so we can just skip it.
if (file.isDirectory() || file.length() == 0L) {
continue;
}
if (!JOURNAL_FILENAME_PATTERN.matcher(file.getName()).matches()) {
continue;
}
if (isJournalFile(file)) {
paths.add(file.toPath());
} else {
logger.warn("Found file {}, but could not access it, or it was not in the expected format; "
+ "will ignore this file", file.getAbsolutePath());
}
}
// Sort journal files by the numeric portion of the filename
Collections.sort(paths, new Comparator<Path>() {
@Override
public int compare(final Path o1, final Path o2) {
if (o1 == null && o2 == null) {
return 0;
}
if (o1 == null) {
return 1;
}
if (o2 == null) {
return -1;
}
final long index1 = getJournalIndex(o1.toFile());
final long index2 = getJournalIndex(o2.toFile());
return Long.compare(index1, index2);
}
});
return paths;
}
void clearOld() {
final List<Path> oldRecoveryFiles = getRecoveryPaths();
for (final Path path : oldRecoveryFiles) {
final File file = path.toFile();
if (file.getName().equals(currentJournalFilename)) {
continue;
}
if (file.exists()) {
file.delete();
}
}
}
private boolean isJournalFile(final File file) {
final String expectedStartsWith = MinimalLockingWriteAheadLog.class.getName();
try {
try (final FileInputStream fis = new FileInputStream(file);
final InputStream bufferedIn = new BufferedInputStream(fis);
final DataInputStream in = new DataInputStream(bufferedIn)) {
final String waliImplClassName = in.readUTF();
if (!expectedStartsWith.equals(waliImplClassName)) {
return false;
}
}
} catch (final IOException e) {
return false;
}
return true;
}
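/**
* Appends a single transaction to the current journal. On disk, a transaction is
* laid out as the transaction ID (long) followed by the serialized edits; each edit
* is followed by a one-byte flag, TRANSACTION_CONTINUE (1) if more edits follow or
* TRANSACTION_COMMIT (2) after the final edit. The entire transaction is serialized
* into an in-memory buffer first and then written to the journal in a single write.
*/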
public void update(final Collection<S> records, final long transactionId, final Map<Object, S> recordMap, final boolean forceSync) throws IOException {
try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(256);
final DataOutputStream out = new DataOutputStream(baos)) {
out.writeLong(transactionId);
final int numEditsToSerialize = records.size();
int editsSerialized = 0;
for (final S record : records) {
final Object recordId = serde.getRecordIdentifier(record);
final S previousVersion = recordMap.get(recordId);
serde.serializeEdit(previousVersion, record, out);
if (++editsSerialized < numEditsToSerialize) {
out.write(TRANSACTION_CONTINUE);
} else {
out.write(TRANSACTION_COMMIT);
}
}
out.flush();
if (this.closed) {
throw new IllegalStateException("Partition is closed");
}
baos.writeTo(dataOut);
dataOut.flush();
if (forceSync) {
synchronized (fileOut) {
fileOut.getFD().sync();
}
}
}
}
private DataInputStream createDataInputStream(final Path path) throws IOException {
return new DataInputStream(new BufferedInputStream(Files.newInputStream(path)));
}
private DataInputStream getRecoveryStream() throws IOException {
if (recoveryIn != null && hasMoreData(recoveryIn)) {
return recoveryIn;
}
while (true) {
final Path nextRecoveryPath = recoveryFiles.poll();
if (nextRecoveryPath == null) {
return null;
}
logger.debug("{} recovering from {}", this, nextRecoveryPath);
recoveryIn = createDataInputStream(nextRecoveryPath);
if (hasMoreData(recoveryIn)) {
try {
final String waliImplementationClass = recoveryIn.readUTF();
if (!MinimalLockingWriteAheadLog.class.getName().equals(waliImplementationClass)) {
continue;
}
final long waliVersion = recoveryIn.readInt();
if (waliVersion > writeAheadLogVersion) {
throw new IOException("Cannot recovery from file " + nextRecoveryPath + " because it was written using "
+ "WALI version " + waliVersion + ", but the version used to restore it is only " + writeAheadLogVersion);
}
final String serdeEncoding = recoveryIn.readUTF();
this.recoveryVersion = recoveryIn.readInt();
serde = serdeFactory.createSerDe(serdeEncoding);
serde.readHeader(recoveryIn);
break;
} catch (final Exception e) {
logger.warn("Failed to recover data from Write-Ahead Log for {} because the header information could not be read properly. "
+ "This often is the result of the file not being fully written out before the application is restarted. This file will be ignored.", nextRecoveryPath);
}
}
}
return recoveryIn;
}
public Long getNextRecoverableTransactionId() throws IOException {
while (true) {
DataInputStream recoveryStream = getRecoveryStream();
if (recoveryStream == null) {
return null;
}
final long transactionId;
try {
transactionId = recoveryIn.readLong();
} catch (final EOFException e) {
continue;
} catch (final Exception e) {
// If the stream consists solely of NUL bytes, then we want to treat it
// the same as an EOF because we see this happen when we suddenly lose power
// while writing to a file.
if (remainingBytesAllNul(recoveryIn)) {
logger.warn("Failed to recover data from Write-Ahead Log Partition because encountered trailing NUL bytes. "
+ "This will sometimes happen after a sudden power loss. The rest of this journal file will be skipped for recovery purposes.");
continue;
} else {
throw e;
}
}
this.maxTransactionId.set(transactionId);
return transactionId;
}
}
/**
* In the case of a sudden power loss, it is common - at least in a Linux journaling File System -
* that the partition file that is being written to will have many trailing "NUL bytes" (0's).
* If this happens, then on restart we want to treat this as an incomplete transaction, so we detect
* this case explicitly.
*
* @param in the input stream to scan
* @return <code>true</code> if the InputStream contains no data or contains only NUL bytes
* @throws IOException if unable to read from the given InputStream
*/
private boolean remainingBytesAllNul(final InputStream in) throws IOException {
int nextByte;
while ((nextByte = in.read()) != -1) {
if (nextByte != NUL_BYTE) {
return false;
}
}
return true;
}
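// Note: hasMoreData() relies on mark/reset support; the recovery streams are created by
// createDataInputStream(), which wraps a BufferedInputStream, so mark() is supported.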
private boolean hasMoreData(final InputStream in) throws IOException {
in.mark(1);
final int nextByte = in.read();
in.reset();
return nextByte >= 0;
}
public void endRecovery() throws IOException {
if (recoveryIn != null) {
recoveryIn.close();
}
final Path nextRecoveryPath = this.recoveryFiles.poll();
if (nextRecoveryPath != null) {
throw new IllegalStateException("Signaled to end recovery, but there are more recovery files for Partition "
+ "in directory " + editDirectory);
}
final Path newEditPath = getNewEditPath();
this.serde = serdeFactory.createSerDe(null);
final FileOutputStream fos = new FileOutputStream(newEditPath.toFile());
final DataOutputStream outStream = new DataOutputStream(new BufferedOutputStream(fos));
outStream.writeUTF(MinimalLockingWriteAheadLog.class.getName());
outStream.writeInt(writeAheadLogVersion);
outStream.writeUTF(serde.getClass().getName());
outStream.writeInt(serde.getVersion());
serde.writeHeader(outStream);
outStream.flush();
dataOut = outStream;
fileOut = fos;
}
public Set<Object> recoverNextTransaction(final Map<Object, S> currentRecordMap, final Map<Object, S> updatedRecordMap, final Set<String> swapLocations) throws IOException {
final Set<Object> idsRemoved = new HashSet<>();
int transactionFlag;
do {
final S record;
try {
record = serde.deserializeEdit(recoveryIn, currentRecordMap, recoveryVersion);
if (record == null) {
throw new EOFException();
}
} catch (final EOFException eof) {
throw eof;
} catch (final Exception e) {
// If the stream consists solely of NUL bytes, then we want to treat it
// the same as an EOF because we see this happen when we suddenly lose power
// while writing to a file. We also have logic already in the caller of this
// method to properly handle EOFException's, so we will simply throw an EOFException
// ourselves. However, if that is not the case, then something else has gone wrong.
// In such a case, there is not much that we can do. If we simply skip over the transaction,
// then the transaction may be indicating that a new attribute was added or changed. Or the
// content of the FlowFile changed. A subsequent transaction for the same FlowFile may then
// update the connection that is holding the FlowFile. In this case, if we simply skip over
// the transaction, we end up with a FlowFile in a queue that has the wrong attributes or
// content, and that can result in some very bad behavior - even security vulnerabilities if
// a Route processor, for instance, routes incorrectly due to a missing attribute or content
// is pointing to a previous claim where sensitive values have not been removed, etc. So
// instead of attempting to skip the transaction and move on, we instead just throw the Exception
// indicating that the write-ahead log is corrupt and allow the user to handle it as he/she sees
// fit (likely this will result in deleting the repo, but it's possible that it could be repaired
// manually or through some sort of script).
if (remainingBytesAllNul(recoveryIn)) {
final EOFException eof = new EOFException("Failed to recover data from Write-Ahead Log Partition because encountered trailing NUL bytes. "
+ "This will sometimes happen after a sudden power loss. The rest of this journal file will be skipped for recovery purposes.");
eof.addSuppressed(e);
throw eof;
} else {
throw e;
}
}
if (logger.isDebugEnabled()) {
logger.debug("{} Recovering Transaction {}: {}", new Object[] { this, maxTransactionId.get(), record });
}
final Object recordId = serde.getRecordIdentifier(record);
final UpdateType updateType = serde.getUpdateType(record);
if (updateType == UpdateType.DELETE) {
updatedRecordMap.remove(recordId);
idsRemoved.add(recordId);
} else if (updateType == UpdateType.SWAP_IN) {
final String location = serde.getLocation(record);
if (location == null) {
logger.error("Recovered SWAP_IN record from edit log, but it did not contain a Location; skipping record");
} else {
swapLocations.remove(location);
updatedRecordMap.put(recordId, record);
idsRemoved.remove(recordId);
}
} else if (updateType == UpdateType.SWAP_OUT) {
final String location = serde.getLocation(record);
if (location == null) {
logger.error("Recovered SWAP_OUT record from edit log, but it did not contain a Location; skipping record");
} else {
swapLocations.add(location);
updatedRecordMap.remove(recordId);
idsRemoved.add(recordId);
}
} else {
updatedRecordMap.put(recordId, record);
idsRemoved.remove(recordId);
}
transactionFlag = recoveryIn.read();
} while (transactionFlag != TRANSACTION_COMMIT);
return idsRemoved;
}
/**
* Must be called after recovery has finished
*
* @return max recovered transaction id
*/
public long getMaxRecoveredTransactionId() {
return maxTransactionId.get();
}
@Override
public String toString() {
return description;
}
}
}