/* This file is part of VoltDB.
* Copyright (C) 2008-2018 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop_voltpatches.util.PureJavaCrc32;
import org.apache.hadoop_voltpatches.util.PureJavaCrc32C;
import org.json_voltpatches.JSONObject;
import org.json_voltpatches.JSONStringer;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.Bits;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.DBBPool;
import org.voltcore.utils.DBBPool.BBContainer;
import org.voltdb.messaging.FastSerializer;
import org.voltdb.sysprocs.saverestore.SnapshotUtil;
import org.voltdb.utils.CompressionService;
import org.voltdb.utils.PosixAdvise;
import com.google_voltpatches.common.util.concurrent.Callables;
import com.google_voltpatches.common.util.concurrent.Futures;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.ListeningExecutorService;
import com.google_voltpatches.common.util.concurrent.ListeningScheduledExecutorService;
import com.google_voltpatches.common.util.concurrent.MoreExecutors;
import com.google_voltpatches.common.util.concurrent.UnsynchronizedRateLimiter;
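/*
 * Snapshot data target that writes snapshot blocks to a single file on local disk.
 * The constructor writes the snapshot header (CRC, version, JSON metadata, schema)
 * up front; subsequent writes compress tuple blocks and append them to the file,
 * while a periodic task syncs written data and uses posix_fadvise to keep snapshot
 * data from polluting the page cache. Writes may be rate limited via the
 * SNAPSHOT_RATELIMIT_MEGABYTES system property.
 */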
public class DefaultSnapshotDataTarget implements SnapshotDataTarget {
/*
* Make it possible for test code to block a write and thus snapshot completion
*/
public static volatile CountDownLatch m_simulateBlockedWrite = null;
public static volatile boolean m_simulateFullDiskWritingHeader = false;
public static volatile boolean m_simulateFullDiskWritingChunk = false;
private final File m_file;
private final FileChannel m_channel;
private final FileOutputStream m_fos;
private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");
private Runnable m_onCloseHandler = null;
/*
* If a write fails then this snapshot is hosed.
* Set the flag so all writes return immediately. The system still
* needs to scan all the tables to clear the dirty bits
* so the process continues as if the writes are succeeding.
* A more efficient failure mode would do the scan but not the
* extra serialization work.
*/
private volatile boolean m_writeFailed = false;
private volatile IOException m_writeException = null;
private volatile IOException m_reportedSerializationFailure = null;
private volatile long m_bytesWritten = 0;
private static final Semaphore m_bytesAllowedBeforeSync = new Semaphore((1024 * 1024) * 256);
private final AtomicInteger m_bytesWrittenSinceLastSync = new AtomicInteger(0);
private final ScheduledFuture<?> m_syncTask;
/*
* Accept a single write even though simulating a full disk is enabled.
*/
private volatile boolean m_acceptOneWrite = false;
private boolean m_needsFinalClose = true;
@SuppressWarnings("unused")
private final String m_tableName;
private final AtomicInteger m_outstandingWriteTasks = new AtomicInteger(0);
private final ReentrantLock m_outstandingWriteTasksLock = new ReentrantLock();
private final Condition m_noMoreOutstandingWriteTasksCondition =
m_outstandingWriteTasksLock.newCondition();
private static final ListeningExecutorService m_es = CoreUtils.getListeningSingleThreadExecutor("Snapshot write service ");
static final ListeningScheduledExecutorService m_syncService = MoreExecutors.listeningDecorator(
Executors.newSingleThreadScheduledExecutor(CoreUtils.getThreadFactory("Snapshot sync service")));
public static final int SNAPSHOT_SYNC_FREQUENCY = Integer.getInteger("SNAPSHOT_SYNC_FREQUENCY", 500);
public static final int SNAPSHOT_FADVISE_BYTES = Integer.getInteger("SNAPSHOT_FADVISE_BYTES", 1024 * 1024 * 2);
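/*
 * Optional rate limit for snapshot writes, configured via the
 * SNAPSHOT_RATELIMIT_MEGABYTES system property (parsed in the static initializer below)
 */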
public static final int SNAPSHOT_RATELIMIT_MEGABYTES;
public static final boolean USE_SNAPSHOT_RATELIMIT;
static {
int limit = Integer.getInteger("SNAPSHOT_RATELIMIT_MEGABYTES", Integer.MAX_VALUE);
if (limit < 1) {
SNAP_LOG.warn("Invalid snapshot rate limit " + limit + ", no limit will be applied");
SNAPSHOT_RATELIMIT_MEGABYTES = Integer.MAX_VALUE;
} else {
SNAPSHOT_RATELIMIT_MEGABYTES = limit;
}
if (SNAPSHOT_RATELIMIT_MEGABYTES < Integer.MAX_VALUE) {
USE_SNAPSHOT_RATELIMIT = true;
SNAP_LOG.info("Rate limiting snapshots to " + SNAPSHOT_RATELIMIT_MEGABYTES + " megabytes/second");
} else {
USE_SNAPSHOT_RATELIMIT = false;
}
}
public static final UnsynchronizedRateLimiter SNAPSHOT_RATELIMITER =
UnsynchronizedRateLimiter.create(SNAPSHOT_RATELIMIT_MEGABYTES * 1024.0 * 1024.0, 1, TimeUnit.SECONDS);
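/*
 * Block until the requested number of bytes (permits) may be written.
 * Does nothing when rate limiting is disabled.
 */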
public static void enforceSnapshotRateLimit(int permits) {
if (USE_SNAPSHOT_RATELIMIT) {
SNAPSHOT_RATELIMITER.acquire(permits);
}
}
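/*
 * Convenience constructor that delegates to the full constructor using
 * the default snapshot file version { 0, 0, 0, 2 }
 */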
public DefaultSnapshotDataTarget(
final File file,
final int hostId,
final String clusterName,
final String databaseName,
final String tableName,
final int numPartitions,
final boolean isReplicated,
final List<Integer> partitionIds,
final VoltTable schemaTable,
final long txnId,
final long timestamp) throws IOException {
this(
file,
hostId,
clusterName,
databaseName,
tableName,
numPartitions,
isReplicated,
partitionIds,
schemaTable,
txnId,
timestamp,
new int[] { 0, 0, 0, 2 });
}
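/*
 * Open the target file and synchronously write the snapshot header: a CRC and
 * length placeholder, the four version ints, the JSON metadata describing the
 * snapshot, and the table schema. Also schedules the periodic task that syncs
 * and fadvises written data.
 */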
public DefaultSnapshotDataTarget(
final File file,
final int hostId,
final String clusterName,
final String databaseName,
final String tableName,
final int numPartitions,
final boolean isReplicated,
final List<Integer> partitionIds,
final VoltTable schemaTable,
final long txnId,
final long timestamp,
int version[]
) throws IOException {
String hostname = CoreUtils.getHostnameOrAddress();
m_file = file;
m_tableName = tableName;
m_fos = new FileOutputStream(file);
m_channel = m_fos.getChannel();
m_needsFinalClose = !isReplicated;
final FastSerializer fs = new FastSerializer();
fs.writeInt(0);//CRC
fs.writeInt(0);//Header length placeholder
fs.writeByte(1);//Indicate the snapshot was not completed, set to true for the CRC calculation, false later
for (int ii = 0; ii < 4; ii++) {
fs.writeInt(version[ii]);//version
}
JSONStringer stringer = new JSONStringer();
byte jsonBytes[] = null;
try {
stringer.object();
stringer.keySymbolValuePair("txnId", txnId);
stringer.keySymbolValuePair("hostId", hostId);
stringer.keySymbolValuePair("hostname", hostname);
stringer.keySymbolValuePair("clusterName", clusterName);
stringer.keySymbolValuePair("databaseName", databaseName);
stringer.keySymbolValuePair("tableName", tableName.toUpperCase());
stringer.keySymbolValuePair("isReplicated", isReplicated);
stringer.keySymbolValuePair("isCompressed", true);
stringer.keySymbolValuePair("checksumType", "CRC32C");
stringer.keySymbolValuePair("timestamp", timestamp);
/*
* The timestamp string is for human consumption, automated stuff should use
* the actual timestamp
*/
stringer.keySymbolValuePair("timestampString", SnapshotUtil.formatHumanReadableDate(timestamp));
if (!isReplicated) {
stringer.key("partitionIds").array();
for (int partitionId : partitionIds) {
stringer.value(partitionId);
}
stringer.endArray();
stringer.keySymbolValuePair("numPartitions", numPartitions);
}
stringer.endObject();
String jsonString = stringer.toString();
JSONObject jsonObj = new JSONObject(jsonString);
jsonString = jsonObj.toString(4);
jsonBytes = jsonString.getBytes("UTF-8");
} catch (Exception e) {
throw new IOException(e);
}
fs.writeInt(jsonBytes.length);
fs.write(jsonBytes);
final BBContainer container = fs.getBBContainer();
container.b().position(4);
container.b().putInt(container.b().remaining() - 4);
container.b().position(0);
final byte schemaBytes[];
schemaBytes = PrivateVoltTableFactory.getSchemaBytes(schemaTable);
final PureJavaCrc32 crc = new PureJavaCrc32();
ByteBuffer aggregateBuffer = ByteBuffer.allocate(container.b().remaining() + schemaBytes.length);
aggregateBuffer.put(container.b());
container.discard();
aggregateBuffer.put(schemaBytes);
aggregateBuffer.flip();
crc.update(aggregateBuffer.array(), 4, aggregateBuffer.capacity() - 4);
final int crcValue = (int) crc.getValue();
aggregateBuffer.putInt(crcValue).position(8);
aggregateBuffer.put((byte)0).position(0);//Haven't actually finished writing file
if (m_simulateFullDiskWritingHeader) {
m_writeException = new IOException("Disk full");
m_writeFailed = true;
m_fos.close();
throw m_writeException;
}
/*
* Be completely sure the write succeeded. If it didn't
* the disk is probably full or the path is bunk etc.
*/
m_acceptOneWrite = true;
ListenableFuture<?> writeFuture =
write(Callables.returning(DBBPool.wrapBB(aggregateBuffer)), false);
try {
writeFuture.get();
} catch (InterruptedException e) {
m_fos.close();
throw new java.io.InterruptedIOException();
} catch (ExecutionException e) {
m_fos.close();
throw m_writeException;
}
if (m_writeFailed) {
m_fos.close();
throw m_writeException;
}
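/*
 * Schedule a periodic task that syncs newly written bytes to disk in >= 4MB
 * increments and releases write permits back to m_bytesAllowedBeforeSync once
 * the data has been flushed.
 */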
ScheduledFuture<?> syncTask = null;
syncTask = m_syncService.scheduleAtFixedRate(new Runnable() {
private long fadvisedBytes = 0;
private long syncedBytes = 0;
@Override
public void run() {
//Only sync for at least 4 megabytes of data, enough to amortize the cost of seeking
//on ye olden platters. Since we are appending to a file it's actually 2 seeks.
while (m_bytesWrittenSinceLastSync.get() > (1024 * 1024 * 4)) {
final int bytesSinceLastSync = m_bytesWrittenSinceLastSync.getAndSet(0);
long positionAtSync = 0;
try {
positionAtSync = m_channel.position();
final long syncStart = syncedBytes;
syncedBytes = Bits.sync_file_range(SNAP_LOG, m_fos.getFD(), m_channel, syncStart, positionAtSync);
} catch (IOException e) {
if (!(e instanceof java.nio.channels.AsynchronousCloseException )) {
SNAP_LOG.error("Error syncing snapshot", e);
} else {
SNAP_LOG.debug("Asynchronous close syncing snapshot data, presumably graceful", e);
}
}
m_bytesAllowedBeforeSync.release(bytesSinceLastSync);
/*
* Don't pollute the page cache with snapshot data, use fadvise
* to periodically request the kernel drop pages we have written
*/
try {
if (positionAtSync - fadvisedBytes > SNAPSHOT_FADVISE_BYTES) {
//Get aligned start and end position
final long fadviseStart = fadvisedBytes;
//-1 because we don't want to drop the last page because
//we might modify it while appending
fadvisedBytes = ((positionAtSync / Bits.pageSize()) - 1) * Bits.pageSize();
final long retval = PosixAdvise.fadvise(
m_fos.getFD(),
fadviseStart,
fadvisedBytes - fadviseStart,
PosixAdvise.POSIX_FADV_DONTNEED );
if (retval != 0) {
SNAP_LOG.error("Error fadvising snapshot data: " + retval);
SNAP_LOG.error(
"Params offset " + fadviseStart +
" length " + (fadvisedBytes - fadviseStart));
}
}
} catch (Throwable t) {
SNAP_LOG.error("Error fadvising snapshot data", t);
}
}
}
}, SNAPSHOT_SYNC_FREQUENCY, SNAPSHOT_SYNC_FREQUENCY, TimeUnit.MILLISECONDS);
m_syncTask = syncTask;
}
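/*
 * Record a serialization failure reported by the EE; close() will mark the
 * snapshot file as incomplete and rethrow this exception
 */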
@Override
public void reportSerializationFailure(IOException ex) {
m_reportedSerializationFailure = ex;
}
@Override
public boolean needsFinalClose()
{
return m_needsFinalClose;
}
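/*
 * Wait for all outstanding write tasks to drain, stop the periodic sync task,
 * force remaining data to disk, then record whether the snapshot completed
 * successfully in the byte at offset 8 of the header before closing the file
 */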
@Override
public void close() throws IOException, InterruptedException {
try {
m_outstandingWriteTasksLock.lock();
try {
while (m_outstandingWriteTasks.get() > 0) {
m_noMoreOutstandingWriteTasksCondition.await();
}
} finally {
m_outstandingWriteTasksLock.unlock();
}
m_syncTask.cancel(false);
ListenableFuture<?> task = m_syncService.submit(new Runnable() {
@Override
public void run() {
// Empty task to wait on 'cancel' above, since m_syncTask.get()
// will immediately throw a CancellationException
}
});
try {
task.get();
} catch (ExecutionException e) {
SNAP_LOG.error("Error waiting on snapshot sync task cancellation", e);
}
m_channel.force(false);
} finally {
m_bytesAllowedBeforeSync.release(m_bytesWrittenSinceLastSync.getAndSet(0));
}
m_channel.position(8);
ByteBuffer completed = ByteBuffer.allocate(1);
if (m_writeFailed || m_reportedSerializationFailure != null) {
completed.put((byte)0).flip();
} else {
completed.put((byte)1).flip();
}
m_channel.write(completed);
m_channel.force(false);
m_channel.close();
if (m_onCloseHandler != null) {
m_onCloseHandler.run();
}
if (m_reportedSerializationFailure != null) {
// There was an error reported by the EE during serialization
throw m_reportedSerializationFailure;
}
}
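/*
 * Returns 0: this target writes its own file header in the constructor and
 * frames compressed blocks itself, so callers need not reserve header space
 */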
@Override
public int getHeaderSize() {
return 0;
}
/*
* Prepend length is basically synonymous with writing actual tuple data and not
* the header.
*/
private ListenableFuture<?> write(final Callable<BBContainer> tupleDataC, final boolean prependLength) {
/*
* Unwrap the data to be written. For the traditional
* snapshot data target this should be a noop.
*/
BBContainer tupleDataTemp;
try {
tupleDataTemp = tupleDataC.call();
/*
* Can be null if the dedupe filter nulled out the buffer
*/
if (tupleDataTemp == null) {
return Futures.immediateFuture(null);
}
} catch (Throwable t) {
return Futures.immediateFailedFuture(t);
}
final BBContainer tupleDataCont = tupleDataTemp;
if (m_writeFailed) {
tupleDataCont.discard();
return null;
}
ByteBuffer tupleData = tupleDataCont.b();
m_outstandingWriteTasks.incrementAndGet();
Future<BBContainer> compressionTask = null;
if (prependLength) {
BBContainer cont =
DBBPool.allocateDirectAndPool(SnapshotSiteProcessor.m_snapshotBufferCompressedLen);
//Skip 4-bytes so the partition ID is not compressed
//That way if we detect a corruption we know what partition is bad
tupleData.position(tupleData.position() + 4);
/*
* Leave 12 bytes, it's going to be a 4-byte length prefix, a 4-byte partition id,
* and a 4-byte CRC32C of just the header bytes, in addition to the compressed payload CRC
* that is 16 bytes, but 4 of those are done by CompressionService
*/
cont.b().position(12);
compressionTask = CompressionService.compressAndCRC32cBufferAsync(tupleData, cont);
}
final Future<BBContainer> compressionTaskFinal = compressionTask;
ListenableFuture<?> writeTask = m_es.submit(new Callable