/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.journalkeeper.core.journal;


import io.journalkeeper.core.api.JournalEntry;
import io.journalkeeper.core.api.JournalEntryParser;
import io.journalkeeper.core.api.RaftJournal;
import io.journalkeeper.core.exception.JournalException;
import io.journalkeeper.exceptions.IndexOverflowException;
import io.journalkeeper.exceptions.IndexUnderflowException;
import io.journalkeeper.persistence.BufferPool;
import io.journalkeeper.persistence.JournalPersistence;
import io.journalkeeper.persistence.PersistenceFactory;
import io.journalkeeper.persistence.TooManyBytesException;
import io.journalkeeper.utils.ThreadSafeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.Flushable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * @author LiYue
 * Date: 2019-03-15
 */
public class Journal implements RaftJournal, Flushable, Closeable {
    private static final Logger logger = LoggerFactory.getLogger(Journal.class);
    private static final String PARTITION_PATH = "index";
    private static final String INDEX_PATH = "index/all";
    public static final int INDEX_STORAGE_SIZE = Long.BYTES;
    private static final String JOURNAL_PROPERTIES_PATTERN = "^persistence\\.journal\\.(.*)$";
    private static final String INDEX_PROPERTIES_PATTERN = "^persistence\\.index\\.(.*)$";
    private static final Properties DEFAULT_JOURNAL_PROPERTIES = new Properties();
    private static final Properties DEFAULT_INDEX_PROPERTIES = new Properties();

    private final AtomicLong commitIndex = new AtomicLong(0L);
    // Scratch buffer for serializing index values before writing
    private final byte [] indexBytes = new byte [INDEX_STORAGE_SIZE];
    private final ByteBuffer indexBuffer = ByteBuffer.wrap(indexBytes);

    // Scratch buffer for serializing index values during commit
    private final byte [] commitIndexBytes = new byte [INDEX_STORAGE_SIZE];
    private final ByteBuffer commitIndexBuffer = ByteBuffer.wrap(commitIndexBytes);

    static {
        DEFAULT_JOURNAL_PROPERTIES.put("file_data_size", String.valueOf(128 * 1024 * 1024));
        DEFAULT_JOURNAL_PROPERTIES.put("cached_file_core_count", String.valueOf(3));
        DEFAULT_JOURNAL_PROPERTIES.put("cached_file_max_count", String.valueOf(10));
        DEFAULT_JOURNAL_PROPERTIES.put("max_dirty_size", String.valueOf(128 * 1024 * 1024));
        DEFAULT_INDEX_PROPERTIES.put("file_data_size", String.valueOf(512 * 1024));
        DEFAULT_INDEX_PROPERTIES.put("cached_file_core_count", String.valueOf(10));
        DEFAULT_INDEX_PROPERTIES.put("cached_file_max_count", String.valueOf(20));
    }
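
    // Note on configuration: replacePropertiesNames() below maps user-supplied
    // properties onto these defaults by stripping the "persistence.journal." /
    // "persistence.index." prefixes. For example (hypothetical values):
    //
    //   persistence.journal.file_data_size=33554432  ->  file_data_size=33554432
    //   persistence.index.cached_file_max_count=50   ->  cached_file_max_count=50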

    private final JournalPersistence indexPersistence;
    private final JournalPersistence journalPersistence;
    private final Map<Integer, JournalPersistence> partitionMap;
    private final PersistenceFactory persistenceFactory;
    private final BufferPool bufferPool;

    private Path basePath = null;
    private Properties indexProperties;
    private final ReadWriteLock readWriteLock = new ReentrantReadWriteLock();
    private final JournalEntryParser journalEntryParser;

    public Journal(PersistenceFactory persistenceFactory, BufferPool bufferPool, JournalEntryParser journalEntryParser) {
        this.indexPersistence = persistenceFactory.createJournalPersistenceInstance();
        this.journalPersistence = persistenceFactory.createJournalPersistenceInstance();
        this.persistenceFactory = persistenceFactory;
        this.journalEntryParser = journalEntryParser;
        this.partitionMap = new ConcurrentHashMap<>();
        this.bufferPool = bufferPool;
    }

    @Override
    public long minIndex() {
        return indexPersistence.min() / INDEX_STORAGE_SIZE;
    }

    @Override
    public long maxIndex() {
        return indexPersistence.max() / INDEX_STORAGE_SIZE;
    }

    @Override
    public long minIndex(int partition) {
        return getPartitionPersistence(partition).min() / INDEX_STORAGE_SIZE;
    }

    @Override
    public long maxIndex(int partition) {
        return getPartitionPersistence(partition).max() / INDEX_STORAGE_SIZE;
    }

    /**
     * Deletes data before the position given by the snapshot.
     * Data before that position is not guaranteed to be physically deleted;
     * data at and after that position (inclusive) is guaranteed to be retained.
     *
     * @param journalSnapshot Snapshot that defines the minimum safe index position.
     * @throws IOException If an I/O error occurs.
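     *
     * <p>Usage sketch ({@code journal} and {@code snapshot} are illustrative names):
     * <pre>{@code
     * if (snapshot.minIndex() > journal.minIndex()) {
     *     journal.compact(snapshot); // drops journal data and indices before the snapshot
     * }
     * }</pre>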
     */
    public void compact(JournalSnapshot journalSnapshot) throws IOException {
        if (journalSnapshot.minIndex() <= minIndex()) {
            return;
        }

        if (journalSnapshot.minIndex() > commitIndex()) {
            throw new IllegalArgumentException(
                    String.format("Given snapshot min index %s must not be greater than commit index %s!",
                            ThreadSafeFormat.formatWithComma(journalSnapshot.minIndex()),
                            ThreadSafeFormat.formatWithComma(commitIndex())
                    )
            );
        }

        // Compact the partition indices
        Map<Integer, Long> partitionMinIndices = journalSnapshot.partitionMinIndices();
        synchronized (partitionMap) {
            Iterator<Map.Entry<Integer, JournalPersistence>> iterator = partitionMap.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<Integer, JournalPersistence> entry = iterator.next();
                int partition = entry.getKey();
                JournalPersistence partitionPersistence = entry.getValue();

                if (partitionMinIndices.containsKey(partition)) {
                    long minPartitionIndex = partitionMinIndices.get(partition);
                    partitionPersistence.compact(minPartitionIndex * INDEX_STORAGE_SIZE);
                } else {
                    // Partition absent from the snapshot: remove it entirely
                    partitionPersistence.close();
                    partitionPersistence.delete();
                    iterator.remove();
                }
            }

            for (Map.Entry<Integer, Long> entry : partitionMinIndices.entrySet()) {
                int partition = entry.getKey();
                if (!partitionMap.containsKey(partition)) {
                    JournalPersistence partitionPersistence = persistenceFactory.createJournalPersistenceInstance();
                    partitionPersistence.recover(basePath.resolve(PARTITION_PATH).resolve(String.valueOf(partition)),
                            partitionMinIndices.get(partition) * INDEX_STORAGE_SIZE,
                            indexProperties);
                    partitionMap.put(partition, partitionPersistence);
                }
            }
        }

        // Compact the global index
        indexPersistence.compact(journalSnapshot.minIndex() * INDEX_STORAGE_SIZE);

        // Compact the journal data
        journalPersistence.compact(journalSnapshot.minOffset());

    }


    /**
     * Appends a StorageEntry to the journal.
     * @param entry The entry to append.
     * @return The max global index of the journal after the entry is written.
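     *
     * <p>Usage sketch ({@code entry} is obtained elsewhere, e.g. from a
     * {@code JournalEntryParser} implementation; names are illustrative):
     * <pre>{@code
     * long maxIndex = journal.append(entry); // the entry itself is at maxIndex - 1
     * journal.commit(maxIndex);              // commit everything written so far
     * }</pre>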
     */
    public long append(JournalEntry entry) {
        // Current max position, which is also the journal offset of the entry being written
        long offset = journalPersistence.max();
        byte [] serializedEntry = entry.getSerializedBytes();
        try {
            // Write the serialized entry to the journal
            journalPersistence.append(serializedEntry);

            // Write the global index
            indexBuffer.clear();
            indexBuffer.putLong(offset);
            indexPersistence.append(indexBytes);

        }  catch (IOException e) {
            throw new JournalException(e);
        }

        return maxIndex();
    }

    /**
     * Clears the entire journal.
     *
     * @param snapshot Defines the starting position of the journal after clearing.
     * @throws IOException If an I/O error occurs.
     */
    public void clear(JournalSnapshot snapshot) throws IOException {
        commitIndex.set(snapshot.minIndex());
        Map<Integer, Long> partitionMinIndices = snapshot.partitionMinIndices();
        synchronized (partitionMap) {
            Iterator<Map.Entry<Integer, JournalPersistence>> iterator = partitionMap.entrySet().iterator();
            while (iterator.hasNext()) {
                JournalPersistence partitionPersistence = iterator.next().getValue();
                partitionPersistence.close();
                partitionPersistence.delete();
                iterator.remove();
            }

            for (Map.Entry<Integer, Long> entry : partitionMinIndices.entrySet()) {
                int partition = entry.getKey();
                JournalPersistence partitionPersistence = persistenceFactory.createJournalPersistenceInstance();
                partitionPersistence.recover(basePath.resolve(PARTITION_PATH).resolve(String.valueOf(partition)),
                        partitionMinIndices.get(partition) * INDEX_STORAGE_SIZE,
                        indexProperties);
                partitionMap.put(partition, partitionPersistence);
            }
        }
    }

    /**
     * Appends a batch of StorageEntries to the journal.
     * @param entries The entries to append.
     * @return For each entry, the max global index after that entry is written.
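     *
     * <p>Usage sketch ({@code entries} prepared by the caller):
     * <pre>{@code
     * List<Long> indices = journal.append(entries);
     * journal.commit(indices.get(indices.size() - 1)); // commit up to the last written entry
     * }</pre>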
     */
    public List<Long> append(List<JournalEntry> entries) {
        // Current max position, which is also the journal offset of the first entry being written
        long offset = journalPersistence.max();
        long index = maxIndex();
        ByteBuffer entryBuffer = ByteBuffer.wrap(new byte[entries.stream().mapToInt(e -> e.getSerializedBytes().length).sum()]);
        ByteBuffer indicesBuffer = ByteBuffer.wrap(new byte[entries.size() * INDEX_STORAGE_SIZE]);
        List<Long> indices = new ArrayList<>(entries.size());
        for (JournalEntry entry : entries) {

            entryBuffer.put(entry.getSerializedBytes());
            indicesBuffer.putLong(offset);
            offset += entry.getLength();
            indices.add(++index);
        }

        try {
            // Write the serialized entries to the journal
            journalPersistence.append(entryBuffer.array());

            // Write the global index
            indexPersistence.append(indicesBuffer.array());

        }  catch (IOException e) {
            throw new JournalException(e);
        }

        return indices;
    }

    // Not thread safe!

    /**
     * Commits the journal up to the given index. Committed entries are immutable.
     * @param index The global index to commit up to.
     * @throws IOException If an I/O error occurs.
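     *
     * <p>Single-threaded sketch: per-partition indices are built here rather than on
     * append, so entries become readable by partition only after commit
     * ({@code maxIndex} as returned by an append call):
     * <pre>{@code
     * journal.commit(maxIndex);
     * assert journal.commitIndex() == maxIndex;
     * }</pre>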
     */
    public void commit(long index) throws IOException {
        long finalCommitIndex;
        while ((finalCommitIndex = commitIndex.get()) < index &&
                finalCommitIndex < maxIndex() &&
                commitIndex.compareAndSet(finalCommitIndex, finalCommitIndex + 1)) {
            long offset = readOffset(finalCommitIndex);

            JournalEntry header = readEntryHeaderByOffset(offset);
            commitIndexBuffer.clear();
            commitIndexBuffer.putLong(offset);
            appendPartitionIndex(commitIndexBytes, header.getPartition(), header.getBatchSize());
        }
    }

    private JournalEntry readEntryHeaderByOffset(long offset) {
        try {
            byte [] headerBytes = journalPersistence.read(offset, journalEntryParser.headerLength());
            return journalEntryParser.parseHeader(headerBytes);
        } catch (IOException e) {
            throw new JournalException(e);
        }
    }

    /**
     * Gets the current committed global index.
     * @return The current committed global index.
     */
    public long commitIndex() {
        return commitIndex.get();
    }
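
    /*
     * Partition index layout, as written by appendPartitionIndex() below: a batch of
     * N entries occupies N consecutive index slots. Slot 0 holds the journal offset
     * of the batch; slots 1..N-1 hold the negative distance back to slot 0.
     * Illustrative example for a batch of 3 stored at journal offset 4096:
     *
     *   slot i   : 4096
     *   slot i+1 : -1
     *   slot i+2 : -2
     *
     * readByPartition() resolves a negative value by jumping back to the batch head
     * and reports the position inside the batch via JournalEntry#setOffset.
     */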

    private void appendPartitionIndex(byte [] offset, int partition, int batchSize) throws IOException {
        // Create the partition if it does not exist
        if(!partitionMap.containsKey(partition)) {
            addPartition(partition, 0L);
        }
        JournalPersistence partitionPersistence = getPartitionPersistence(partition);
        if(batchSize > 1) {
            byte[] bytes = new byte[batchSize * INDEX_STORAGE_SIZE];
            ByteBuffer pb = ByteBuffer.wrap(bytes);
            int j = 0;
            while (j < batchSize) {
                if (j == 0) {
                    pb.put(offset);
                } else {
                    pb.putLong(-1 * j);
                }
                j++;
            }
            partitionPersistence.append(bytes);
        } else {
            partitionPersistence.append(offset);
        }
    }


    private JournalPersistence getPartitionPersistence(int partition) {
        JournalPersistence partitionPersistence = partitionMap.get(partition);
        if(null == partitionPersistence) {
            throw new NosuchPartitionException(partition);
        }
        return partitionPersistence;
    }
    /**
     * Appends a batch of serialized StorageEntries. Used during Raft replication:
     * the follower writes entries replicated from the leader.
     * @param storageEntries Entries replicated from the leader.
     * @return The max global index after the entries are written.
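     *
     * <p>Follower-side sketch (names illustrative; the serialized entries arrive from the leader):
     * <pre>{@code
     * long maxIndex = journal.appendBatchRaw(rawEntriesFromLeader);
     * // partition indices are not built here; they are created as commit() advances
     * }</pre>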
     */
    public long appendBatchRaw(List<byte[]> storageEntries) {
        // Compute the journal offset of each entry
        long [] offsets = new long[storageEntries.size()];
        long offset = journalPersistence.max();
        for (int i = 0; i < offsets.length; i++) {
            offsets[i] = offset;
            offset += storageEntries.get(i).length;
        }


        try {
            // Write to the journal
            for (byte [] storageEntry : storageEntries) {
                journalPersistence.append(storageEntry);
            }

            // Write the global index
            int indexBufferLength = INDEX_STORAGE_SIZE * storageEntries.size();
            ByteBuffer indexBuffer = ByteBuffer.allocate(indexBufferLength);

            for (long index: offsets) {
                indexBuffer.putLong(index);
            }
            indexBuffer.flip();
            try {
                indexPersistence.append(indexBuffer.array());
            } catch (TooManyBytesException e) {
                // If the batch write exceeds the size limit, fall back to writing one index at a time
                ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES);
                for (long index: offsets) {
                    buffer.putLong(0, index);
                    indexPersistence.append(buffer.array());
                }
            }

            // Partition indices are written later, when commit() advances

        }  catch (IOException e) {
            throw new JournalException(e);
        }

        return maxIndex();
    }

    @Override
    public JournalEntry readByPartition(int partition, long index) {
        JournalPersistence pp = getPartitionPersistence(partition);
        long offset = readOffset(pp, index);
        long journalOffset;
        int relIndex;
        if(offset < 0) {
            journalOffset = readOffset(pp, index + offset);
            relIndex = (int)(-1 * offset);
        } else {
            journalOffset = offset;
            relIndex = 0;
        }

        JournalEntry journal = readByOffset(journalOffset);
        journal.setOffset(relIndex);
        return journal;

    }

    @Override
    public List<JournalEntry> batchReadByPartition(int partition, long startPartitionIndex, int maxSize) {
        List<JournalEntry> list = new LinkedList<>();
        int size = 0;
        long index = startPartitionIndex;
        while (size < maxSize) {
            JournalEntry batchEntry = readByPartition(partition, index);
            int count = batchEntry.getBatchSize() - batchEntry.getOffset();
            size += count;
            index += count;
            list.add(batchEntry);
        }
        return list;
    }


    public JournalEntry read(long index) {
        return journalEntryParser.parse(readRaw(index));
    }

    private JournalEntry readByOffset(long offset) {
        return journalEntryParser.parse(readRawByOffset(offset));
    }

    private byte [] readRaw(long index) {
        readWriteLock.readLock().lock();
        checkIndex(index);
        try {
            long offset = readOffset(index);
            return readRawByOffset(offset);
        } finally {
            readWriteLock.readLock().unlock();
        }
    }

    private byte [] readRawByOffset(long offset) {
        readWriteLock.readLock().lock();
        try {
            int length = readEntryLengthByOffset(offset);
            return journalPersistence
                    .read(offset, length);
        } catch (IOException e) {
            throw new JournalException(e);
        } finally {
            readWriteLock.readLock().unlock();
        }
    }
    private int readEntryLengthByOffset(long offset) {
        return readEntryHeaderByOffset(offset).getLength();
    }

    public long readOffset(long index) {
        return readOffset(indexPersistence, index);
    }
    private long readOffset(JournalPersistence indexPersistence, long index) {
        try {
            byte [] indexBytes = indexPersistence.read(index * INDEX_STORAGE_SIZE, INDEX_STORAGE_SIZE);
            return ByteBuffer.wrap(indexBytes).getLong();
        } catch (IOException e) {
            throw new JournalException(e);
        }
    }

    public List<JournalEntry> batchRead(long index, int size) {
        checkIndex(index);
        List<JournalEntry> list = new ArrayList<>(size);
        long i = index;
        while (list.size() < size && i < maxIndex()) {
            list.add(read(i++));
        }
        return list;
    }


    /**
     * Reads a batch of StorageEntries without deserializing them.
     * @param index Starting global index.
     * @param size Expected number of entries to read.
     * @return List of serialized StorageEntries.
     * @throws IndexUnderflowException If index is less than minIndex().
     * @throws IndexOverflowException If index is not less than maxIndex().
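     *
     * <p>Leader-side sketch: fetch serialized entries to replicate to a follower
     * (names illustrative):
     * <pre>{@code
     * List<byte[]> raw = journal.readRaw(followerNextIndex, batchSize);
     * }</pre>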
     */
    public List<byte[]> readRaw(long index, int size) {
        checkIndex(index);
        List<byte[]> list = new ArrayList<>(size);
        long i = index;
        while (list.size() < size && i < maxIndex()) {
            list.add(readRaw(i++));
        }
        return list;
    }
    /**
     * Reads the term of the entry at the given index position.
     * @param index Global index position.
     * @return The term of the entry.
     * @throws IndexUnderflowException If index is less than minIndex().
     * @throws IndexOverflowException If index is not less than maxIndex().
     */
    public int getTerm(long index) {
        readWriteLock.readLock().lock();
        try {
            if(index == -1) return -1;
            checkIndex(index);
            long offset = readOffset(index);
            return readEntryHeaderByOffset(offset).getTerm();
        } finally {
            readWriteLock.readLock().unlock();
        }
    }



    /**
     * Starting at startIndex:
     * if an existing entry conflicts with a new one (same index but different term),
     * delete the existing entry and all entries after it;
     * then append any entries not already present in the journal.
     * @param rawEntries Entries to compare and append.
     * @param startIndex Starting global index.
     * @throws IndexUnderflowException If startIndex is less than minIndex().
     * @throws IndexOverflowException If startIndex is not less than maxIndex().
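     *
     * <p>Raft follower sketch for handling an AppendEntries request (request fields illustrative):
     * <pre>{@code
     * journal.compareOrAppendRaw(request.getEntries(), request.getPrevLogIndex() + 1);
     * journal.commit(Math.min(request.getLeaderCommit(), journal.maxIndex()));
     * }</pre>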
     */
    public void compareOrAppendRaw(List<byte[]> rawEntries, long startIndex) {
        List<JournalEntry> entries = rawEntries.stream()
                .map(journalEntryParser::parse)
                .collect(Collectors.toList());
        try {
            long index = startIndex;
            for (int i = 0; i < entries.size(); i++, index++) {
                if (index < maxIndex() && getTerm(index) != entries.get(i).getTerm()) {
                    readWriteLock.writeLock().lock();
                    try {
                        truncate(index);
                    } finally {
                        readWriteLock.writeLock().unlock();
                    }
                }
                if (index == maxIndex()) {
                    appendBatchRaw(rawEntries.subList(i, entries.size()));
                    break;
                }
            }
        } catch (IOException e) {
            throw new JournalException(e);
        }
    }

    /**
     * Truncates the journal, removing everything at and after the given index.
     * @param index Global index to truncate from.
     * @throws IndexUnderflowException If index < minIndex().
     * @throws IndexOverflowException If index >= maxIndex().
     */
    private void truncate(long index) throws IOException {

        long journalOffset = readOffset(index);
        truncatePartitions(journalOffset);
        indexPersistence.truncate(index * INDEX_STORAGE_SIZE);
        journalPersistence.truncate(journalOffset);

        // Safely bring commitIndex back within the new bounds
        long finalCommitIndex;
        while ((finalCommitIndex = commitIndex.get()) > maxIndex() &&
                !commitIndex.compareAndSet(finalCommitIndex, maxIndex())) {
            Thread.yield();
        }

    }

    private void truncatePartitions(long journalOffset) throws IOException {
        for (JournalPersistence partitionPersistence : partitionMap.values()) {
            long position = partitionPersistence.max() - INDEX_STORAGE_SIZE;
            while (position > partitionPersistence.min()) {
                long offset = readOffset(partitionPersistence, position / INDEX_STORAGE_SIZE );
                if(offset < journalOffset) {
                    break;
                }
                position -= INDEX_STORAGE_SIZE;
            }
            partitionPersistence.truncate(position <= partitionPersistence.min()? 0L : position + INDEX_STORAGE_SIZE);
        }
    }

    /**
     * Recovers the journal from the given path:
     * 1. Truncate any incomplete data at the end of the journal and index files.
     * 2. Using the journal as the source of truth, repair the global and partition indices:
     *    remove extra indices and rebuild missing ones.
     * @param path Recovery directory.
     * @param commitIndex Current committed global index of the journal.
     * @param journalSnapshot Snapshot.
     * @param properties Configuration properties.
     * @throws IOException If an I/O error occurs.
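     *
     * <p>Recovery sketch (paths, snapshot and properties are illustrative):
     * <pre>{@code
     * Journal journal = new Journal(persistenceFactory, bufferPool, journalEntryParser);
     * journal.recover(journalPath, savedCommitIndex, savedSnapshot, properties);
     * }</pre>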
     */
    public void recover(Path path, long commitIndex, JournalSnapshot journalSnapshot, Properties properties) throws IOException {
        this.basePath = path;
        Path indexPath = path.resolve(INDEX_PATH);
        Path partitionPath = path.resolve(PARTITION_PATH);
        Properties journalProperties = replacePropertiesNames(properties,
                JOURNAL_PROPERTIES_PATTERN, DEFAULT_JOURNAL_PROPERTIES);
        journalPersistence.recover(path, journalSnapshot.minOffset(), journalProperties);
        // Truncate any partial entry at the tail
        truncateJournalTailPartialEntry();

        indexProperties = replacePropertiesNames(properties,
                INDEX_PROPERTIES_PATTERN, DEFAULT_INDEX_PROPERTIES);
        indexPersistence.recover(indexPath, journalSnapshot.minIndex() * INDEX_STORAGE_SIZE, indexProperties);
        // Truncate any partial index at the tail
        indexPersistence.truncate(indexPersistence.max() - indexPersistence.max() % INDEX_STORAGE_SIZE);

        // Remove extra indices
        truncateExtraIndices();

        // Rebuild missing indices
        buildMissingIndices();

        checkAndSetCommitIndex(commitIndex);

        // Recover partition indices
        recoverPartitions(partitionPath, journalSnapshot.partitionMinIndices(), indexProperties);

        flush();
        logger.info("Journal recovered, minIndex: {}, maxIndex: {}, partitions: {}, path: {}.",
                minIndex(), maxIndex(), partitionMap.keySet(), path.toAbsolutePath().toString());
    }

    private void checkAndSetCommitIndex(long commitIndex) {
        this.commitIndex.set(commitIndex);
        if (this.commitIndex.get() < minIndex()) {
            logger.info("Journal commitIndex {} is less than minIndex {}, resetting commitIndex to {}.",
                    this.commitIndex.get(), minIndex(), minIndex());
            this.commitIndex.set(minIndex());
        } else if (this.commitIndex.get() > maxIndex()) {
            logger.info("Journal commitIndex {} is greater than maxIndex {}, resetting commitIndex to {}.",
                    this.commitIndex.get(), maxIndex(), maxIndex());
            this.commitIndex.set(maxIndex());
        }
        }
    }

    private void recoverPartitions(Path partitionPath, Map<Integer, Long> partitionIndices, Properties properties) throws IOException {
        Map<Integer, Long> lastIndexedOffsetMap = new HashMap<>(partitionIndices.size());
        for (Map.Entry<Integer, Long> entry : partitionIndices.entrySet()) {
            int partition = entry.getKey();
            long lastIncludedIndex = entry.getValue();
            JournalPersistence pp = persistenceFactory.createJournalPersistenceInstance();
            pp.recover(partitionPath.resolve(String.valueOf(partition)), lastIncludedIndex * INDEX_STORAGE_SIZE, properties);
            // Truncate any partial index at the tail
            pp.truncate(pp.max() - pp.max() % INDEX_STORAGE_SIZE);
            truncateTailPartialBatchIndices(pp);

            partitionMap.put(partition, pp);
            lastIndexedOffsetMap.put(partition, getLastIndexedOffset(pp));
        }

        // Rebuild missing partition indices
        long offset = lastIndexedOffsetMap.values().stream().mapToLong(l -> l).min().orElse(journalPersistence.min());

        // Only build indices for committed entries
        long commitOffset = commitIndex.get() == maxIndex() ? journalPersistence.max() :
                readOffset(commitIndex.get());

        // Build the missing indices
        while (offset < commitOffset) {
            JournalEntry header = readEntryHeaderByOffset(offset);
            int length = readEntryLengthByOffset(offset);
            if(offset > lastIndexedOffsetMap.get(header.getPartition())) {
                indexBuffer.clear();
                indexBuffer.putLong(offset);
                appendPartitionIndex(indexBytes, header.getPartition(), header.getBatchSize());
            }
            offset += length;
        }

        // Remove partition indices for the uncommitted portion

        for (Integer partition : partitionIndices.keySet()) {
            JournalPersistence partitionPersistence = partitionMap.get(partition);
            long partitionIndex = partitionPersistence.max() / INDEX_STORAGE_SIZE - 1;
            while (partitionIndex * INDEX_STORAGE_SIZE >= partitionPersistence.min()) {
                long journalOffset = readOffset(partitionPersistence, partitionIndex);
                if (journalOffset < commitOffset) {
                    break;
                }
                partitionIndex--;
            }

            partitionIndex += 1;
            partitionPersistence.truncate(partitionIndex * INDEX_STORAGE_SIZE);

        }

    }

    private long getLastIndexedOffset(JournalPersistence pp) {
        // Read the journal offset referenced by the last index entry
        long lastIndexedOffset = journalPersistence.min();

        if (pp.max() > 0) {
            long lastIndex = pp.max() / INDEX_STORAGE_SIZE - 1;
            long lastOffset = readOffset(pp, lastIndex);

            if (lastOffset < 0) {
                long firstIndexOfBatchEntries = lastIndex + lastOffset;
                lastIndexedOffset = readOffset(pp, firstIndexOfBatchEntries);
            } else {
                lastIndexedOffset = lastOffset;
            }
        }
        return lastIndexedOffset;
    }

    private void truncateTailPartialBatchIndices(JournalPersistence pp) throws IOException {
        if (pp.max() > pp.min()) {
            // If the last index belongs to a batch message, its completeness must be checked
            long lastIndex = pp.max() / INDEX_STORAGE_SIZE - 1;
            long lastOffset = readOffset(pp, lastIndex);
            // For a batch message, verify that all indices of the batch are present;
            // if incomplete, truncate every stored index of this batch
            if (lastOffset < 0) {
                long firstIndexOfBatchEntries = lastIndex + lastOffset;
                long batchEntriesOffset = readOffset(pp, firstIndexOfBatchEntries);
                JournalEntry header = readEntryHeaderByOffset(batchEntriesOffset);
                // The number of stored indices of this batch is (lastOffset - 1) * -1
                if ((lastOffset - 1) * -1 < header.getBatchSize()) {
                    pp.truncate(firstIndexOfBatchEntries * INDEX_STORAGE_SIZE);
                }
            }
        }
    }
    private Properties replacePropertiesNames(Properties properties, String fromNameRegex, Properties defaultProperties) {
        Properties jp = new Properties(defaultProperties);
        properties.stringPropertyNames().forEach(k -> {
            String name = k.replaceAll(fromNameRegex, "$1");
            jp.setProperty(name, properties.getProperty(k));
        });
        return jp;
    }

    private void buildMissingIndices() throws IOException {
        // Find the journal offset of the first entry that has no index yet
        long indexOffset;
        if (indexPersistence.max() - INDEX_STORAGE_SIZE >= indexPersistence.min()) {
            long offset = readOffset(indexPersistence.max() / INDEX_STORAGE_SIZE - 1);
            int length  = readEntryLengthByOffset(offset);
            indexOffset = offset + length;
        } else {
            indexOffset = journalPersistence.min();
        }

        // Build the indices
        List<Long> indices = new LinkedList<>();
        while (indexOffset < journalPersistence.max()) {
            indices.add(indexOffset);
            indexOffset += readEntryLengthByOffset(indexOffset);
        }

        // Write the indices
        if(!indices.isEmpty()) {
            ByteBuffer buffer = bufferPool.allocate(indices.size() * INDEX_STORAGE_SIZE);
            indices.forEach(buffer::putLong);
            buffer.flip();
            indexPersistence.append(buffer.array());
            bufferPool.release(buffer);
        }
    }

    private void truncateExtraIndices() throws IOException {

        long position = indexPersistence.max();
        //noinspection StatementWithEmptyBody
        while ((position -= INDEX_STORAGE_SIZE) >= indexPersistence.min() && readOffset(position / INDEX_STORAGE_SIZE) >= journalPersistence.max()) {}
        indexPersistence.truncate(position + INDEX_STORAGE_SIZE);
    }

    private void truncateJournalTailPartialEntry() throws IOException {

        // Scan backwards for the last two consecutive well-formed entries

        long position = journalPersistence.max() - journalEntryParser.headerLength();
        long lastEntryPosition = -1; // position of the later entry of the last consecutive pair
        JournalEntry lastEntryHeader = null;
        while (position >= journalPersistence.min()) {
            try {
                JournalEntry header = readByOffset(position);
                // Found what looks like the start of an entry
                if (lastEntryPosition < 0) { // have we found one already?
                    // This is the last entry; remember it
                    lastEntryPosition = position;
                    lastEntryHeader = header;
                    if (lastEntryPosition == journalPersistence.min()) {
                        // A single complete entry is also acceptable; keep it.
                        truncatePartialEntry(lastEntryPosition, lastEntryHeader);
                        return;
                    }
                } else {
                    // This is the second-to-last entry
                    if (position + header.getLength() == lastEntryPosition) {
                        // The earlier entry's position plus its length equals the later
                        // entry's position, so the later entry is genuine; truncate after it.
                        truncatePartialEntry(lastEntryPosition, lastEntryHeader);
                        return;
                    } else {
                        // The previous find was a false match: with small probability an entry
                        // body contains two consecutive bytes that happen to equal MAGIC.
                        lastEntryPosition = position;
                        lastEntryHeader = header;
                    }
                }
            } catch (Exception ignored) {}
            position--;
        }

        // Reached the minimum position without finding anything: clear all data.
        journalPersistence.truncate(journalPersistence.min());
    }

    private void truncatePartialEntry(long lastEntryPosition, JournalEntry lastEntryHeader) throws IOException {
        // Check whether the last entry is complete
        if (lastEntryPosition + lastEntryHeader.getLength() <= journalPersistence.max()) {
            // Complete: truncate everything after it
            journalPersistence.truncate(lastEntryPosition + lastEntryHeader.getLength());
        } else {
            // Incomplete: truncate the entry itself
            journalPersistence.truncate(lastEntryPosition);
        }
    }

    /**
     * Flushes all in-memory journal and index data to disk.
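     *
     * <p>Sketch: typically invoked periodically from a dedicated flush thread:
     * <pre>{@code
     * if (journal.isDirty()) {
     *     journal.flush();
     * }
     * }</pre>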
     */
    @Override
    public void flush() {
        long flushed;
        do {
            flushed = Stream.concat(Stream.of(journalPersistence, indexPersistence), partitionMap.values().stream())
                    .filter(p -> p.flushed() < p.max())
                    .peek(p -> {
                        try {
                            p.flush();
                        } catch (IOException e) {
                            logger.warn("Flush {} exception: ", p.getBasePath(), e);
                            throw new JournalException(e);
                        }
                    }).count();
        } while (flushed > 0);
    }

    public boolean isDirty() {
        return Stream.concat(Stream.of(journalPersistence, indexPersistence), partitionMap.values().stream())
                .anyMatch(p -> p.flushed() < p.max());
    }

    private void checkIndex(long index) {
        long position = index * INDEX_STORAGE_SIZE;
        if(position < indexPersistence.min()) {
            throw new IndexUnderflowException();
        }
        if(position >= indexPersistence.max()) {
            throw new IndexOverflowException();
        }
    }

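    /**
     * Reconciles the partition set with the given target set: missing partitions are
     * created and existing partitions not in the set are removed.
     * Sketch (partition numbers illustrative):
     * <pre>{@code
     * journal.rePartition(new HashSet<>(Arrays.asList(0, 1, 2)));
     * }</pre>
     */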
    public void rePartition(Set<Integer> partitions) {
        try {
            synchronized (partitionMap) {
                for (int partition : partitions) {
                    if (!partitionMap.containsKey(partition)) {
                        addPartition(partition, 0L);
                    }
                }

                List<Integer> toBeRemoved = new ArrayList<>();
                for (Map.Entry<Integer, JournalPersistence> entry : partitionMap.entrySet()) {
                    if (!partitions.contains(entry.getKey())) {
                        toBeRemoved.add(entry.getKey());
                    }
                }
                for (Integer partition : toBeRemoved) {
                    removePartition(partition);
                }
                logger.info("Journal repartitioned, partitions: {}, path: {}.",
                        partitionMap.keySet(), basePath.toAbsolutePath().toString());

            }
        } catch (IOException e) {
            throw new JournalException(e);
        }
    }


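    /**
     * Finds an index on the given partition whose entry timestamp is close to
     * {@code timestamp}, returning the first of any run of equal timestamps,
     * or -1 if the lookup fails. Sketch, locating entries from the last minute:
     * <pre>{@code
     * long since = journal.queryIndexByTimestamp(0, System.currentTimeMillis() - 60_000L);
     * }</pre>
     */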
    @Override
    public long queryIndexByTimestamp(int partition, long timestamp) {
        try {
            if (partitionMap.containsKey(partition)) {
                JournalPersistence indexStore = partitionMap.get(partition);
                long searchedIndex = binarySearchByTimestamp(timestamp, indexStore, indexStore.min() / INDEX_STORAGE_SIZE, indexStore.max() / INDEX_STORAGE_SIZE - 1);

                // Several consecutive messages may share the same timestamp; return the first of them
                while (searchedIndex - 1 >= indexStore.min() / INDEX_STORAGE_SIZE && timestamp <= getStorageTimestamp(indexStore, searchedIndex - 1)) {
                    searchedIndex--;
                }
                return searchedIndex;

            }
        } catch (Throwable e) {
            logger.warn("Query index by timestamp exception: ", e);
        }
        return -1L;
    }

    // Binary search by timestamp
    private long binarySearchByTimestamp(long timestamp,
                                         JournalPersistence indexStore,
                                         long leftIndexInclude,
                                         long rightIndexInclude) {

        if (rightIndexInclude <= leftIndexInclude) {
            return -1L;
        }

        if (timestamp <= getStorageTimestamp(indexStore, leftIndexInclude)) {
            return leftIndexInclude;
        }

        if (timestamp >= getStorageTimestamp(indexStore, rightIndexInclude)) {
            return rightIndexInclude;
        }

        if (leftIndexInclude + 1 == rightIndexInclude) {
            return leftIndexInclude;
        }

        long mid = leftIndexInclude + (rightIndexInclude - leftIndexInclude) / 2;

        long midTimestamp = getStorageTimestamp(indexStore, mid);

        if (timestamp < midTimestamp) {
            return binarySearchByTimestamp(timestamp, indexStore, leftIndexInclude, mid);
        } else {
            return binarySearchByTimestamp(timestamp, indexStore, mid, rightIndexInclude);
        }
    }

    private long getStorageTimestamp(
            JournalPersistence indexStore,
            long index) {
        JournalEntry header = readEntryHeaderByOffset(readOffset(indexStore, index));
        return header.getTimestamp();
    }

    @Override
    public Set<Integer> getPartitions() {
        return new HashSet<>(partitionMap.keySet());
    }

    public void addPartition(int partition) throws IOException {
        addPartition(partition, 0L);
    }
    public void addPartition(int partition, long minIndex) throws IOException {
        synchronized (partitionMap) {
            if (!partitionMap.containsKey(partition)) {
                JournalPersistence partitionPersistence = persistenceFactory.createJournalPersistenceInstance();
                partitionPersistence.recover(
                        basePath.resolve(PARTITION_PATH).resolve(String.valueOf(partition)),
                        minIndex * INDEX_STORAGE_SIZE,
                        indexProperties);
                partitionMap.put(partition, partitionPersistence);
            }
        }
    }

    public void removePartition(int partition) throws IOException {
        synchronized (partitionMap) {
            JournalPersistence removedPersistence;
            if ((removedPersistence = partitionMap.remove(partition)) != null) {
                logger.info("Partition removed: {}, journal: {}.", partition, basePath.toAbsolutePath().toString());
                removedPersistence.close();
                removedPersistence.delete();
            }
        }
    }

    @Override
    public void close() throws IOException {
        for (JournalPersistence persistence : partitionMap.values()) {
            persistence.close();
        }
        indexPersistence.close();
        journalPersistence.close();

    }

    // For monitoring only

    public long flushedIndex() {
        return indexPersistence.flushed() / INDEX_STORAGE_SIZE;
    }

    public JournalPersistence getJournalPersistence() {
        return journalPersistence;
    }
    public JournalPersistence getIndexPersistence() {
        return indexPersistence;
    }

    public Map<Integer, JournalPersistence> getPartitionMap() {
        return Collections.unmodifiableMap(partitionMap);
    }


    public long maxOffset() {
        return journalPersistence.max();
    }

    public Map<Integer, Long> calcPartitionIndices(long journalOffset) {
        Map<Integer, Long> partitionIndices = new HashMap<>(partitionMap.size());
        for (Map.Entry<Integer, JournalPersistence> entry : partitionMap.entrySet()) {
            int partition = entry.getKey();
            JournalPersistence partitionPersistence = entry.getValue();
            long index = maxIndex(partition);
            while (--index >= minIndex(partition)) {
                long offset = readOffset(partitionPersistence, index);
                if (offset < journalOffset) {
                    break;
                }
            }

            partitionIndices.put(partition, index + 1);
        }
        return partitionIndices;
    }

    @Override
    public String toString() {
        return "Journal{" +
                "commitIndex=" + commitIndex +
                ", indexPersistence=" + indexPersistence +
                ", journalPersistence=" + journalPersistence +
                ", partitionMap=" + partitionMap +
                ", basePath=" + basePath +
                '}';
    }
}



