com.hazelcast.jet.sql.impl.connector.map.MapIndexScanP Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hazelcast-sql Show documentation
SQL Parser and Optimizer
There is a newer version: 5.5.0
/*
 * Copyright 2021 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.sql.impl.connector.map;

import com.hazelcast.cluster.Address;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.HazelcastInstanceNotActiveException;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.internal.iteration.IndexIterationPointer;
import com.hazelcast.internal.partition.InternalPartitionService;
import com.hazelcast.internal.serialization.InternalSerializationService;
import com.hazelcast.internal.util.collection.PartitionIdSet;
import com.hazelcast.jet.core.AbstractProcessor;
import com.hazelcast.jet.core.Processor;
import com.hazelcast.jet.core.ProcessorSupplier;
import com.hazelcast.jet.impl.connector.AbstractIndexReader;
import com.hazelcast.jet.sql.impl.ExpressionUtil;
import com.hazelcast.map.impl.operation.MapFetchIndexOperation;
import com.hazelcast.map.impl.operation.MapFetchIndexOperation.MapFetchIndexOperationResult;
import com.hazelcast.map.impl.operation.MapFetchIndexOperation.MissingPartitionException;
import com.hazelcast.map.impl.operation.MapOperationProvider;
import com.hazelcast.map.impl.proxy.MapProxyImpl;
import com.hazelcast.nio.ObjectDataInput;
import com.hazelcast.nio.ObjectDataOutput;
import com.hazelcast.nio.serialization.DataSerializable;
import com.hazelcast.query.impl.QueryableEntry;
import com.hazelcast.query.impl.getters.Extractors;
import com.hazelcast.security.permission.MapPermission;
import com.hazelcast.spi.exception.TargetDisconnectedException;
import com.hazelcast.spi.exception.TargetNotMemberException;
import com.hazelcast.spi.exception.WrongTargetException;
import com.hazelcast.spi.impl.InternalCompletableFuture;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.sql.impl.exec.scan.MapIndexScanMetadata;
import com.hazelcast.sql.impl.exec.scan.MapScanRow;
import com.hazelcast.sql.impl.exec.scan.index.IndexFilter;
import com.hazelcast.sql.impl.expression.ExpressionEvalContext;
import com.hazelcast.sql.impl.row.JetSqlRow;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.security.Permission;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.stream.IntStream;

import static com.hazelcast.jet.impl.util.Util.getNodeEngine;
import static com.hazelcast.query.impl.getters.GetterCache.SIMPLE_GETTER_CACHE_SUPPLIER;
import static com.hazelcast.security.permission.ActionConstants.ACTION_CREATE;
import static com.hazelcast.security.permission.ActionConstants.ACTION_READ;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.stream.Collectors.toList;

/**
 * Implementation of a migration-tolerant IMap index scan processor.
 * <p>
 * The key component of the migration-tolerant index scan is the so-called
 * 'split'. It is the basic unit of execution, attached to a concrete member;
 * during normal execution there is exactly one split per processor.
 * <p>
 * The scan algorithm initially assumes that all assigned partitions are local.
 * The processor sends a {@link MapFetchIndexOperation} to the local member
 * during normal execution, and, after a migration, to all members which own
 * partitions assigned to the processor. After a response is received, the
 * processor stores the new pointers returned with it, emits the received
 * rows to the processor's outbox, and sends another operation with the new
 * pointers.
 * <p>
 * During a partition migration the processor divides the partitions of the
 * original split into disjoint parts, one per owner, and retries with the new
 * owners. Once all partitions in a split have been read, the split is removed
 * from execution.
 */
final class MapIndexScanP extends AbstractProcessor {

    /**
     * Delay (100 ms, expressed in nanoseconds) before retrying a split whose
     * re-splitting produced the very same owner and partition set.
     */
    private static final long DELAY_AFTER_MISSING_PARTITION = MILLISECONDS.toNanos(100);

    /** Description of the scan: map, index, filters, projection and comparator. */
    private final MapIndexScanMetadata metadata;

    // The following fields are assigned in init().
    private HazelcastInstance hazelcastInstance;
    private ExpressionEvalContext evalContext;
    /** Issues the fetch operations and converts their results into entries. */
    private AbstractIndexReader<MapFetchIndexOperationResult, QueryableEntry<?, ?>> reader;

    /** Splits still being read; finished or re-split ones are removed. */
    private final List<Split> splits = new ArrayList<>();
    /** Reusable row that entries are loaded into before projection. */
    private MapScanRow row;
    /** Row selected by the sorted merge that is yet to be accepted by the outbox. */
    private JetSqlRow pendingItem;
    /** {@code true} if the metadata supplies a comparator, i.e. output must be merge-sorted. */
    private boolean isIndexSorted;

    /**
     * Creates the processor; all runtime state is set up later in {@code init}.
     *
     * @param indexScanMetadata description of the index scan to execute
     */
    private MapIndexScanP(@Nonnull MapIndexScanMetadata indexScanMetadata) {
        this.metadata = indexScanMetadata;
    }

    /**
     * Captures the runtime context, creates the index reader, registers the
     * initial split covering all partitions assigned to this processor and
     * prepares the reusable scan row.
     *
     * @param context the processor context supplied by the engine
     */
    @Override
    protected void init(@Nonnull Context context) {
        hazelcastInstance = context.hazelcastInstance();
        evalContext = ExpressionEvalContext.from(context);
        reader = new LocalMapIndexReader(hazelcastInstance, evalContext.getSerializationService(), metadata);

        // The initial split targets the local member with every partition of this processor.
        int[] memberPartitions = context.processorPartitions();
        int partitionCount = hazelcastInstance.getPartitionService().getPartitions().size();
        PartitionIdSet assignedPartitions = new PartitionIdSet(partitionCount, memberPartitions);
        Address localAddress = hazelcastInstance.getCluster().getLocalMember().getAddress();
        IndexIterationPointer[] initialPointers =
                filtersToPointers(metadata.getFilter(), metadata.isDescending(), evalContext);
        splits.add(new Split(assignedPartitions, localAddress, initialPointers));

        Extractors extractors = Extractors.newBuilder(evalContext.getSerializationService())
                .setGetterCacheSupplier(SIMPLE_GETTER_CACHE_SUPPLIER)
                .build();
        row = MapScanRow.create(
                metadata.getKeyDescriptor(),
                metadata.getValueDescriptor(),
                metadata.getFieldPaths(),
                metadata.getFieldTypes(),
                extractors,
                evalContext.getSerializationService()
        );

        // A comparator in the metadata means the output has to be merge-sorted.
        isIndexSorted = metadata.getComparator() != null;
    }

    /**
     * Converts an index filter into iteration pointers by delegating to
     * {@link IndexIterationPointer#createFromIndexFilter}.
     *
     * @param filter      the index filter to convert
     * @param descending  passed through to the pointer factory unchanged
     * @param evalContext context passed through to the pointer factory
     * @return the pointers produced by the factory
     */
    private static IndexIterationPointer[] filtersToPointers(
            @Nonnull IndexFilter filter,
            boolean descending,
            ExpressionEvalContext evalContext
    ) {
        return IndexIterationPointer.createFromIndexFilter(filter, descending, evalContext);
    }

    /**
     * Runs one step of the scan, choosing the merge-sorting variant when the
     * index is sorted and the unordered variant otherwise.
     *
     * @return {@code true} once the scan has finished
     */
    @Override
    public boolean complete() {
        if (isIndexSorted) {
            return runSortedIndex();
        }
        return runHashIndex();
    }

    /**
     * Always reports {@code true}.
     */
    @Override
    public boolean closeIsCooperative() {
        return true;
    }

    /**
     * Merge-sorts the rows of all splits: repeatedly picks the row that the
     * metadata comparator orders first among the heads of all splits and
     * emits it.
     *
     * @return {@code true} when every split has been fully read; {@code false}
     *         when the outbox refused a row or some split is still waiting
     *         for data
     */
    private boolean runSortedIndex() {
        while (true) {
            // Flush the row chosen in the previous round before selecting another one.
            if (pendingItem != null) {
                if (!tryEmit(pendingItem)) {
                    return false;
                }
                pendingItem = null;
            }

            int bestIndex = -1;
            JetSqlRow bestRow = null;
            int i = 0;
            while (i < splits.size()) {
                Split candidate = splits.get(i);
                try {
                    candidate.peek();
                } catch (MissingPartitionException e) {
                    // Replace the failed split with its re-split parts (appended at the end);
                    // the element now at position i has not been visited yet.
                    splits.addAll(splitOnMigration(candidate));
                    splits.remove(i);
                    continue;
                }

                JetSqlRow head = candidate.currentRow;
                if (head == null) {
                    if (!candidate.done()) {
                        // This split is waiting for more rows; we can't decide the order yet.
                        return false;
                    }
                    // Nothing more to read, drop the finished split.
                    splits.remove(i);
                    continue;
                }

                if (bestIndex < 0 || metadata.getComparator().compare(head, bestRow) < 0) {
                    bestIndex = i;
                    bestRow = head;
                }
                i++;
            }

            if (bestIndex < 0) {
                assert splits.isEmpty();
                return true;
            }

            pendingItem = bestRow;
            splits.get(bestIndex).remove();
        }
    }

    /**
     * Emits rows from all splits without any ordering. Keeps cycling over the
     * splits for as long as at least one of them yielded a row in the last pass.
     *
     * @return {@code true} when the last split has been fully read and removed;
     *         {@code false} when the outbox refused a row or no split had a
     *         row available
     */
    private boolean runHashIndex() {
        boolean emittedInPass;
        do {
            emittedInPass = false;
            int i = 0;
            while (i < splits.size()) {
                Split current = splits.get(i);
                try {
                    current.peek();
                } catch (MissingPartitionException e) {
                    // Replace the failed split with its re-split parts (appended at the end);
                    // the element now at position i has not been visited yet.
                    splits.addAll(splitOnMigration(current));
                    splits.remove(i);
                    continue;
                }

                if (current.currentRow != null) {
                    emittedInPass = true;
                    if (!tryEmit(current.currentRow)) {
                        return false;
                    }
                    current.remove();
                    i++;
                } else if (current.done()) {
                    // Nothing more to read, drop the finished split.
                    splits.remove(i);
                    if (splits.isEmpty()) {
                        return true;
                    }
                } else {
                    // No row available right now; move on to the next split.
                    i++;
                }
            }
        } while (emittedInPass);

        return false;
    }

    /**
     * Re-splits a {@link Split} after a {@link MissingPartitionException}
     * (which is also used to signal cluster topology exceptions such as
     * {@link MemberLeftException}).
     * <p>
     * The method reads the current partition table and groups the partitions
     * of the given split by their current owner, producing one new split per
     * owner. Every new split continues from the pointers of the original one.
     * <p>
     * If the result is a single split identical to the input (same owner and
     * partitions), its next attempt is postponed by
     * {@link #DELAY_AFTER_MISSING_PARTITION}.
     *
     * @param split the split to split
     * @return the new splits; their partition sets are disjoint
     */
    private List<Split> splitOnMigration(Split split) {
        IndexIterationPointer[] lastPointers = split.pointers;
        InternalPartitionService partitionService = getNodeEngine(hazelcastInstance).getPartitionService();
        Map<Address, Split> newSplits = new HashMap<>();
        PrimitiveIterator.OfInt partitionIterator = split.partitions.intIterator();

        while (partitionIterator.hasNext()) {
            int partitionId = partitionIterator.nextInt();
            // If the partition owner is not assigned, fall back to the owner of the
            // original split. If that guess is wrong, the next fetch fails with a
            // topology exception and this method is called again.
            // The owner is looked up exactly once: a second lookup could observe a
            // different (possibly null) owner than the one that was null-checked.
            Address currentOwner = partitionService.getPartition(partitionId).getOwnerOrNull();
            Address owner = currentOwner != null ? currentOwner : split.owner;

            newSplits.computeIfAbsent(owner, newOwner -> new Split(
                    new PartitionIdSet(partitionService.getPartitionCount()), newOwner, lastPointers)
            ).partitions.add(partitionId);
        }

        List<Split> res = new ArrayList<>(newSplits.values());
        // if the resulting split is the same as the input split, postpone retrying it
        if (res.size() == 1) {
            Split newSplit = res.get(0);
            if (newSplit.owner.equals(split.owner) && newSplit.partitions.equals(split.partitions)) {
                newSplit.postponeUntil(System.nanoTime() + DELAY_AFTER_MISSING_PARTITION);
            }
        }
        return res;
    }

    /**
     * Basic unit of index scan execution bounded to concrete member and partitions set.
     * Can be split into smaller splits after migration is detected.
     */
    private final class Split {
        private final PartitionIdSet partitions;
        private final Address owner;
        private IndexIterationPointer[] pointers;
        private List> currentBatch = emptyList();
        private JetSqlRow currentRow;
        private int currentBatchPosition;
        private CompletableFuture future;
        private long postponeUntil = Long.MIN_VALUE;

        private Split(PartitionIdSet partitions, Address owner, IndexIterationPointer[] pointers) {
            this.partitions = partitions;
            this.owner = owner;
            this.pointers = pointers;
            this.currentBatchPosition = 0;
            this.future = null;
        }

        private void postponeUntil(long postponeUntil) {
            this.postponeUntil = postponeUntil;
        }

        /**
         * After this call, {@link #currentRow} will return the next entry to emit.
         * They are set to null, if there's no row available
         * because we're either done or waiting for more data.
         */
        private void peek() {
            if (postponeUntil > Long.MIN_VALUE) {
                if (System.nanoTime() < postponeUntil) {
                    return;
                } else {
                    postponeUntil = Long.MIN_VALUE;
                }
            }

            // start a new async call, if we're not done and one isn't in flight
            if (future == null && pointers.length > 0) {
                future = reader.readBatch(owner, partitions, pointers);
            }

            // get the next batch from the future, if the current batch is done and one is in flight
            if (currentBatch.size() == currentBatchPosition && future != null && future.isDone()) {
                MapFetchIndexOperationResult result;
                try {
                    result = reader.toBatchResult(future);
                } catch (ExecutionException e) {
                    // unwrap the MissingPartitionException, and wrap other exceptions as MPE
                    Throwable t = findTopologyExceptionInCauses(e);
                    if (t != null) {
                        throw new MissingPartitionException(t.toString(), e);
                    }
                    throw new RuntimeException(e);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
                currentBatch = reader.toRecordSet(result);
                currentBatchPosition = 0;
                pointers = result.getPointers();
                future = null;
            }

            // project the next item
            while (currentRow == null && currentBatchPosition < currentBatch.size()) {
                // Sometimes scan query may not include indexed field.
                // So, additional projection is required to ability to merge-sort an output.
                currentRow = projectAndFilter(currentBatch.get(currentBatchPosition));
                if (currentRow == null) {
                    currentBatchPosition++;
                }
            }
        }

        /**
         * Returns a topology exception from the given Throwable or its causes.
         * Topology exception are those related to a member leaving the cluster.
         * We handle them the same as {@link MissingPartitionException} trigger.
         */
        @SuppressWarnings("BooleanExpressionComplexity")
        private Throwable findTopologyExceptionInCauses(Throwable t) {
            while (t != null) {
                if (t instanceof MissingPartitionException
                        || t instanceof HazelcastInstanceNotActiveException
                        || t instanceof MemberLeftException
                        || t instanceof TargetDisconnectedException
                        || t instanceof TargetNotMemberException
                        || t instanceof WrongTargetException
                ) {
                    return t;
                }
                t = t.getCause();
            }
            return null;
        }

        private JetSqlRow projectAndFilter(@Nonnull QueryableEntry entry) {
            row.setKeyValue(
                    entry.getKeyIfPresent(), entry.getKeyDataIfPresent(),
                    entry.getValueIfPresent(), entry.getValueDataIfPresent()
            );
            return ExpressionUtil.evaluate(metadata.getRemainingFilter(), metadata.getProjection(), row, evalContext);
        }

        private void remove() {
            currentBatchPosition++;
            currentRow = null;
        }

        private boolean done() {
            return currentBatchPosition == currentBatch.size() && pointers.length == 0;
        }

        @Override
        public String toString() {
            return "Split{" +
                    "partitions=" + partitions +
                    ", owner=" + owner +
                    ", hash=" + System.identityHashCode(this) +
                    '}';
        }
    }

    private static final class LocalMapIndexReader
            extends AbstractIndexReader> {

        static final int FETCH_SIZE_HINT = 128;

        private final HazelcastInstance hazelcastInstance;
        private final String indexName;

        private LocalMapIndexReader(
                @Nonnull HazelcastInstance hzInstance,
                @Nonnull InternalSerializationService serializationService,
                @Nonnull MapIndexScanMetadata indexScanMetadata
        ) {
            super(indexScanMetadata.getMapName(), MapFetchIndexOperationResult::getEntries);

            this.hazelcastInstance = hzInstance;
            this.indexName = indexScanMetadata.getIndexName();
            this.serializationService = serializationService;
        }

        @Override
        @Nonnull
        public InternalCompletableFuture readBatch(
                Address address,
                PartitionIdSet partitions,
                IndexIterationPointer[] pointers
        ) {
            MapProxyImpl mapProxyImpl = (MapProxyImpl) hazelcastInstance.getMap(objectName);
            MapOperationProvider operationProvider = mapProxyImpl.getOperationProvider();
            Operation op = operationProvider.createFetchIndexOperation(
                    mapProxyImpl.getName(),
                    indexName,
                    pointers,
                    partitions,
                    FETCH_SIZE_HINT
            );
            return mapProxyImpl.getOperationService().invokeOnTarget(mapProxyImpl.getServiceName(), op, address);
        }
    }

    /**
     * Creates the processor supplier for an index scan described by the given metadata.
     *
     * @param indexScanMetadata description of the index scan
     * @return a new {@code MapIndexScanProcessorSupplier} wrapping the metadata
     */
    static ProcessorSupplier readMapIndexSupplier(MapIndexScanMetadata indexScanMetadata) {
        return new MapIndexScanProcessorSupplier(indexScanMetadata);
    }

    /**
     * Serializable supplier of {@link MapIndexScanP} processors; its only
     * state is the scan metadata.
     */
    private static final class MapIndexScanProcessorSupplier implements ProcessorSupplier, DataSerializable {

        private MapIndexScanMetadata metadata;

        /** No-arg constructor; the metadata is then populated by {@link #readData}. */
        @SuppressWarnings("unused")
        private MapIndexScanProcessorSupplier() {
        }

        private MapIndexScanProcessorSupplier(@Nonnull MapIndexScanMetadata metadata) {
            this.metadata = metadata;
        }

        /**
         * Creates {@code count} processors, all sharing the same metadata.
         */
        @Override
        @Nonnull
        public List<Processor> get(int count) {
            return IntStream.range(0, count)
                    .mapToObj(i -> new MapIndexScanP(metadata))
                    .collect(toList());
        }

        /**
         * Returns a single map permission for the scanned map with the create and read actions.
         */
        @Override
        public List<Permission> permissions() {
            return singletonList(new MapPermission(metadata.getMapName(), ACTION_CREATE, ACTION_READ));
        }

        @Override
        public void writeData(ObjectDataOutput out) throws IOException {
            out.writeObject(metadata);
        }

        @Override
        public void readData(ObjectDataInput in) throws IOException {
            metadata = in.readObject();
        }
    }
}