org.elasticsearch.compute.aggregation.blockhash.BlockHash Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of x-pack-esql-compute Show documentation
Elasticsearch subproject :x-pack:plugin:esql:compute
There is a newer version: 8.16.1
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BitArray;
import org.elasticsearch.common.util.BytesRefHash;
import org.elasticsearch.common.util.LongHash;
import org.elasticsearch.common.util.LongLongHash;
import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
import org.elasticsearch.compute.aggregation.SeenGroupIds;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.ElementType;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.ReleasableIterator;

import java.util.Iterator;
import java.util.List;

/**
 * A specialized hash table implementation that maps values of a {@link Block} to ids (in longs).
 * This class delegates to {@link LongHash} or {@link BytesRefHash}.
 *
 * @see LongHash
 * @see BytesRefHash
 */
public abstract sealed class BlockHash implements Releasable, SeenGroupIds //
    permits BooleanBlockHash, BytesRefBlockHash, DoubleBlockHash, IntBlockHash, LongBlockHash, BytesRef3BlockHash, //
    NullBlockHash, PackedValuesBlockHash, BytesRefLongBlockHash, LongLongBlockHash, TimeSeriesBlockHash {

    /** Factory made available to subclasses; supplied by the caller of the constructor. */
    protected final BlockFactory blockFactory;

    BlockHash(BlockFactory blockFactory) {
        this.blockFactory = blockFactory;
    }

    /**
     * Add all values for the "group by" columns in the page to the hash and
     * pass the ordinals to the provided {@link GroupingAggregatorFunction.AddInput}.
     */
    public abstract void add(Page page, GroupingAggregatorFunction.AddInput addInput);

    /**
     * Lookup all values for the "group by" columns in the page to the hash and return an
     * {@link Iterator} of the values. The sum of {@link IntBlock#getPositionCount} for
     * all blocks returned by the iterator will equal {@link Page#getPositionCount} but
     * will "target" a size of {@code targetBlockSize}.
     * <p>
     *     The returned {@link ReleasableIterator} may retain a reference to {@link Block}s
     *     inside the {@link Page}. Close it to release those references.
     * </p>
     */
    public abstract ReleasableIterator<IntBlock> lookup(Page page, ByteSizeValue targetBlockSize);

    /**
     * Returns the {@link Block}s that contain all the keys that are inserted by {@link #add}.
     */
    public abstract Block[] getKeys();

    /**
     * The grouping ids that are not empty. We use this because some block hashes reserve
     * space for grouping ids and then don't end up using them. For example,
     * {@link BooleanBlockHash} does this by always assigning {@code false} to {@code 0}
     * and {@code true} to {@code 1}. It's only after collection when we
     * know if there actually were any {@code true} or {@code false} values received.
     */
    public abstract IntVector nonEmpty();

    // TODO merge with nonEmpty
    @Override
    public abstract BitArray seenGroupIds(BigArrays bigArrays);

    /**
     * Describes one "group by" column handed to {@link #build}.
     *
     * @param channel     the channel of the column; forwarded as-is to the hash implementations
     *                    (presumably the index of the block inside the {@link Page} - confirm against callers)
     * @param elementType the element type of the column, used to pick the hash implementation
     */
    public record GroupSpec(int channel, ElementType elementType) {}

    /**
     * Creates a specialized hash table that maps one or more {@link Block}s to ids.
     * <p>
     *     The implementation is chosen in this order: a single group gets the hash specialized
     *     for its element type; three {@link ElementType#BYTES_REF} groups get a
     *     {@link BytesRef3BlockHash}; two groups made of {@link ElementType#LONG} and/or
     *     {@link ElementType#BYTES_REF} (other than two {@code BYTES_REF}s) get a dedicated hash,
     *     but only when {@code allowBrokenOptimizations} is set; everything else falls back to
     *     {@link PackedValuesBlockHash}.
     * </p>
     * @param groups the columns to group by
     * @param blockFactory factory handed to the created hash
     * @param emitBatchSize maximum batch size to be emitted when handling combinatorial
     *                      explosion of groups caused by multivalued fields
     * @param allowBrokenOptimizations true to allow optimizations with bad null handling. We will fix their
     *                                 null handling and remove this flag, but we need to disable these in
     *                                 production until we can. And this lets us continue to compile and
     *                                 test them.
     */
    public static BlockHash build(List<GroupSpec> groups, BlockFactory blockFactory, int emitBatchSize, boolean allowBrokenOptimizations) {
        if (groups.size() == 1) {
            return newForElementType(groups.get(0).channel(), groups.get(0).elementType(), blockFactory);
        }
        if (groups.size() == 3 && groups.stream().allMatch(g -> g.elementType() == ElementType.BYTES_REF)) {
            return new BytesRef3BlockHash(
                blockFactory,
                groups.get(0).channel(),
                groups.get(1).channel(),
                groups.get(2).channel(),
                emitBatchSize
            );
        }
        if (allowBrokenOptimizations && groups.size() == 2) {
            var g1 = groups.get(0);
            var g2 = groups.get(1);
            if (g1.elementType() == ElementType.LONG && g2.elementType() == ElementType.LONG) {
                return new LongLongBlockHash(blockFactory, g1.channel(), g2.channel(), emitBatchSize);
            }
            if (g1.elementType() == ElementType.BYTES_REF && g2.elementType() == ElementType.LONG) {
                return new BytesRefLongBlockHash(blockFactory, g1.channel(), g2.channel(), false, emitBatchSize);
            }
            if (g1.elementType() == ElementType.LONG && g2.elementType() == ElementType.BYTES_REF) {
                // The channels are passed bytes-ref first, so they are swapped here and the flag is set to true.
                // NOTE(review): the flag presumably asks the hash to emit keys in the original order - confirm.
                return new BytesRefLongBlockHash(blockFactory, g2.channel(), g1.channel(), true, emitBatchSize);
            }
        }
        return new PackedValuesBlockHash(groups, blockFactory, emitBatchSize);
    }

    /**
     * Temporary method to build a {@link PackedValuesBlockHash}.
     *
     * @param groups the columns to group by
     * @param blockFactory factory handed to the created hash
     * @param emitBatchSize forwarded unchanged to the {@link PackedValuesBlockHash} constructor
     */
    public static BlockHash buildPackedValuesBlockHash(List<GroupSpec> groups, BlockFactory blockFactory, int emitBatchSize) {
        return new PackedValuesBlockHash(groups, blockFactory, emitBatchSize);
    }

    /**
     * Creates a specialized hash table that maps a {@link Block} of the given input element type to ids.
     *
     * @throws IllegalArgumentException if there is no hash implementation for {@code type}
     */
    private static BlockHash newForElementType(int channel, ElementType type, BlockFactory blockFactory) {
        return switch (type) {
            case NULL -> new NullBlockHash(channel, blockFactory);
            case BOOLEAN -> new BooleanBlockHash(channel, blockFactory);
            case INT -> new IntBlockHash(channel, blockFactory);
            case LONG -> new LongBlockHash(channel, blockFactory);
            case DOUBLE -> new DoubleBlockHash(channel, blockFactory);
            case BYTES_REF -> new BytesRefBlockHash(channel, blockFactory);
            default -> throw new IllegalArgumentException("unsupported grouping element type [" + type + "]");
        };
    }

    /**
     * Convert the result of calling {@link LongHash} or {@link LongLongHash}
     * or {@link BytesRefHash} or similar to a group ordinal. These hashes
     * return negative numbers if the value that was added has already been
     * seen. We don't use that and convert it back to the positive ord.
     *
     * @param ord the value returned by the hash; a negative value {@code -1 - n} stands for
     *            the already-seen ordinal {@code n}
     * @return the non-negative group ordinal
     */
    public static long hashOrdToGroup(long ord) {
        if (ord < 0) { // already seen
            return -1 - ord;
        }
        return ord;
    }

    /**
     * Convert the result of calling {@link LongHash} or {@link LongLongHash}
     * or {@link BytesRefHash} or similar to a group ordinal, reserving {@code 0}
     * for null.
     *
     * @param ord the value returned by the hash, negative if the value was already seen
     * @return the group ordinal shifted up by one so that {@code 0} stays free for null
     */
    public static long hashOrdToGroupNullReserved(long ord) {
        return hashOrdToGroup(ord) + 1;
    }
}