org.apache.kylin.engine.mr.steps.FactDistinctColumnsMapper

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.kylin.engine.mr.steps;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.hadoop.io.Text;
import org.apache.kylin.common.KylinVersion;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.MemoryBudgetController;
import org.apache.kylin.common.util.StringUtil;
import org.apache.kylin.cube.DimensionRangeInfo;
import org.apache.kylin.cube.cuboid.CuboidUtil;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.StatisticsDecisionUtil;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.measure.hllc.RegisterType;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

/**
 */
public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperBase<KEYIN> {

    private static final Logger logger = LoggerFactory.getLogger(FactDistinctColumnsMapper.class);

    public static enum RawDataCounter {
        BYTES
    }

    protected int nRowKey;
    private Integer[][] allCuboidsBitSet = null;
    private HLLCounter[] allCuboidsHLL = null;
    private Long[] cuboidIds;
    private int rowCount = 0;
    private int samplingPercentage;
    private ByteBuffer tmpbuf;
    
    private DictColDeduper dictColDeduper;
    private Map<Integer, DimensionRangeInfo> dimensionRangeInfoMap = Maps.newHashMap();

    private CuboidStatCalculator[] cuboidStatCalculators;

    private static final Text EMPTY_TEXT = new Text();

    private SelfDefineSortableKey sortableKey = new SelfDefineSortableKey();

    @Override
    protected void doSetup(Context context) throws IOException {
        super.doSetup(context);
        tmpbuf = ByteBuffer.allocate(4096);

        samplingPercentage = Integer
                .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
        nRowKey = cubeDesc.getRowkey().getRowKeyColumns().length;

        Set<Long> cuboidIdSet = Sets.newHashSet(cubeSeg.getCuboidScheduler().getAllCuboidIds());
        if (StatisticsDecisionUtil.isAbleToOptimizeCubingPlan(cubeSeg)) {
            // For cube planner, row count stats need to be calculated for every pre-built cuboid.
            // If the precondition for triggering cube planner phase one is satisfied, the mandatory cuboids must be included as well.
            cuboidIdSet.addAll(cubeSeg.getCubeDesc().getMandatoryCuboids());
        }
        cuboidIds = cuboidIdSet.toArray(new Long[cuboidIdSet.size()]);
        allCuboidsBitSet = CuboidUtil.getCuboidBitSet(cuboidIds, nRowKey);
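        // allCuboidsBitSet[k] holds, for cuboid k, the ordinals of the row key columns it covers.
        // Illustrative example (not from the source): with nRowKey = 3, cuboid id 0b110 covers
        // the first two row key columns, i.e. the bitset {0, 1}.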

        allCuboidsHLL = new HLLCounter[cuboidIds.length];
        for (int i = 0; i < cuboidIds.length; i++) {
            allCuboidsHLL[i] = new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision(), RegisterType.DENSE);
        }

        //for KYLIN-2518 backward compatibility
        boolean isUsePutRowKeyToHllNewAlgorithm;
        if (KylinVersion.isBefore200(cubeDesc.getVersion())) {
            isUsePutRowKeyToHllNewAlgorithm = false;
            logger.info("Found KylinVersion : {}. Use old algorithm for cuboid sampling.", cubeDesc.getVersion());
        } else {
            isUsePutRowKeyToHllNewAlgorithm = true;
            logger.info(
                    "Found KylinVersion : {}. Use new algorithm for cuboid sampling. About the details of the new algorithm, please refer to KYLIN-2518",
                    cubeDesc.getVersion());
        }
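        // In short: the old algorithm re-hashes every cuboid's column hashes with murmur3_32,
        // while the KYLIN-2518 algorithm derives one 64-bit hash per column with murmur3_128
        // and merely sums those longs per cuboid, which is far cheaper per row.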

        int calculatorNum = getStatsThreadNum(cuboidIds.length);
        cuboidStatCalculators = new CuboidStatCalculator[calculatorNum];
        int splitSize = cuboidIds.length / calculatorNum;
        if (splitSize <= 0) {
            splitSize = 1;
        }
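        // Illustrative example: 10 cuboids over 3 calculators with splitSize 3 gives the
        // ranges [0,3), [3,6) and [6,10); the last calculator absorbs the remainder.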
        for (int i = 0; i < calculatorNum; i++) {
            HLLCounter[] cuboidsHLLSplit;
            Integer[][] cuboidsBitSetSplit;
            Long[] cuboidIdSplit;
            int start = i * splitSize;
            if (start >= cuboidIds.length) {
                break;
            }
            int end = (i + 1) * splitSize;
            if (i == calculatorNum - 1) {// last split
                end = cuboidIds.length;
            }

            cuboidsHLLSplit = Arrays.copyOfRange(allCuboidsHLL, start, end);
            cuboidsBitSetSplit = Arrays.copyOfRange(allCuboidsBitSet, start, end);
            cuboidIdSplit = Arrays.copyOfRange(cuboidIds, start, end);
            CuboidStatCalculator calculator = new CuboidStatCalculator(i,
                    intermediateTableDesc.getRowKeyColumnIndexes(), cuboidIdSplit, cuboidsBitSetSplit,
                    isUsePutRowKeyToHllNewAlgorithm, cuboidsHLLSplit);
            cuboidStatCalculators[i] = calculator;
            calculator.start();
        }
        
        // setup dict col deduper
        dictColDeduper = new DictColDeduper();
        Set<TblColRef> dictCols = cubeDesc.getAllColumnsNeedDictionaryBuilt();
        for (int i = 0; i < allCols.size(); i++) {
            if (dictCols.contains(allCols.get(i)))
                dictColDeduper.setIsDictCol(i);
        }
    }

    private int getStatsThreadNum(int cuboidNum) {
        int unitNum = cubeDesc.getConfig().getCuboidNumberPerStatsCalculator();
        if (unitNum <= 0) {
            logger.warn("Config getCuboidNumberPerStatsCalculator() returned " + unitNum + ", which should be larger than 0");
            logger.info("Will use single thread for cuboid statistics calculation");
            return 1;
        }

        int maxCalculatorNum = cubeDesc.getConfig().getCuboidStatsCalculatorMaxNumber();
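        // (cuboidNum - 1) / unitNum + 1 is integer ceiling division, e.g. 10 cuboids
        // with unitNum = 4 yields 3 calculators, then capped at maxCalculatorNum.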
        int calculatorNum = (cuboidNum - 1) / unitNum + 1;
        if (calculatorNum > maxCalculatorNum) {
            calculatorNum = maxCalculatorNum;
        }
        return calculatorNum;
    }

    @Override
    public void doMap(KEYIN key, Object record, Context context) throws IOException, InterruptedException {
        Collection<String[]> rowCollection = flatTableInputFormat.parseMapperInput(record);

        for (String[] row : rowCollection) {
            context.getCounter(RawDataCounter.BYTES).increment(countSizeInBytes(row));
            for (int i = 0; i < allCols.size(); i++) {
                String fieldValue = row[columnIndex[i]];
                if (fieldValue == null)
                    continue;

                final DataType type = allCols.get(i).getType();

                //for dict columns, dedup before writing the value; for non-dict dimension columns, hold the min/max until doCleanup()
                if (dictColDeduper.isDictCol(i)) {
                    if (dictColDeduper.add(i, fieldValue)) {
                        writeFieldValue(context, type, i, fieldValue);
                    }
                } else {
                    DimensionRangeInfo old = dimensionRangeInfoMap.get(i);
                    if (old == null) {
                        old = new DimensionRangeInfo(fieldValue, fieldValue);
                        dimensionRangeInfoMap.put(i, old);
                    } else {
                        old.setMax(type.getOrder().max(old.getMax(), fieldValue));
                        old.setMin(type.getOrder().min(old.getMin(), fieldValue));
                    }
                }
            }

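            // statistical sampling: out of every 100 rows, the first samplingPercentage rows feed the HLL counters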
            if (rowCount % 100 < samplingPercentage) {
                putRowKeyToHLL(row);
            }
            
            if (rowCount % 100 == 0) {
                dictColDeduper.resetIfShortOfMem();
            }

            rowCount++;
        }
    }
    
    private void putRowKeyToHLL(String[] row) {
        for (CuboidStatCalculator cuboidStatCalculator : cuboidStatCalculators) {
            cuboidStatCalculator.putRow(row);
        }
    }

    private long countSizeInBytes(String[] row) {
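        // rough estimate for the BYTES counter: UTF-8 length per field (1 byte if null) plus one delimiter byte each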
        int size = 0;
        for (String s : row) {
            size += s == null ? 1 : StringUtil.utf8Length(s);
            size++; // delimiter
        }
        return size;
    }

    @Override
    protected void doCleanup(Context context) throws IOException, InterruptedException {
        ByteBuffer hllBuf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
        // output each cuboid's HLL to the reducer; the key is the HLL marker byte followed by the cuboid id
        for (CuboidStatCalculator cuboidStatCalculator : cuboidStatCalculators) {
            cuboidStatCalculator.waitForCompletion();
        }
        for (CuboidStatCalculator cuboidStatCalculator : cuboidStatCalculators) {
            Long[] cuboidIds = cuboidStatCalculator.getCuboidIds();
            HLLCounter[] cuboidsHLL = cuboidStatCalculator.getHLLCounters();
            HLLCounter hll;
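            // each emitted record: key = 1-byte HLL marker + 8-byte cuboid id,
            // value = the HLL register bank produced by writeRegisters()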

            for (int i = 0; i < cuboidIds.length; i++) {
                hll = cuboidsHLL[i];
                tmpbuf.clear();
                tmpbuf.put((byte) FactDistinctColumnsReducerMapping.MARK_FOR_HLL_COUNTER); // one byte
                tmpbuf.putLong(cuboidIds[i]);
                outputKey.set(tmpbuf.array(), 0, tmpbuf.position());
                hllBuf.clear();
                hll.writeRegisters(hllBuf);
                outputValue.set(hllBuf.array(), 0, hllBuf.position());
                sortableKey.init(outputKey, (byte) 0);
                context.write(sortableKey, outputValue);
            }
        }
        for (Integer colIndex : dimensionRangeInfoMap.keySet()) {
            DimensionRangeInfo rangeInfo = dimensionRangeInfoMap.get(colIndex);
            DataType dataType = allCols.get(colIndex).getType();
            writeFieldValue(context, dataType, colIndex, rangeInfo.getMin());
            writeFieldValue(context, dataType, colIndex, rangeInfo.getMax());
        }
    }

    private int countNewSize(int oldSize, int dataSize) {
        int newSize = oldSize * 2;
        while (newSize < dataSize) {
            newSize = newSize * 2;
        }
        return newSize;
    }

    private void writeFieldValue(Context context, DataType type, Integer colIndex, String value)
            throws IOException, InterruptedException {
        int reducerIndex = reducerMapping.getReducerIdForCol(colIndex, value);
        tmpbuf.clear();
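        // key layout: 1 byte of reducer index followed by the UTF-8 bytes of the value;
        // the buffer grows by doubling (countNewSize) whenever the value does not fit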
        byte[] valueBytes = Bytes.toBytes(value);
        int size = valueBytes.length + 1;
        if (size >= tmpbuf.capacity()) {
            tmpbuf = ByteBuffer.allocate(countNewSize(tmpbuf.capacity(), size));
        }
        tmpbuf.put(Bytes.toBytes(reducerIndex)[3]);
        tmpbuf.put(valueBytes);
        outputKey.set(tmpbuf.array(), 0, tmpbuf.position());
        sortableKey.init(outputKey, type);
        context.write(sortableKey, EMPTY_TEXT);
        // log a few rows for troubleshooting
        if (rowCount < 10) {
            logger.info("Sample output: " + allCols.get(colIndex) + " '" + value + "' => reducer " + reducerIndex);
        }
    }

    public static class CuboidStatCalculator implements Runnable {
        private final int id;
        private final int nRowKey;
        private final int[] rowkeyColIndex;
        private final Long[] cuboidIds;
        private final Integer[][] cuboidsBitSet;
        private volatile HLLCounter[] cuboidsHLL = null;

        //about details of the new algorithm, please see KYLIN-2518
        private final boolean isNewAlgorithm;
        private final HashFunction hf;
        private long[] rowHashCodesLong;

        private BlockingQueue<String[]> queue = new LinkedBlockingQueue<>(2000);
        private Thread workThread;
        private volatile boolean stop;

        public CuboidStatCalculator(int id, int[] rowkeyColIndex, Long[] cuboidIds, Integer[][] cuboidsBitSet,
                boolean isUsePutRowKeyToHllNewAlgorithm, HLLCounter[] cuboidsHLL) {
            this.id = id;
            this.nRowKey = rowkeyColIndex.length;
            this.rowkeyColIndex = rowkeyColIndex;
            this.cuboidIds = cuboidIds;
            this.cuboidsBitSet = cuboidsBitSet;
            this.isNewAlgorithm = isUsePutRowKeyToHllNewAlgorithm;
            if (!isNewAlgorithm) {
                this.hf = Hashing.murmur3_32();
            } else {
                rowHashCodesLong = new long[nRowKey];
                this.hf = Hashing.murmur3_128();
            }
            this.cuboidsHLL = cuboidsHLL;
            workThread = new Thread(this);
        }

        public void start() {
            logger.info("cuboid stats calculator:" + id + " started, handle cuboids number:" + cuboidIds.length);
            workThread.start();
        }

        public void putRow(final String[] row) {
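            // defensive copy: the mapper may reuse the row array before the worker thread gets to it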
            String[] copyRow = Arrays.copyOf(row, row.length);
            try {
                queue.put(copyRow);
            } catch (InterruptedException e) {
                logger.error("interrupt", e);
            }
        }

        public void waitForCompletion() {
            stop = true;
            try {
                workThread.join();
            } catch (InterruptedException e) {
                logger.error("interrupt", e);
            }
        }

        private void putRowKeyToHLLOld(String[] row) {
            //generate hash for each row key column
            byte[][] rowHashCodes = new byte[nRowKey][];
            for (int i = 0; i < nRowKey; i++) {
                Hasher hc = hf.newHasher();
                String colValue = row[rowkeyColIndex[i]];
                if (colValue != null) {
                    rowHashCodes[i] = hc.putString(colValue).hash().asBytes();
                } else {
                    rowHashCodes[i] = hc.putInt(0).hash().asBytes();
                }
            }

            // use the row key column hashes to get a consolidated hash for each cuboid
            for (int i = 0, n = cuboidsBitSet.length; i < n; i++) {
                Hasher hc = hf.newHasher();
                for (int position = 0; position < cuboidsBitSet[i].length; position++) {
                    hc.putBytes(rowHashCodes[cuboidsBitSet[i][position]]);
                }

                cuboidsHLL[i].add(hc.hash().asBytes());
            }
        }

        private void putRowKeyToHLLNew(String[] row) {
            //generate hash for each row key column
            for (int i = 0; i < nRowKey; i++) {
                Hasher hc = hf.newHasher();
                String colValue = row[rowkeyColIndex[i]];
                if (colValue == null)
                    colValue = "0";
                byte[] bytes = hc.putString(colValue).hash().asBytes();
                rowHashCodesLong[i] = (Bytes.toLong(bytes) + i);//add column ordinal to the hash value to distinguish between (a,b) and (b,a)
            }

            // use the row key column hashes to get a consolidated hash for each cuboid
            for (int i = 0, n = cuboidsBitSet.length; i < n; i++) {
                long value = 0;
                for (int position = 0; position < cuboidsBitSet[i].length; position++) {
                    value += rowHashCodesLong[cuboidsBitSet[i][position]];
                }
                cuboidsHLL[i].addHashDirectly(value);
            }
        }

        public HLLCounter[] getHLLCounters() {
            return cuboidsHLL;
        }

        public Long[] getCuboidIds() {
            return cuboidIds;
        }

        @Override
        public void run() {
            while (true) {
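                // drain the queue; exit only once waitForCompletion() has set stop AND the queue is empty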
                String[] row = queue.poll();
                if (row == null && stop) {
                    logger.info("cuboid stats calculator:" + id + " completed.");
                    break;
                } else if (row == null) {
                    Thread.yield();
                    continue;
                }
                if (isNewAlgorithm) {
                    putRowKeyToHLLNew(row);
                } else {
                    putRowKeyToHLLOld(row);
                }
            }
        }
    }
    
    public static class DictColDeduper {

        final boolean enabled;
        final int resetThresholdMB;
        final Map<Integer, Set<String>> colValueSets = Maps.newHashMap();
        
        public DictColDeduper() {
            this(200, 100);
        }
        
        public DictColDeduper(int enableThresholdMB, int resetThresholdMB) {
            // only enable when there is sufficient memory
            this.enabled = MemoryBudgetController.getSystemAvailMB() >= enableThresholdMB;
            this.resetThresholdMB = resetThresholdMB;
        }
        
        public void setIsDictCol(int i) {
            colValueSets.put(i, new HashSet<String>());
        }
        
        public boolean isDictCol(int i) {
            return colValueSets.containsKey(i);
        }

        public boolean add(int i, String fieldValue) {
            return colValueSets.get(i).add(fieldValue);
        }
        
        public Set<String> getValueSet(int i) {
            return colValueSets.get(i);
        }

        public void resetIfShortOfMem() {
            if (MemoryBudgetController.getSystemAvailMB() < resetThresholdMB) {
                for (Set<String> set : colValueSets.values())
                    set.clear();
            }
        }
        
    }
}
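
Below is a minimal, self-contained sketch (not part of the original file) of the per-row hashing scheme used by putRowKeyToHLLNew, so the KYLIN-2518 sampling idea can be tried in isolation. It assumes Guava and Kylin's HLLCounter are on the classpath; the row values and the cuboid layout are invented for illustration.

import java.nio.charset.StandardCharsets;

import org.apache.kylin.measure.hllc.HLLCounter;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.primitives.Longs;

public class CuboidSamplingSketch {
    public static void main(String[] args) {
        HashFunction hf = Hashing.murmur3_128();
        String[] row = { "2012-01-01", "US", "seller-42" }; // hypothetical row key values
        Integer[] cuboidBitSet = { 0, 1 };                  // a cuboid covering columns 0 and 1

        // one 64-bit hash per row key column; "+ i" distinguishes (a,b) from (b,a)
        long[] colHashes = new long[row.length];
        for (int i = 0; i < row.length; i++) {
            byte[] bytes = hf.newHasher().putString(row[i], StandardCharsets.UTF_8).hash().asBytes();
            colHashes[i] = Longs.fromByteArray(bytes) + i;
        }

        // the cuboid's hash is simply the sum of its columns' hashes
        long cuboidHash = 0;
        for (int position : cuboidBitSet) {
            cuboidHash += colHashes[position];
        }

        HLLCounter hll = new HLLCounter(14); // 14-bit precision, a common Kylin default
        hll.addHashDirectly(cuboidHash);
        System.out.println("estimated distinct rows: " + hll.getCountEstimate());
    }
}

Feeding every sampled row through the same steps, one sum per cuboid, reproduces what each CuboidStatCalculator worker does in its queue-draining loop.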