/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.engine.mr.common;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.persistence.RawResource;
import org.apache.kylin.common.persistence.ResourceStore;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.SumHelper;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.cuboid.Cuboid;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
import org.apache.kylin.cube.kv.CubeDimEncMap;
import org.apache.kylin.cube.kv.RowKeyEncoder;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.measure.topn.TopNMeasureType;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
 * This should be in the cube module. It lives in engine-mr because the stats are currently
 * saved as sequence files, which brings in a Hadoop dependency.
 */
public class CubeStatsReader {
private static final Logger logger = LoggerFactory.getLogger(CubeStatsReader.class);
final CubeSegment seg;
final int samplingPercentage;
final int mapperNumberOfFirstBuild; // becomes meaningless after merge
final double mapperOverlapRatioOfFirstBuild; // becomes meaningless after merge
final Map<Long, HLLCounter> cuboidRowEstimatesHLL;
final CuboidScheduler cuboidScheduler;
final long sourceRowCount;
public CubeStatsReader(CubeSegment cubeSegment, KylinConfig kylinConfig) throws IOException {
this(cubeSegment, cubeSegment.getCuboidScheduler(), kylinConfig);
}
/**
 * @param cuboidScheduler if it is null, some of this reader's functionality will not be supported
 */
public CubeStatsReader(CubeSegment cubeSegment, CuboidScheduler cuboidScheduler, KylinConfig kylinConfig)
throws IOException {
ResourceStore store = ResourceStore.getStore(kylinConfig);
String statsKey = cubeSegment.getStatisticsResourcePath();
RawResource resource = store.getResource(statsKey);
if (resource == null)
throw new IllegalStateException("Missing resource at " + statsKey);
File tmpSeqFile = writeTmpSeqFile(resource.content());
Path path = new Path(HadoopUtil.fixWindowsPath("file://" + tmpSeqFile.getAbsolutePath()));
CubeStatsResult cubeStatsResult = new CubeStatsResult(path, kylinConfig.getCubeStatsHLLPrecision());
tmpSeqFile.delete();
this.seg = cubeSegment;
this.cuboidScheduler = cuboidScheduler;
this.samplingPercentage = cubeStatsResult.getPercentage();
this.mapperNumberOfFirstBuild = cubeStatsResult.getMapperNumber();
this.mapperOverlapRatioOfFirstBuild = cubeStatsResult.getMapperOverlapRatio();
this.cuboidRowEstimatesHLL = cubeStatsResult.getCounterMap();
this.sourceRowCount = cubeStatsResult.getSourceRecordCount();
}
/**
 * Reads statistics from the given path rather than from the segment's own statistics resource.
 * Since the statistics come from an external path, the cuboid scheduler should be provided by the caller.
 */
public CubeStatsReader(CubeSegment cubeSegment, CuboidScheduler cuboidScheduler, KylinConfig kylinConfig, Path path)
throws IOException {
CubeStatsResult cubeStatsResult = new CubeStatsResult(path, kylinConfig.getCubeStatsHLLPrecision());
this.seg = cubeSegment;
this.cuboidScheduler = cuboidScheduler;
this.samplingPercentage = cubeStatsResult.getPercentage();
this.mapperNumberOfFirstBuild = cubeStatsResult.getMapperNumber();
this.mapperOverlapRatioOfFirstBuild = cubeStatsResult.getMapperOverlapRatio();
this.cuboidRowEstimatesHLL = cubeStatsResult.getCounterMap();
this.sourceRowCount = cubeStatsResult.getSourceRecordCount();
}
private File writeTmpSeqFile(InputStream inputStream) throws IOException {
File tempFile = File.createTempFile("kylin_stats_tmp", ".seq");
FileOutputStream out = null;
try {
out = new FileOutputStream(tempFile);
org.apache.commons.io.IOUtils.copy(inputStream, out);
} finally {
IOUtils.closeStream(inputStream);
IOUtils.closeStream(out);
}
return tempFile;
}
public Map<Long, HLLCounter> getCuboidRowHLLCounters() {
return this.cuboidRowEstimatesHLL;
}
public int getSamplingPercentage() {
return samplingPercentage;
}
public Map<Long, Long> getCuboidRowEstimatesHLL() {
return getCuboidRowCountMapFromSampling(cuboidRowEstimatesHLL, samplingPercentage);
}
// return map of Cuboid ID => MB
public Map<Long, Double> getCuboidSizeMap() {
return getCuboidSizeMapFromRowCount(seg, getCuboidRowEstimatesHLL(), sourceRowCount);
}
public double estimateCubeSize() {
return SumHelper.sumDouble(getCuboidSizeMap().values());
}
public int getMapperNumberOfFirstBuild() {
return mapperNumberOfFirstBuild;
}
public double getMapperOverlapRatioOfFirstBuild() {
return mapperOverlapRatioOfFirstBuild;
}
public static Map<Long, Long> getCuboidRowCountMapFromSampling(Map<Long, HLLCounter> hllcMap,
int samplingPercentage) {
Map<Long, Long> cuboidRowCountMap = Maps.newHashMap();
for (Map.Entry<Long, HLLCounter> entry : hllcMap.entrySet()) {
// No need to adjust according to the sampling percentage. The assumption is that the data set is far
// larger than its cardinality, so even a small sample should already see all distinct values.
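// For example, with 1% sampling and an HLL estimate of ~50,000 distinct rowkey values for a cuboid,
// the estimate is reported as ~50,000 rather than being scaled up by 100 (the numbers here are only
// an illustration of the assumption above).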
cuboidRowCountMap.put(entry.getKey(), entry.getValue().getCountEstimate());
}
return cuboidRowCountMap;
}
public static Map<Long, Double> getCuboidSizeMapFromRowCount(CubeSegment cubeSegment, Map<Long, Long> rowCountMap,
long sourceRowCount) {
final CubeDesc cubeDesc = cubeSegment.getCubeDesc();
final List<Integer> rowkeyColumnSize = Lists.newArrayList();
final Cuboid baseCuboid = Cuboid.getBaseCuboid(cubeDesc);
final List<TblColRef> columnList = baseCuboid.getColumns();
final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
final Long baseCuboidRowCount = rowCountMap.get(baseCuboid.getId());
for (int i = 0; i < columnList.size(); i++) {
rowkeyColumnSize.add(dimEncMap.get(columnList.get(i)).getLengthOfEncoding());
}
Map<Long, Double> sizeMap = Maps.newHashMap();
for (Map.Entry<Long, Long> entry : rowCountMap.entrySet()) {
sizeMap.put(entry.getKey(), estimateCuboidStorageSize(cubeSegment, entry.getKey(), entry.getValue(),
baseCuboid.getId(), baseCuboidRowCount, rowkeyColumnSize, sourceRowCount));
}
return sizeMap;
}
/**
* Estimate the cuboid's size
*
* @return the cuboid size in MB
*/
private static double estimateCuboidStorageSize(CubeSegment cubeSegment, long cuboidId, long rowCount,
long baseCuboidId, long baseCuboidCount, List<Integer> rowKeyColumnLength, long sourceRowCount) {
int rowkeyLength = cubeSegment.getRowKeyPreambleSize();
KylinConfig kylinConf = cubeSegment.getConfig();
long mask = Long.highestOneBit(baseCuboidId);
long parentCuboidIdActualLength = (long) Long.SIZE - Long.numberOfLeadingZeros(baseCuboidId);
for (int i = 0; i < parentCuboidIdActualLength; i++) {
if ((mask & cuboidId) > 0) {
rowkeyLength += rowKeyColumnLength.get(i); //colIO.getColumnLength(columnList.get(i));
}
mask = mask >> 1;
}
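// For example, with baseCuboidId = 0b111 (three rowkey columns) and cuboidId = 0b101, the loop above
// adds the encoded lengths of the first and third rowkey columns to rowkeyLength.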
// add the measure length
int normalSpace = rowkeyLength;
int countDistinctSpace = 0;
double percentileSpace = 0;
int topNSpace = 0;
for (MeasureDesc measureDesc : cubeSegment.getCubeDesc().getMeasures()) {
if (rowCount == 0)
break;
DataType returnType = measureDesc.getFunction().getReturnDataType();
if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_COUNT_DISTINCT)) {
long estimateDistinctCount = sourceRowCount / rowCount;
estimateDistinctCount = estimateDistinctCount == 0 ? 1L : estimateDistinctCount;
countDistinctSpace += returnType.getStorageBytesEstimate(estimateDistinctCount);
} else if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_PERCENTILE)) {
percentileSpace += returnType.getStorageBytesEstimate(baseCuboidCount * 1.0 / rowCount);
} else if (measureDesc.getFunction().getExpression().equals(TopNMeasureType.FUNC_TOP_N)) {
long estimateTopNCount = sourceRowCount / rowCount;
estimateTopNCount = estimateTopNCount == 0 ? 1L : estimateTopNCount;
topNSpace += returnType.getStorageBytesEstimate(estimateTopNCount);
} else {
normalSpace += returnType.getStorageBytesEstimate();
}
}
double cuboidSizeRatio = kylinConf.getJobCuboidSizeRatio();
double cuboidSizeMemHungryRatio = kylinConf.getJobCuboidSizeCountDistinctRatio();
double cuboidSizeTopNRatio = kylinConf.getJobCuboidSizeTopNRatio();
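// Rough shape of the estimate computed below: each *Space value is a per-row byte estimate, the
// count-distinct and TopN terms are weighted by empirical ratios read from KylinConfig above, and
// the sum is converted from bytes to megabytes:
//   sizeMB = (normalSpace * sizeRatio + countDistinctSpace * memHungryRatio
//             + percentileSpace + topNSpace * topNRatio) * rowCount / (1024 * 1024)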
double ret = (1.0 * normalSpace * rowCount * cuboidSizeRatio
+ 1.0 * countDistinctSpace * rowCount * cuboidSizeMemHungryRatio + 1.0 * percentileSpace * rowCount
+ 1.0 * topNSpace * rowCount * cuboidSizeTopNRatio) / (1024L * 1024L);
return ret;
}
private void print(PrintWriter out) {
Map<Long, Long> cuboidRows = getCuboidRowEstimatesHLL();
Map<Long, Double> cuboidSizes = getCuboidSizeMap();
List<Long> cuboids = new ArrayList<>(cuboidRows.keySet());
Collections.sort(cuboids);
out.println("============================================================================");
out.println("Statistics of " + seg);
out.println();
out.println(
"Cube statistics hll precision: " + cuboidRowEstimatesHLL.values().iterator().next().getPrecision());
out.println("Total cuboids: " + cuboidRows.size());
out.println("Total estimated rows: " + SumHelper.sumLong(cuboidRows.values()));
out.println("Total estimated size(MB): " + SumHelper.sumDouble(cuboidSizes.values()));
out.println("Sampling percentage: " + samplingPercentage);
out.println("Mapper overlap ratio: " + mapperOverlapRatioOfFirstBuild);
out.println("Mapper number: " + mapperNumberOfFirstBuild);
printKVInfo(out);
printCuboidInfoTreeEntry(cuboidRows, cuboidSizes, out);
out.println("----------------------------------------------------------------------------");
}
//return MB
public double estimateLayerSize(int level) {
if (cuboidScheduler == null) {
throw new UnsupportedOperationException("cuboid scheduler is null");
}
List<List<Long>> layeredCuboids = cuboidScheduler.getCuboidsByLayer();
Map cuboidSizeMap = getCuboidSizeMap();
double ret = 0;
for (Long cuboidId : layeredCuboids.get(level)) {
ret += cuboidSizeMap.get(cuboidId) == null ? 0.0 : cuboidSizeMap.get(cuboidId);
}
logger.info("Estimating size for layer {}, all cuboids are {}, total size is {}", level,
StringUtils.join(layeredCuboids.get(level), ","), ret);
return ret;
}
public List<Long> getCuboidsByLayer(int level) {
if (cuboidScheduler == null) {
throw new UnsupportedOperationException("cuboid scheduler is null");
}
List<List<Long>> layeredCuboids = cuboidScheduler.getCuboidsByLayer();
return layeredCuboids.get(level);
}
private void printCuboidInfoTreeEntry(Map<Long, Long> cuboidRows, Map<Long, Double> cuboidSizes, PrintWriter out) {
if (cuboidScheduler == null) {
throw new UnsupportedOperationException("cuboid scheduler is null");
}
long baseCuboid = Cuboid.getBaseCuboidId(seg.getCubeDesc());
int dimensionCount = Long.bitCount(baseCuboid);
printCuboidInfoTree(-1L, baseCuboid, cuboidScheduler, cuboidRows, cuboidSizes, dimensionCount, 0, out);
}
private void printKVInfo(PrintWriter writer) {
Cuboid cuboid = Cuboid.getBaseCuboid(seg.getCubeDesc());
RowKeyEncoder encoder = new RowKeyEncoder(seg, cuboid);
for (TblColRef col : cuboid.getColumns()) {
writer.println("Length of dimension " + col + " is " + encoder.getColumnLength(col));
}
}
private static void printCuboidInfoTree(long parent, long cuboidID, final CuboidScheduler scheduler,
Map<Long, Long> cuboidRows, Map<Long, Double> cuboidSizes, int dimensionCount, int depth, PrintWriter out) {
printOneCuboidInfo(parent, cuboidID, cuboidRows, cuboidSizes, dimensionCount, depth, out);
List<Long> children = scheduler.getSpanningCuboid(cuboidID);
Collections.sort(children);
for (Long child : children) {
printCuboidInfoTree(cuboidID, child, scheduler, cuboidRows, cuboidSizes, dimensionCount, depth + 1, out);
}
}
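// Each line emitted by printOneCuboidInfo below has the form
//   |---- Cuboid <display name>, est row: <rows>, est MB: <size>[, shrink: <child/parent row ratio %>]
// where the indentation reflects the depth of the cuboid in the spanning tree.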
private static void printOneCuboidInfo(long parent, long cuboidID, Map<Long, Long> cuboidRows,
Map<Long, Double> cuboidSizes, int dimensionCount, int depth, PrintWriter out) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < depth; i++) {
sb.append(" ");
}
String cuboidName = Cuboid.getDisplayName(cuboidID, dimensionCount);
sb.append("|---- Cuboid ").append(cuboidName);
long rowCount = cuboidRows.get(cuboidID);
double size = cuboidSizes.get(cuboidID);
sb.append(", est row: ").append(rowCount).append(", est MB: ").append(formatDouble(size));
if (parent != -1) {
sb.append(", shrink: ").append(formatDouble(100.0 * cuboidRows.get(cuboidID) / cuboidRows.get(parent)))
.append("%");
}
out.println(sb.toString());
}
private static String formatDouble(double input) {
return new DecimalFormat("#.##", DecimalFormatSymbols.getInstance(Locale.ROOT)).format(input);
}
public static class CubeStatsResult {
private int percentage = 100;
private double mapperOverlapRatio = 0;
private long sourceRecordCount = 0;
private int mapperNumber = 0;
private Map<Long, HLLCounter> counterMap = Maps.newHashMap();
public CubeStatsResult(Path path, int precision) throws IOException {
Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
Option seqInput = SequenceFile.Reader.file(path);
try (Reader reader = new SequenceFile.Reader(hadoopConf, seqInput)) {
LongWritable key = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), hadoopConf);
BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), hadoopConf);
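// Key convention in the stats sequence file, as interpreted below: 0 -> sampling percentage,
// -1 -> mapper overlap ratio, -2 -> mapper number, -3 -> source record count, and any positive
// key -> the serialized HLL counter for that cuboid id.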
while (reader.next(key, value)) {
if (key.get() == 0L) {
percentage = Bytes.toInt(value.getBytes());
} else if (key.get() == -1) {
mapperOverlapRatio = Bytes.toDouble(value.getBytes());
} else if (key.get() == -2) {
mapperNumber = Bytes.toInt(value.getBytes());
} else if (key.get() == -3) {
sourceRecordCount = Bytes.toLong(value.getBytes());
} else if (key.get() > 0) {
HLLCounter hll = new HLLCounter(precision);
ByteArray byteArray = new ByteArray(value.getBytes());
hll.readRegisters(byteArray.asBuffer());
counterMap.put(key.get(), hll);
}
}
}
}
public int getPercentage() {
return percentage;
}
public double getMapperOverlapRatio() {
return mapperOverlapRatio;
}
public int getMapperNumber() {
return mapperNumber;
}
public Map<Long, HLLCounter> getCounterMap() {
return Collections.unmodifiableMap(counterMap);
}
public long getSourceRecordCount() {
return sourceRecordCount;
}
}
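// Besides the CLI entry point below, the reader can also be used programmatically. A minimal
// sketch (the cube name "sample_cube" is only an illustration):
//   KylinConfig config = KylinConfig.getInstanceFromEnv();
//   CubeInstance cube = CubeManager.getInstance(config).getCube("sample_cube");
//   CubeStatsReader reader = new CubeStatsReader(cube.getSegments().get(0), config);
//   System.out.println("Estimated cube size (MB): " + reader.estimateCubeSize());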
public static void main(String[] args) throws IOException {
System.out.println("CubeStatsReader is used to read cube statistics saved in the metadata store");
KylinConfig config = KylinConfig.getInstanceFromEnv();
CubeInstance cube = CubeManager.getInstance(config).getCube(args[0]);
List<CubeSegment> segments = cube.getSegments();
PrintWriter out = new PrintWriter(
new BufferedWriter(new OutputStreamWriter(System.out, StandardCharsets.UTF_8)));
for (CubeSegment seg : segments) {
try {
new CubeStatsReader(seg, config).print(out);
} catch (Exception e) {
logger.info("CubeStatsReader for Segment {} failed, skip it.", seg.getName(), e);
}
}
out.flush();
}
}