org.apache.kylin.engine.mr.steps.MergeDictionaryMapper

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.kylin.engine.mr.steps;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.persistence.ResourceStore;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.StringUtil;
import org.apache.kylin.cube.CubeDescManager;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.dict.DictionaryInfo;
import org.apache.kylin.dict.DictionaryManager;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.CubeStatsWriter;
import org.apache.kylin.engine.mr.common.SerializableConfiguration;
import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

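/**
 * Mapper of the merge-dictionary MR job. Each input key is an index: values
 * below the number of dictionary columns select a single column whose
 * per-segment dictionaries are merged, while any larger value triggers the
 * merge of the merging segments' cuboid statistics. Each task emits a record
 * of the form (-1, "tableAlias:column=dictionaryResourcePath") or (-1, "")
 * for the downstream reducer.
 */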
public class MergeDictionaryMapper extends KylinMapper<IntWritable, NullWritable, IntWritable, Text> {
    private static final Logger logger = LoggerFactory.getLogger(MergeDictionaryMapper.class);

    List<CubeSegment> mergingSegments;
    TblColRef[] tblColRefs;
    DictionaryManager dictMgr;

    @Override
    protected void doSetup(Context context) throws IOException, InterruptedException {
        super.doSetup(context);

        final SerializableConfiguration sConf = new SerializableConfiguration(context.getConfiguration());
        final String metaUrl = context.getConfiguration().get(BatchConstants.ARG_META_URL);
        final String cubeName = context.getConfiguration().get(BatchConstants.ARG_CUBE_NAME);
        final String segmentIds = context.getConfiguration().get(MergeDictionaryJob.OPTION_MERGE_SEGMENT_IDS.getOpt());

        final KylinConfig kylinConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);
        final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
        final CubeDesc cubeDesc = CubeDescManager.getInstance(kylinConfig).getCubeDesc(cubeInstance.getDescName());

        mergingSegments = getMergingSegments(cubeInstance, StringUtil.splitByComma(segmentIds));
        tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
        dictMgr = DictionaryManager.getInstance(kylinConfig);
    }

    @Override
    protected void doMap(IntWritable key, NullWritable value, Context context)
            throws IOException, InterruptedException {

        int index = key.get();

        if (index < tblColRefs.length) {
            // merge dictionary
            TblColRef col = tblColRefs[index];
            List<DictionaryInfo> dictInfos = Lists.newArrayList();
            for (CubeSegment segment : mergingSegments) {
                if (segment.getDictResPath(col) != null) {
                    DictionaryInfo dictInfo = dictMgr.getDictionaryInfo(segment.getDictResPath(col));
                    if (dictInfo != null && !dictInfos.contains(dictInfo)) {
                        dictInfos.add(dictInfo);
                    }
                }
            }

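            // Merge the distinct per-segment dictionaries into a single dictionary and
            // report the merged dictionary's resource path for this column.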
            DictionaryInfo mergedDictInfo = dictMgr.mergeDictionary(dictInfos);
            String tblCol = col.getTableAlias() + ":" + col.getName();
            String dictInfoPath = mergedDictInfo == null ? "" : mergedDictInfo.getResourcePath();

            context.write(new IntWritable(-1), new Text(tblCol + "=" + dictInfoPath));

        } else {
            // merge statistics
            KylinConfig kylinConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(
                    new SerializableConfiguration(context.getConfiguration()),
                    context.getConfiguration().get(BatchConstants.ARG_META_URL));

            final String cubeName = context.getConfiguration().get(BatchConstants.ARG_CUBE_NAME);
            final String segmentId = context.getConfiguration().get(BatchConstants.ARG_SEGMENT_ID);
            final String statOutputPath = context.getConfiguration()
                    .get(MergeDictionaryJob.OPTION_OUTPUT_PATH_STAT.getOpt());
            CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);

            logger.info("Statistics output path: {}", statOutputPath);

            CubeSegment newSegment = cubeInstance.getSegmentById(segmentId);
            ResourceStore rs = ResourceStore.getStore(kylinConfig);

            Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
            Configuration conf = null;
            int averageSamplingPercentage = 0;

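            // Copy each merging segment's statistics file out of the resource store
            // into a local temp file, then read it back as a SequenceFile of
            // (cuboidId, serialized HLL registers) records.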
            for (CubeSegment cubeSegment : mergingSegments) {
                String filePath = cubeSegment.getStatisticsResourcePath();
                InputStream is = rs.getResource(filePath).content();
                File tempFile;
                FileOutputStream tempFileStream = null;

                try {
                    tempFile = File.createTempFile(segmentId, ".seq");
                    tempFileStream = new FileOutputStream(tempFile);
                    org.apache.commons.io.IOUtils.copy(is, tempFileStream);
                } finally {
                    IOUtils.closeStream(is);
                    IOUtils.closeStream(tempFileStream);
                }

                FileSystem fs = HadoopUtil.getFileSystem("file:///" + tempFile.getAbsolutePath());
                SequenceFile.Reader reader = null;
                try {
                    conf = HadoopUtil.getCurrentConfiguration();
                    //noinspection deprecation
                    reader = new SequenceFile.Reader(fs, new Path(tempFile.getAbsolutePath()), conf);
                    LongWritable keyW = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
                    BytesWritable valueW = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

                    while (reader.next(keyW, valueW)) {
                        if (keyW.get() == 0L) {
                            // key 0 holds this segment's sampling percentage
                            averageSamplingPercentage += Bytes.toInt(valueW.getBytes());
                        } else if (keyW.get() > 0) {
                            HLLCounter hll = new HLLCounter(kylinConfig.getCubeStatsHLLPrecision());
                            ByteArray byteArray = new ByteArray(valueW.getBytes());
                            hll.readRegisters(byteArray.asBuffer());

                            if (cuboidHLLMap.get(keyW.get()) != null) {
                                cuboidHLLMap.get(keyW.get()).merge(hll);
                            } else {
                                cuboidHLLMap.put(keyW.get(), hll);
                            }
                        }
                    }
                } catch (Exception e) {
                    logger.error("Failed to read cube statistics from {}", filePath, e);
                    throw e;
                } finally {
                    IOUtils.closeStream(reader);
                }
            }

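            // Average the sampling percentage over all merged segments and write the
            // combined per-cuboid HLL sketches to the statistics output path.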
            averageSamplingPercentage = averageSamplingPercentage / mergingSegments.size();
            CubeStatsWriter.writeCuboidStatistics(conf, new Path(statOutputPath), cuboidHLLMap,
                    averageSamplingPercentage);
            Path statisticsFilePath = new Path(statOutputPath,
                    BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);

            FileSystem fs = HadoopUtil.getFileSystem(statisticsFilePath, conf);
            FSDataInputStream fis = fs.open(statisticsFilePath);

            try {
                // put the statistics to metadata store
                String statisticsFileName = newSegment.getStatisticsResourcePath();
                rs.putResource(statisticsFileName, fis, System.currentTimeMillis());
            } finally {
                IOUtils.closeStream(fis);
            }

            context.write(new IntWritable(-1), new Text(""));
        }
    }

    private List<CubeSegment> getMergingSegments(CubeInstance cube, String[] segmentIds) {
        List<CubeSegment> result = Lists.newArrayListWithCapacity(segmentIds.length);
        for (String id : segmentIds) {
            result.add(cube.getSegmentById(id));
        }
        return result;
    }
}
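
The statistics branch above merges per-cuboid HyperLogLog sketches by reading each segment's serialized registers and folding them into one counter per cuboid. Below is a minimal standalone sketch of that merge (the class name HllMergeSketch and the precision value are only illustrative), assuming the add(), merge() and getCountEstimate() methods of org.apache.kylin.measure.hllc.HLLCounter; the mapper itself only relies on readRegisters() and merge().

import org.apache.kylin.measure.hllc.HLLCounter;

public class HllMergeSketch {
    public static void main(String[] args) {
        // Two counters standing in for one cuboid's sketches from two merging segments.
        // Precision 14 is an example value; the mapper uses kylinConfig.getCubeStatsHLLPrecision().
        HLLCounter segmentA = new HLLCounter(14);
        HLLCounter segmentB = new HLLCounter(14);

        for (int i = 0; i < 1000; i++) {
            segmentA.add("rowkey-" + i);          // values seen in segment A
            segmentB.add("rowkey-" + (i + 500));  // overlapping values seen in segment B
        }

        // Same operation as cuboidHLLMap.get(cuboidId).merge(hll) in doMap():
        // the merged sketch estimates the distinct count across both segments (~1500).
        segmentA.merge(segmentB);
        System.out.println("estimated distinct values: " + segmentA.getCountEstimate());
    }
}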



