org.apache.kylin.engine.mr.steps.ExtractDictionaryFromGlobalMapper Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.engine.mr.steps;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.cube.model.CubeJoinedFlatTableEnrich;
import org.apache.kylin.dict.ShrunkenDictionary;
import org.apache.kylin.dict.ShrunkenDictionaryBuilder;
import org.apache.kylin.engine.EngineFactory;
import org.apache.kylin.engine.mr.IMRInput;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.MRUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.DictionaryGetterUtil;
import org.apache.kylin.metadata.model.TblColRef;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class ExtractDictionaryFromGlobalMapper extends KylinMapper {
private String cubeName;
private CubeDesc cubeDesc;
private CubeInstance cube;
private CubeSegment cubeSeg;
private IMRInput.IMRTableInputFormat flatTableInputFormat;
private CubeJoinedFlatTableEnrich intermediateTableDesc;
private List globalColumns;
private int[] globalColumnIndex;
private List> globalColumnValues;
private List> globalDicts;
private String splitKey;
private KylinConfig config;
@Override
protected void doSetup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
bindCurrentConfiguration(conf);
config = AbstractHadoopJob.loadKylinPropsAndMetadata();
cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
cube = CubeManager.getInstance(config).getCube(cubeName);
cubeDesc = cube.getDescriptor();
cubeSeg = cube.getSegmentById(conf.get(BatchConstants.CFG_CUBE_SEGMENT_ID));
flatTableInputFormat = MRUtil.getBatchCubingInputSide(cubeSeg).getFlatTableInputFormat();
intermediateTableDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSeg), cubeDesc);
globalColumns = cubeDesc.getAllGlobalDictColumns();
globalColumnIndex = new int[globalColumns.size()];
globalColumnValues = Lists.newArrayListWithExpectedSize(globalColumns.size());
for (int i = 0; i < globalColumns.size(); i++) {
TblColRef colRef = globalColumns.get(i);
int columnIndexOnFlatTbl = intermediateTableDesc.getColumnIndex(colRef);
globalColumnIndex[i] = columnIndexOnFlatTbl;
globalColumnValues.add(Sets. newHashSet());
}
splitKey = DictionaryGetterUtil.getInputSplitSignature(cubeSeg, context.getInputSplit());
}
@Override
public void doMap(KEYIN key, Object record, Context context) throws IOException, InterruptedException {
Collection rowCollection = flatTableInputFormat.parseMapperInput(record);
for (String[] row : rowCollection) {
for (int i = 0; i < globalColumnIndex.length; i++) {
String fieldValue = row[globalColumnIndex[i]];
if (fieldValue == null)
continue;
globalColumnValues.get(i).add(fieldValue);
}
}
}
@Override
protected void doCleanup(Context context) throws IOException, InterruptedException {
FileSystem fs = FileSystem.get(context.getConfiguration());
Path outputDirBase = new Path(context.getConfiguration().get(FileOutputFormat.OUTDIR));
globalDicts = Lists.newArrayListWithExpectedSize(globalColumns.size());
Map> dictionaryMap = cubeSeg.buildDictionaryMap();
for (int i = 0; i < globalColumns.size(); i++) {
TblColRef colRef = globalColumns.get(i);
globalDicts.add(dictionaryMap.get(colRef));
}
ShrunkenDictionary.StringValueSerializer strValueSerializer = new ShrunkenDictionary.StringValueSerializer();
for (int i = 0; i < globalColumns.size(); i++) {
List colDistinctValues = Lists.newArrayList(globalColumnValues.get(i));
if (colDistinctValues.size() == 0) {
continue;
}
// sort values to accelerate the encoding process by reducing the swapping of global dictionary slices
Collections.sort(colDistinctValues);
ShrunkenDictionaryBuilder dictBuilder = new ShrunkenDictionaryBuilder<>(globalDicts.get(i));
for (String colValue : colDistinctValues) {
dictBuilder.addValue(colValue);
}
Dictionary shrunkenDict = dictBuilder.build(strValueSerializer);
Path colDictDir = new Path(outputDirBase, globalColumns.get(i).getIdentity());
if (!fs.exists(colDictDir)) {
fs.mkdirs(colDictDir);
}
try (DataOutputStream dos = fs.create(new Path(colDictDir, splitKey))) {
shrunkenDict.write(dos);
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy