// org.apache.kylin.engine.mr.steps.UHCDictionaryReducer
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.engine.mr.steps;
import static org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer.DICT_FILE_POSTFIX;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.ArrayPrimitiveWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.ClassUtil;
import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.dict.DictionaryGenerator;
import org.apache.kylin.dict.DictionaryInfo;
import org.apache.kylin.dict.IDictionaryBuilder;
import org.apache.kylin.engine.mr.KylinReducer;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reducer that builds a dictionary for a single ultra-high-cardinality (UHC) column.
 * <p>
 * Each reducer task is bound to exactly one UHC column (selected by the task id in
 * {@link #doSetup}), accumulates every distinct value routed to it, and on cleanup
 * serializes the finished {@link Dictionary} to
 * {@code <baseDir>/<colIdentity>/<colName><DICT_FILE_POSTFIX>} via {@link MultipleOutputs}.
 */
public class UHCDictionaryReducer extends KylinReducer<SelfDefineSortableKey, NullWritable, NullWritable, BytesWritable> {
    private static final Logger logger = LoggerFactory.getLogger(UHCDictionaryReducer.class);

    private IDictionaryBuilder builder;
    private TblColRef col;
    private MultipleOutputs mos;

    @Override
    protected void doSetup(Context context) throws IOException {
        super.bindCurrentConfiguration(context.getConfiguration());
        Configuration conf = context.getConfiguration();
        mos = new MultipleOutputs(context);

        KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
        String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
        CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

        // One reducer task per UHC column: the task id doubles as the index into
        // the UHC column list, so this reducer sees values of exactly one column.
        int taskId = context.getTaskAttemptID().getTaskID().getId();
        col = uhcColumns.get(taskId);
        logger.info("column name: {}", col.getIdentity());

        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            // Shard-by columns use the plain per-segment dictionary builder.
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            // Global dictionary columns use the builder class configured on the cube,
            // persisting under the configured HDFS base directory.
            String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }
    }

    /**
     * Feeds one distinct column value into the dictionary builder.
     * The first byte of the key is a type flag added by the mapper side, so the
     * actual value starts at offset 1.
     */
    @Override
    public void doReduce(SelfDefineSortableKey skey, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        Text key = skey.getText();
        String value = Bytes.toString(key.getBytes(), 1, key.getLength() - 1);
        builder.addValue(value);
    }

    @Override
    protected void doCleanup(Context context) throws IOException, InterruptedException {
        Dictionary<String> dict = builder.build();
        outputDict(col, dict);
        // Close here (the task lifecycle method) rather than inside outputDict,
        // so the MultipleOutputs is flushed exactly once at end of task.
        mos.close();
    }

    /**
     * Serializes the dictionary (class name header + dictionary payload) and writes it
     * through MultipleOutputs to baseDir/colIdentity/colName.rldict-r-00000 (etc).
     */
    private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

        try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
                DataOutputStream outputStream = new DataOutputStream(baos)) {
            // Header records the concrete dictionary class so readers can instantiate it.
            outputStream.writeUTF(dict.getClass().getName());
            dict.write(outputStream);

            mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                    new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy