All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.kylin.source.hive.MRHiveDictUtil Maven / Gradle / Ivy

There is a newer version: 5.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.kylin.source.hive;

import com.google.common.collect.ImmutableList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.livy.LivyRestBuilder;
import org.apache.kylin.common.livy.LivyRestExecutor;
import org.apache.kylin.common.livy.LivyTypeEnum;
import org.apache.kylin.job.JoinedFlatTable;
import org.apache.kylin.job.common.PatternedLogger;
import org.apache.kylin.job.constant.ExecutableConstants;
import org.apache.kylin.job.execution.ExecutableManager;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that build the Hive statements for a flat table's
 * {@code <flatTable>__group_by} companion table and that submit SQL through Livy.
 *
 * <p>All methods are static and keep no state of their own.
 */
public class MRHiveDictUtil {
    private static final Logger logger = LoggerFactory.getLogger(MRHiveDictUtil.class);

    /** Matches a {@code LOCATION '...';} clause; capture group 1 is the quoted location. */
    protected static final Pattern HDFS_LOCATION = Pattern.compile("LOCATION \'(.*)\';");

    /** Suffixes / path fragments used when deriving names from a flat table name. */
    public enum DictHiveType {
        GroupBy("group_by"), MrDictLockPath("/mr_dict_lock/");

        private final String name;

        DictHiveType(String name) {
            this.name = name;
        }

        /**
         * @return the literal fragment associated with this constant
         */
        public String getName() {
            return name;
        }
    }

    /**
     * Builds the {@code DROP TABLE IF EXISTS} statement for the group-by table of {@code flatDesc}.
     *
     * @param flatDesc flat table descriptor whose table name is the base of the group-by table name
     * @return the statement, terminated by {@code ";"} and a trailing {@code " \n"}
     */
    public static String generateDropTableStatement(IJoinedFlatTableDesc flatDesc) {
        String groupByTable = getHiveTableName(flatDesc, DictHiveType.GroupBy);
        return new StringBuilder()
                .append("DROP TABLE IF EXISTS ").append(groupByTable).append(";")
                .append(" \n")
                .toString();
    }

    /**
     * Builds the {@code CREATE TABLE IF NOT EXISTS} statement for the group-by table of
     * {@code flatDesc}: a single {@code dict_key STRING} column, partitioned by
     * {@code dict_column} and stored as a sequence file.
     *
     * @param flatDesc flat table descriptor whose table name is the base of the group-by table name
     * @return the multi-line DDL statement, terminated by {@code ";\n"}
     */
    public static String generateCreateTableStatement(IJoinedFlatTableDesc flatDesc) {
        String groupByTable = getHiveTableName(flatDesc, DictHiveType.GroupBy);

        StringBuilder ddl = new StringBuilder();
        ddl.append("CREATE TABLE IF NOT EXISTS ").append(groupByTable).append(" \n");
        ddl.append("( \n ");
        ddl.append("dict_key STRING COMMENT '' \n");
        ddl.append(") \n");
        ddl.append("COMMENT '' \n");
        ddl.append("PARTITIONED BY (dict_column string) \n");
        ddl.append("STORED AS SEQUENCEFILE \n");
        ddl.append(";").append("\n");
        return ddl.toString();
    }

    /**
     * Builds an {@code INSERT OVERWRITE} statement that fills the {@code dict_column = dictColumn}
     * partition of the group-by table with the grouped values of that column, selected from the
     * flat table.
     *
     * @param flatDesc   flat table descriptor providing the table name and its columns
     * @param dictColumn column name to look up (case-insensitively) among the flat table columns
     * @return the multi-line statement, terminated by {@code ";\n"}
     * @throws IllegalArgumentException if no flat table column matches {@code dictColumn}
     */
    public static String generateInsertDataStatement(IJoinedFlatTableDesc flatDesc, String dictColumn) {
        String groupByTable = getHiveTableName(flatDesc, DictHiveType.GroupBy);

        TblColRef col = findColumn(flatDesc, dictColumn);
        if (col == null) {
            String msg = "Can not find correct column for " + dictColumn + ", please check 'kylin.dictionary.mr-hive.columns'";
            logger.error(msg);
            throw new IllegalArgumentException(msg);
        }

        // NOTE(review): the lookup above names columns with flatDesc.useAlias(), while the
        // statement below uses the single-argument colName(col) — confirm both resolve to the
        // same name when useAlias() is false.
        String selectedColumn = JoinedFlatTable.colName(col);

        StringBuilder sql = new StringBuilder();
        sql.append("SELECT" + "\n");
        sql.append(selectedColumn).append(" \n");

        MRHiveDictUtil.appendJoinStatement(flatDesc, sql);

        //group by
        sql.append("GROUP BY ");
        sql.append(selectedColumn).append(" \n");

        return "INSERT OVERWRITE TABLE " + groupByTable + " \n"
                + "PARTITION (dict_column = '" + dictColumn + "')" + " \n"
                + sql + ";\n";
    }

    /**
     * Returns the first flat table column whose name equals {@code dictColumn} ignoring case,
     * or {@code null} when none matches.
     */
    private static TblColRef findColumn(IJoinedFlatTableDesc flatDesc, String dictColumn) {
        for (TblColRef candidate : flatDesc.getAllColumns()) {
            if (JoinedFlatTable.colName(candidate, flatDesc.useAlias()).equalsIgnoreCase(dictColumn)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Derives a name by joining the flat table name and the type's fragment with {@code "__"}.
     *
     * @param flatDesc     flat table descriptor supplying the base table name
     * @param dictHiveType constant whose {@link DictHiveType#getName()} is appended
     * @return {@code <flatTableName>__<fragment>}
     */
    public static String getHiveTableName(IJoinedFlatTableDesc flatDesc, DictHiveType dictHiveType) {
        // Plain concatenation: the value never leaves this method, so no synchronized buffer is needed.
        return flatDesc.getTableName() + "__" + dictHiveType.getName();
    }

    /**
     * Appends the {@code FROM <flatTableName>} line to {@code sql}.
     *
     * @param flatDesc flat table descriptor supplying the table name
     * @param sql      builder that receives the clause
     */
    public static void appendJoinStatement(IJoinedFlatTableDesc flatDesc, StringBuilder sql) {
        sql.append("FROM ").append(flatDesc.getTableName()).append("\n");
    }

    /**
     * Submits the given SQL statements as one Livy SQL job and records the step's info on the job.
     *
     * <p>The Hive config overrides from {@code config} are rendered first, followed by every
     * statement in {@code sqls}. After execution, if the submitted text contains a
     * {@code LOCATION '...';} clause, the size of that location is stored under
     * {@link ExecutableConstants#HDFS_BYTES_WRITTEN} in the info map.
     *
     * @param stepLogger        logger passed to the Livy executor; also the source of the info map
     * @param config            Kylin config supplying the Hive config overrides
     * @param sqls              statements to concatenate, in order, into the job arguments
     * @param executableManager manager that receives the info map for {@code jobId}
     * @param jobId             id of the job the info is attached to
     * @throws IOException if the location's size cannot be read from the file system
     */
    public static void runLivySqlJob(PatternedLogger stepLogger, KylinConfig config, ImmutableList<String> sqls,
            ExecutableManager executableManager, String jobId) throws IOException {
        final LivyRestBuilder livyRestBuilder = new LivyRestBuilder();
        livyRestBuilder.overwriteHiveProps(config.getHiveConfigOverride());

        StringBuilder argsBuilder = new StringBuilder();
        argsBuilder.append(livyRestBuilder.parseProps());
        for (String sql : sqls) {
            argsBuilder.append(sql);
        }
        String args = argsBuilder.toString();
        livyRestBuilder.addArgs(args);

        stepLogger.log("Create and distribute table. ");
        livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.sql);

        LivyRestExecutor executor = new LivyRestExecutor();
        executor.execute(livyRestBuilder, stepLogger);

        Map<String, String> info = stepLogger.getInfo();
        //get the flat Hive table size
        Matcher matcher = HDFS_LOCATION.matcher(args);
        if (matcher.find()) {
            String hiveFlatTableHdfsUrl = matcher.group(1);
            long size = getFileSize(hiveFlatTableHdfsUrl);
            info.put(ExecutableConstants.HDFS_BYTES_WRITTEN, String.valueOf(size));
            logger.info("HDFS_Bytes_Writen: {}", size);
        }
        executableManager.addJobInfo(jobId, info);
    }

    /**
     * Returns the length reported by the content summary of {@code hdfsUrl}.
     *
     * @param hdfsUrl location to measure, resolved with a default {@link Configuration}
     * @throws IOException if the file system cannot be obtained or queried
     */
    private static long getFileSize(String hdfsUrl) throws IOException {
        Path path = new Path(hdfsUrl);
        FileSystem fs = path.getFileSystem(new Configuration());
        ContentSummary contentSummary = fs.getContentSummary(path);
        return contentSummary.getLength();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy