/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kylin.rest.service;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.exception.KylinException;
import org.apache.kylin.common.exception.ServerErrorCode;
import org.apache.kylin.common.msg.MsgPicker;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.StringHelper;
import org.apache.kylin.guava30.shaded.common.base.Strings;
import org.apache.kylin.guava30.shaded.common.collect.Lists;
import org.apache.kylin.guava30.shaded.common.collect.Maps;
import org.apache.kylin.metadata.model.NTableMetadataManager;
import org.apache.kylin.metadata.project.NProjectManager;
import org.apache.kylin.rest.request.DDLRequest;
import org.apache.kylin.rest.response.DDLResponse;
import org.apache.kylin.rest.response.ExportTablesResponse;
import org.apache.kylin.rest.response.TableNameResponse;
import org.apache.kylin.source.ISourceMetadataExplorer;
import org.apache.kylin.source.SourceFactory;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.DDLDesc;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.DdlOperation;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparderEnv;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalog.Database;
import org.apache.spark.sql.catalog.Table;
import org.apache.spark.sql.catalyst.TableIdentifier;
import org.apache.spark.sql.catalyst.catalog.CatalogTable;
import org.apache.spark.sql.delta.DeltaTableUtils;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.springframework.stereotype.Service;

import com.fasterxml.jackson.annotation.JsonProperty;

import lombok.Data;
import lombok.val;
import lombok.extern.slf4j.Slf4j;
import scala.Option;
import scala.collection.Iterator;
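
/**
 * Service for working with the shared Spark session ({@link SparderEnv}) as a data
 * source: executing DDL statements, browsing databases, tables and columns, exporting
 * table descriptions, and loading the bundled SSB sample data.
 */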

@Slf4j
@Service
public class SparkSourceService extends BasicService {

    // keys in the Hive column metadata map
    private static final String CHAR_VARCHAR_TYPE_STRING = "__CHAR_VARCHAR_TYPE_STRING";
    private static final String HIVE_COMMENT = "comment";

    // constants for samples
    private static final String TABLE_LINEORDER = "ssb.lineorder";
    private static final String VIEW_P_LINEORDER = "ssb.p_lineorder";
    private static final String CREATE_VIEW_P_LINEORDER = "create view if not exists SSB.P_LINEORDER as\n"
            + "        select LO_ORDERKEY,\n" + "        LO_LINENUMBER,\n" + "        LO_CUSTKEY,\n"
            + "        LO_PARTKEY,\n" + "        LO_SUPPKEY,\n" + "        LO_ORDERDATE,\n"
            + "        LO_ORDERPRIOTITY,\n" + "        LO_SHIPPRIOTITY,\n" + "        LO_QUANTITY,\n"
            + "        LO_EXTENDEDPRICE,\n" + "        LO_ORDTOTALPRICE,\n" + "        LO_DISCOUNT,\n"
            + "        LO_REVENUE,\n" + "        LO_SUPPLYCOST,\n" + "        LO_TAX,\n" + "        LO_COMMITDATE,\n"
            + "        LO_SHIPMODE,\n" + "        LO_EXTENDEDPRICE*LO_DISCOUNT as V_REVENUE\n"
            + "        from SSB.LINEORDER";

    public DDLResponse executeSQL(DDLRequest request) {
        List<String> sqlList = Arrays.asList(request.getSql().split(";"));
        if (!Strings.isNullOrEmpty(request.getDatabase())) {
            executeSQL("use " + request.getDatabase());
        }
        DDLResponse ddlResponse = new DDLResponse();
        Map<String, DDLDesc> succeed = Maps.newHashMap();
        Map<String, String> failed = Maps.newHashMap();
        sqlList.forEach(s -> {
            if (!Strings.isNullOrEmpty(s)) {
                try {
                    DDLDesc ddlDesc = executeSQL(s);
                    succeed.put(s, ddlDesc);
                } catch (Exception e) {
                    log.error("Failed to execute sql[{}]", s, e);
                    failed.put(s, e.getMessage());
                }
            }
        });
        ddlResponse.setSucceed(succeed);
        ddlResponse.setFailed(failed);
        return ddlResponse;
    }
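
    /**
     * Executes a single SQL statement through {@link DdlOperation} and returns the
     * parsed DDL description.
     */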

    public DDLDesc executeSQL(String sql) {
        return DdlOperation.executeSQL(sql);
    }
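
    /**
     * Drops the given table or view if it exists, issuing {@code DROP VIEW} or
     * {@code DROP TABLE} according to the catalog's table type; a no-op when the
     * table is absent.
     */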

    public void dropTable(String database, String table) throws AnalysisException {
        SparkSession ss = SparderEnv.getSparkSession();
        if (ss.catalog().tableExists(database, table)) {
            val t = ss.catalog().getTable(database, table);
            if ("view".equalsIgnoreCase(t.tableType())) {
                ss.sql(String.format(Locale.ROOT, "drop view %s.%s", database, table));
            } else {
                ss.sql(String.format(Locale.ROOT, "drop table %s.%s", database, table));
            }
        }
    }
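
    /**
     * Lists all databases visible to the shared Spark session, upper-cased.
     */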

    public List<String> listDatabase() {
        SparkSession sparkSession = SparderEnv.getSparkSession();
        Dataset<Database> databaseSet = sparkSession.catalog().listDatabases();
        List<Database> databases = databaseSet.collectAsList();
        return databases.stream().map(database -> database.name().toUpperCase(Locale.ROOT))
                .collect(Collectors.toList());
    }
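
    /**
     * Lists the tables in {@code db}. Without a project, names are read straight from
     * the Spark catalog; with a project, the project's source metadata explorer is
     * consulted and each table is flagged as loaded when Kylin already holds its table
     * descriptor.
     *
     * <p>Illustrative call (a sketch, not from the original source; the project name
     * and the response getters are assumed):
     *
     * <pre>{@code
     * List<TableNameResponse> tables = sparkSourceService.listTables("SSB", "demo_project");
     * tables.forEach(t -> System.out.println(t.getTableName() + " loaded=" + t.isLoaded()));
     * }</pre>
     */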

    public List<TableNameResponse> listTables(String db, String project) throws Exception {
        if (Strings.isNullOrEmpty(project)) {
            SparkSession sparkSession = SparderEnv.getSparkSession();
            Dataset<Table> tableDataset = sparkSession.catalog().listTables(db);
            List<Table> sparkTables = tableDataset.collectAsList();
            return sparkTables.stream()
                    .map(table -> new TableNameResponse(table.name().toUpperCase(Locale.ROOT), false))
                    .collect(Collectors.toList());
        }
        ISourceMetadataExplorer explr = SourceFactory.getSource(getManager(NProjectManager.class).getProject(project))
                .getSourceMetadataExplorer();
        List<String> tables = explr.listTables(db).stream().map(str -> str.toUpperCase(Locale.ROOT))
                .collect(Collectors.toList());
        List<TableNameResponse> tableNameResponses = Lists.newArrayList();
        tables.forEach(table -> {
            TableNameResponse response = new TableNameResponse();
            response.setLoaded(getManager(NTableMetadataManager.class, project).getTableDesc(db + "." + table) != null);
            response.setTableName(table);
            tableNameResponses.add(response);
        });
        return tableNameResponses;
    }

    /**
     * Lists the columns of a table, including partition flags; for Delta tables the
     * schema is read through the Spark table itself rather than the catalog metadata.
     */
    public List<ColumnModel> listColumns(String db, String table) {
        SparkSession sparkSession = SparderEnv.getSparkSession();
        CatalogTable catalogTable = sparkSession.sessionState().catalog()
                .getTempViewOrPermanentTableMetadata(new TableIdentifier(table, Option.apply(db)));
        scala.collection.immutable.List<StructField> structFieldList = catalogTable.schema().toList();
        if (DeltaTableUtils.isDeltaTable(catalogTable)) {
            structFieldList = sparkSession.table(catalogTable.identifier()).schema().toList();
        }
        Iterator<StructField> structFieldIterator = structFieldList.iterator();
        List<ColumnModel> columnModels = Lists.newArrayList();
        while (structFieldIterator.hasNext()) {
            StructField structField = structFieldIterator.next();
            String name = structField.name();
            String datatype = structField.dataType().simpleString();
            Metadata metadata = structField.metadata();
            ColumnModel columnModel = new ColumnModel();
            if (catalogTable.partitionColumnNames().contains(name)) {
                columnModel.setPartition(true);
            }
            columnModel.setName(name);
            columnModel.setDescription(metadata.contains(HIVE_COMMENT) ? metadata.getString(HIVE_COMMENT) : "");
            // use the Hive type string if it exists, otherwise the Spark data type
            columnModel.setDataType(metadata.contains(CHAR_VARCHAR_TYPE_STRING)
                    ? metadata.getString(CHAR_VARCHAR_TYPE_STRING)
                    : datatype);
            columnModels.add(columnModel);
        }
        return columnModels;
    }

    public String getTableDesc(String database, String table) {
        return DdlOperation.getTableDesc(database, table);
    }

    /**
     * Exports the DDL of the given tables, validating that the database and every
     * table exist before reading their descriptions.
     */
    public ExportTablesResponse exportTables(String database, String[] tables) {
        if (Strings.isNullOrEmpty(database)) {
            throw new KylinException(ServerErrorCode.INVALID_PARAMETER, MsgPicker.getMsg().getEmptyDatabase());
        }
        if (tables.length == 0) {
            throw new KylinException(ServerErrorCode.INVALID_PARAMETER, MsgPicker.getMsg().getEmptyTableList());
        }
        if (!databaseExists(database)) {
            throw new KylinException(ServerErrorCode.INVALID_PARAMETER,
                    String.format(Locale.ROOT, MsgPicker.getMsg().getDatabaseNotExist(), database));
        }
        val tableResponse = new ExportTablesResponse();
        Map<String, String> tableDesc = Maps.newHashMap();
        for (String table : tables) {
            if (table.isEmpty()) {
                throw new KylinException(ServerErrorCode.INVALID_PARAMETER, MsgPicker.getMsg().getEmptyTableList());
            }
            if (!tableExists(database, table)) {
                throw new KylinException(ServerErrorCode.INVALID_PARAMETER,
                        String.format(Locale.ROOT, MsgPicker.getMsg().getTableNotFound(), table));
            }
            tableDesc.put(table, DdlOperation.getTableDesc(database, table).replaceAll("\t|\r|\n", " "));
        }
        tableResponse.setDatabase(database);
        tableResponse.setTables(tableDesc);
        return tableResponse;
    }

    public boolean databaseExists(String database) {
        SparkSession sparkSession = SparderEnv.getSparkSession();
        return sparkSession.catalog().databaseExists(database);
    }

    public boolean tableExists(String database, String table) {
        SparkSession sparkSession = SparderEnv.getSparkSession();
        return sparkSession.catalog().tableExists(database, table);
    }

    public boolean hasPartition(String database, String table) {
        return DdlOperation.hasPartition(database, table);
    }

    /** Runs a metastore repair for the table and returns the repaired partitions. */
    public List<String> msck(String database, String table) {
        return DdlOperation.msck(database, table);
    }

    /**
     * Loads the bundled sample tables under a distributed lock; each sample directory
     * becomes one table, named after the file ("db.table" when the name contains a
     * dot, otherwise placed in the DEFAULT database).
     */
    public List<String> loadSamples(SparkSession ss, SaveMode mode) throws IOException, InterruptedException {
        Lock lock = KylinConfig.getInstanceFromEnv().getDistributedLockFactory().getLockForCurrentThread("samples");
        // list samples and use the file name as the table name
        List<File> fileList = listSampleFiles();
        List<String> createdTables = Lists.newArrayList();
        // fail fast instead of proceeding unlocked when the lock cannot be acquired
        if (!lock.tryLock(60, TimeUnit.SECONDS)) {
            throw new IllegalStateException("Failed to acquire the lock for loading samples within 60 seconds");
        }
        try {
            for (File file : fileList) {
                if (!file.isDirectory()) {
                    continue;
                }
                String fileName = file.getName(), table = fileName, tableName = table, db = "DEFAULT";
                if (fileName.contains(".")) {
                    String[] splits = fileName.split("\\.");
                    db = splits[0];
                    tableName = splits[1];
                    if (!ss.catalog().databaseExists(db)) {
                        ss.sql("create database if not exists " + db);
                    }
                } else {
                    table = String.format(Locale.ROOT, "%s.%s", db, table);
                }
                if (!ss.catalog().tableExists(table)) {
                    loadSamples(ss, mode, table, tableName, db, file, fileName);
                }
                createdTables.add(table);
            }
            if (ss.catalog().tableExists(TABLE_LINEORDER)) {
                ss.sql(CREATE_VIEW_P_LINEORDER);
                createdTables.add(VIEW_P_LINEORDER);
            }
            log.info("Load samples {} successfully", StringHelper.join(createdTables, ","));
        } finally {
            lock.unlock();
        }
        return createdTables;
    }

    private void loadSamples(SparkSession ss, SaveMode mode, String table, String tableName, String db, File file,
            String fileName) throws IOException {
        String filePath = file.getAbsolutePath();
        FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
        String hdfsPath = String.format(Locale.ROOT, "/tmp/%s", fileName);
        try {
            log.debug("Copy from {} to {}", filePath, hdfsPath);
            File[] parquetFiles = file.listFiles();
            if (parquetFiles != null) {
                for (File parquetFile : parquetFiles) {
                    fileSystem.copyFromLocalFile(new Path(parquetFile.getAbsolutePath()), new Path(hdfsPath));
                }
            }
            // N/A-6666, check and delete the existing table location
            String tbLocation = String.format(Locale.ROOT, "%s/%s", ss.catalog().getDatabase(db).locationUri(),
                    tableName);
            FileSystem fs = FileSystem.get(ss.sparkContext().hadoopConfiguration());
            Path path = new Path(tbLocation);
            if (fs.exists(path)) {
                log.debug("Delete existing table location {}", path);
                fs.delete(path, true);
            }
            ss.read().parquet(hdfsPath).write().mode(mode).saveAsTable(table);
        } catch (Exception e) {
            log.error("Load sample {} failed.", fileName, e);
            throw new IllegalStateException(String.format(Locale.ROOT, "Load sample %s failed", fileName), e);
        } finally {
            fileSystem.delete(new Path(hdfsPath), false);
        }
    }

    public List<String> loadSamples() throws IOException, InterruptedException {
        log.info("Start to load samples");
        SparkSession ss = SparderEnv.getSparkSession();
        return loadSamples(ss, SaveMode.Overwrite);
    }

    private List<File> listSampleFiles() {
        // samples directory relative to the working directory (build output in UT mode)
        String sampleDir = "../samples";
        if (KylinConfig.getInstanceFromEnv().isUTEnv()) {
            sampleDir = "../../build/samples";
        }
        File file = new File(sampleDir);
        log.debug("Samples file path is {}", file.getAbsolutePath());
        File[] listFiles = file.listFiles();
        if (!file.exists() || null == listFiles) {
            throw new RuntimeException("No sample data found.");
        }
        return Arrays.asList(listFiles);
    }

    @Data
    public static class ColumnModel {

        @JsonProperty("name")
        private String name;
        @JsonProperty("description")
        private String description;
        @JsonProperty("dataType")
        private String dataType;
        @JsonProperty("partition")
        private boolean partition;
    }
}