com.netflix.metacat.main.services.search.ElasticSearchMetacatRefresh

/*
 * Copyright 2016 Netflix, Inc.
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *        http://www.apache.org/licenses/LICENSE-2.0
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package com.netflix.metacat.main.services.search;

import com.facebook.presto.spi.NotFoundException;
import com.facebook.presto.spi.Pageable;
import com.facebook.presto.spi.Sort;
import com.facebook.presto.spi.SortOrder;
import com.google.common.base.Functions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.AsyncFunction;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.netflix.metacat.common.MetacatContext;
import com.netflix.metacat.common.QualifiedName;
import com.netflix.metacat.common.dto.CatalogDto;
import com.netflix.metacat.common.dto.CatalogMappingDto;
import com.netflix.metacat.common.dto.DatabaseDto;
import com.netflix.metacat.common.dto.PartitionDto;
import com.netflix.metacat.common.dto.TableDto;
import com.netflix.metacat.common.exception.MetacatNotFoundException;
import com.netflix.metacat.common.monitoring.CounterWrapper;
import com.netflix.metacat.common.monitoring.TimerWrapper;
import com.netflix.metacat.common.server.Config;
import com.netflix.metacat.common.usermetadata.UserMetadataService;
import com.netflix.metacat.common.util.MetacatContextManager;
import com.netflix.metacat.main.services.CatalogService;
import com.netflix.metacat.main.services.DatabaseService;
import com.netflix.metacat.main.services.PartitionService;
import com.netflix.metacat.main.services.TableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import static com.netflix.metacat.main.services.search.ElasticSearchDoc.Type.catalog;
import static com.netflix.metacat.main.services.search.ElasticSearchDoc.Type.database;
import static com.netflix.metacat.main.services.search.ElasticSearchDoc.Type.partition;
import static com.netflix.metacat.main.services.search.ElasticSearchDoc.Type.table;

/**
 * Refreshes all the metadata entities from the original data sources into elastic search.
 * Created by amajumdar on 8/20/15.
 */
public class ElasticSearchMetacatRefresh {
    private static final Logger log = LoggerFactory.getLogger(ElasticSearchMetacatRefresh.class);
    private static AtomicBoolean isElasticSearchMetacatRefreshAlreadyRunning = new AtomicBoolean(false);
    public static final Predicate<Object> notNull = o -> o != null;
    private String refreshMarker;
    @Inject
    CatalogService catalogService;
    @Inject
    Config config;
    @Inject
    DatabaseService databaseService;
    @Inject
    TableService tableService;
    @Inject
    PartitionService partitionService;
    @Inject
    ElasticSearchUtil elasticSearchUtil;
    @Inject
    UserMetadataService userMetadataService;
    // Fixed thread pools: 'service' fans out the metadata reads; 'esService' performs the elastic search writes
    ListeningExecutorService service;
    ListeningExecutorService esService;

    public void process(){
        List<String> catalogNames = getCatalogNamesToRefresh();
        List<QualifiedName> qNames = catalogNames.stream()
                .map(QualifiedName::ofCatalog).collect(Collectors.toList());
        _process(qNames, () -> _processCatalogs(catalogNames), "process", true, 1000);
    }

    public void processCatalogs(List<String> catalogNames){
        List<QualifiedName> qNames = catalogNames.stream()
                .map(QualifiedName::ofCatalog).collect(Collectors.toList());
        _process( qNames, () -> _processCatalogs(catalogNames), "processCatalogs", true, 1000);
    }

    public void processDatabases(String catalogName, List<String> databaseNames){
        List<QualifiedName> qNames = databaseNames.stream()
                .map(s -> QualifiedName.ofDatabase(catalogName, s)).collect(Collectors.toList());
        _process(qNames, () -> _processDatabases(QualifiedName.ofCatalog(catalogName), qNames), "processDatabases", true, 1000);
    }

    public void processPartitions(){
        List<String> catalogNames = Splitter.on(',').omitEmptyStrings().trimResults()
                .splitToList(config.getElasticSearchRefreshPartitionsIncludeCatalogs());
        List<QualifiedName> qNames = catalogNames.stream()
                .map(QualifiedName::ofCatalog).collect(Collectors.toList());
        _process( qNames, () -> _processPartitions(qNames), "processPartitions", false, 500);
    }

    private ListenableFuture<Void> _processPartitions(List<QualifiedName> qNames) {
        List<String> tables = elasticSearchUtil.getTableIdsByCatalogs(table.name(), qNames);
        List<ListenableFuture<ListenableFuture<Void>>> futures = tables.stream().map(s -> service.submit(() -> {
            QualifiedName tableName = QualifiedName.fromString(s, false);
            List<ListenableFuture<Void>> indexFutures = Lists.newArrayList();
            int offset = 0;
            int count;
            Sort sort = null;
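            // Partitions in the s3 and aegisthus catalogs are keyed by id; all other catalogs page by part_id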
            if ("s3".equals(tableName.getCatalogName()) || "aegisthus".equals(tableName.getCatalogName())) {
                sort = new Sort("id", SortOrder.ASC);
            } else {
                sort = new Sort("part_id", SortOrder.ASC);
            }
            Pageable pageable = new Pageable(10000, offset);
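            // Page through the partitions 10,000 at a time; each page is indexed in chunks of
            // 1,000 documents, and a short page (count < 10000) signals the last batch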
            do {
                List<PartitionDto> partitionDtos = partitionService.list(tableName, null, null, sort, pageable, true,
                        true, true);
                count = partitionDtos.size();
                if (!partitionDtos.isEmpty()) {
                    List<List<PartitionDto>> partitionedPartitionDtos = Lists.partition(partitionDtos, 1000);
                    partitionedPartitionDtos.forEach(
                            subPartitionsDtos -> indexFutures.add(indexPartitionDtos(tableName, subPartitionsDtos)));
                    offset = offset + count;
                    pageable.setOffset( offset);
                }
            } while (count == 10000);
            return Futures.transform(Futures.successfulAsList(indexFutures), Functions.constant((Void) null));
        })).collect(Collectors.toList());
        return Futures.transform(Futures.successfulAsList(futures),
                // successfulAsList yields null entries for failed inner futures; strip them before waiting
                (AsyncFunction<List<ListenableFuture<Void>>, Void>) input -> {
                    List<ListenableFuture<Void>> inputFuturesWithoutNulls = input.stream().filter(notNull).collect(Collectors.toList());
                    return Futures.transform(Futures.successfulAsList(inputFuturesWithoutNulls), Functions.constant(null));
                });
    }

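    /**
     * Creates a fixed thread pool with a bounded queue whose rejection handler blocks the
     * submitting thread until space frees up, throttling producers instead of dropping work.
     */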
    private static ExecutorService newFixedThreadPool(int nThreads, String threadFactoryName, int queueSize) {
        return new ThreadPoolExecutor(nThreads, nThreads,
                0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<>(queueSize),
                new ThreadFactoryBuilder()
                        .setNameFormat(threadFactoryName)
                        .build(),
                (r, executor) -> {
                    // this will block if the queue is full
                    try {
                        executor.getQueue().put(r);
                    } catch (InterruptedException e) {
                        throw Throwables.propagate(e);
                    }
                });
    }

    private void _process(List<QualifiedName> qNames, Supplier<ListenableFuture<Void>> supplier, String requestName, boolean delete, int queueSize){
        if( isElasticSearchMetacatRefreshAlreadyRunning.compareAndSet( false, true)) {
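            // Only one refresh runs at a time; the flag is reset in the finally block below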
            TimerWrapper timer = TimerWrapper.createStarted("dse.metacat.timer.ElasticSearchMetacatRefresh." + requestName);
            try {
                log.info("Start: Full refresh of metacat index in elastic search. Processing {} ...", qNames);
                MetacatContext context = new MetacatContext( "admin", "elasticSearchRefresher", null, null, null);
                MetacatContextManager.setContext(context);
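                // Entities saved during this run are stamped with this marker; any document left
                // unmarked afterwards becomes a candidate for soft deletion in deleteUnmarkedEntities()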
                refreshMarker = Instant.now().toString();
                service = MoreExecutors.listeningDecorator(newFixedThreadPool(50, "elasticsearch-refresher-%d", queueSize));
                esService = MoreExecutors.listeningDecorator(newFixedThreadPool(5, "elasticsearch-refresher-es-%d", queueSize));
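                // Wait (up to 24 hours) for the entire refresh future chain to complete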
                supplier.get().get(24, TimeUnit.HOURS);
                log.info("End: Full refresh of metacat index in elastic search");
                if( delete) {
                    deleteUnmarkedEntities(qNames);
                }
            } catch (Exception e) {
                log.error("Full refresh of metacat index failed", e);
                CounterWrapper.incrementCounter("dse.metacat.elasticSearchMetacatRefreshFailureCount");
            } finally {
                try {
                    shutdown(service);
                    shutdown(esService);
                } finally {
                    isElasticSearchMetacatRefreshAlreadyRunning.set(false);
                    log.info("### Time taken to complete {} is {} ms", requestName, timer.stop());
                }
            }

        } else {
            log.info("Full refresh of metacat index is already running.");
            CounterWrapper.incrementCounter("dse.metacat.elasticSearchMetacatRefreshAlreadyRunning");
        }
    }

    private void shutdown(ListeningExecutorService executorService) {
        if( executorService != null){
            executorService.shutdown();
            try {
                // Wait a while for existing tasks to terminate
                if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                    executorService.shutdownNow(); // Cancel currently executing tasks
                    // Wait a while for tasks to respond to being cancelled
                    if (!executorService.awaitTermination(60, TimeUnit.SECONDS))
                        log.warn("Thread pool for metacat refresh did not terminate");
                }
            } catch (InterruptedException ie) {
                // (Re-)Cancel if current thread also interrupted
                executorService.shutdownNow();
                // Preserve interrupt status
                Thread.currentThread().interrupt();
            }
        }
    }

    private void deleteUnmarkedEntities(List qNames) {
        log.info("Start: Delete unmarked entities");
        //
        // get unmarked qualified names
        // check if it not exists
        // delete
        //
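        // Refresh the elastic search index so the marker queries below see the documents just written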
        elasticSearchUtil.refresh();
        MetacatContext context = new MetacatContext("admin", "metacat-refresh", null, null, null);
        List<DatabaseDto> unmarkedDatabaseDtos = elasticSearchUtil.getQualifiedNamesByMarkerByNames("database", qNames, refreshMarker, DatabaseDto.class);
        if( !unmarkedDatabaseDtos.isEmpty()) {
            if(unmarkedDatabaseDtos.size() <= config.getElasticSearchThresholdUnmarkedDatabasesDelete()) {
                log.info("Start: Delete unmarked databases({})", unmarkedDatabaseDtos.size());
                List<String> unmarkedDatabaseNames = Lists.newArrayList();
                List<DatabaseDto> deleteDatabaseDtos = unmarkedDatabaseDtos.stream().filter(databaseDto -> {
                    boolean result = false;
                    try {
                        unmarkedDatabaseNames.add(databaseDto.getName().toString());
                        DatabaseDto dto = databaseService.get(databaseDto.getName(), false);
                        if (dto == null) {
                            result = true;
                        }
                    } catch (NotFoundException | MetacatNotFoundException ignored) {
                        result = true;
                    } catch (Exception ignored) {}
                    return result;
                }).collect(Collectors.toList());
                log.info("Unmarked databases({}): {}", unmarkedDatabaseNames.size(), unmarkedDatabaseNames);
                log.info("Deleting databases({})", deleteDatabaseDtos.size());
                if (!deleteDatabaseDtos.isEmpty()) {
                    List<QualifiedName> deleteDatabaseQualifiedNames = deleteDatabaseDtos.stream()
                            .map(DatabaseDto::getName)
                            .collect(Collectors.toList());
                    List<String> deleteDatabaseNames = deleteDatabaseQualifiedNames.stream().map(
                            QualifiedName::toString).collect(Collectors.toList());
                    log.info("Deleting databases({}): {}", deleteDatabaseNames.size(), deleteDatabaseNames);
                    userMetadataService.deleteDefinitionMetadatas(deleteDatabaseQualifiedNames);
                    elasticSearchUtil.softDelete("database", deleteDatabaseNames, context);
                }
                log.info("End: Delete unmarked databases({})", unmarkedDatabaseDtos.size());
            }else {
                log.info("Count of unmarked databases({}) is more than the threshold {}", unmarkedDatabaseDtos.size(), config.getElasticSearchThresholdUnmarkedDatabasesDelete());
                CounterWrapper.incrementCounter("dse.metacat.counter.unmarked.databases.threshold.crossed");
            }
        }

        List<TableDto> unmarkedTableDtos = elasticSearchUtil.getQualifiedNamesByMarkerByNames("table", qNames, refreshMarker, TableDto.class);
        if( !unmarkedTableDtos.isEmpty() ) {
            if(unmarkedTableDtos.size() <= config.getElasticSearchThresholdUnmarkedTablesDelete()) {
                log.info("Start: Delete unmarked tables({})", unmarkedTableDtos.size());
                List<String> unmarkedTableNames = Lists.newArrayList();
                List<TableDto> deleteTableDtos = unmarkedTableDtos.stream().filter(tableDto -> {
                    boolean result = false;
                    try {
                        unmarkedTableNames.add(tableDto.getName().toString());
                        Optional<TableDto> dto = tableService.get(tableDto.getName(), false);
                        if (!dto.isPresent()) {
                            result = true;
                        }
                    } catch (NotFoundException | MetacatNotFoundException ignored) {
                        result = true;
                    } catch (Exception ignored) {}
                    return result;
                }).collect(Collectors.toList());
                log.info("Unmarked tables({}): {}", unmarkedTableNames.size(), unmarkedTableNames);
                log.info("Deleting tables({}): {}", deleteTableDtos.size());
                if (!deleteTableDtos.isEmpty()) {
                    List<String> deleteTableNames = deleteTableDtos.stream().map(
                            dto -> dto.getName().toString()).collect(Collectors.toList());
                    log.info("Deleting tables({}): {}", deleteTableNames.size(), deleteTableNames);
                    userMetadataService.deleteMetadatas(Lists.newArrayList(deleteTableDtos), false);
                    elasticSearchUtil.softDelete("table", deleteTableNames, context);
                }
                log.info("End: Delete unmarked tables({})", unmarkedTableDtos.size());
            } else {
                log.info("Count of unmarked tables({}) is more than the threshold {}", unmarkedTableDtos.size(), config.getElasticSearchThresholdUnmarkedTablesDelete());
                CounterWrapper.incrementCounter("dse.metacat.counter.unmarked.tables.threshold.crossed");

            }
        }
        log.info("End: Delete unmarked entities");
    }

    private ListenableFuture<Void> _processCatalogs(List<String> catalogNames){
        log.info("Start: Full refresh of catalogs: {}", catalogNames);
        List<ListenableFuture<CatalogDto>> getCatalogFutures = catalogNames.stream()
                .map(catalogName -> service.submit(() -> {
                    CatalogDto result = null;
                    try {
                        result = getCatalog(catalogName);
                    } catch (Exception e) {
                        log.error("Failed to retrieve catalog: {}", catalogName);
                        elasticSearchUtil.log("ElasticSearchMetacatRefresh.getCatalog", catalog.name(), catalogName, null, e.getMessage(), e, true);
                    }
                    return result;
                }))
                .collect(Collectors.toList());
        return Futures.transform( Futures.successfulAsList(getCatalogFutures),
                (AsyncFunction<List<CatalogDto>, Void>) input -> {
                    List<ListenableFuture<Void>> processCatalogFutures = input.stream().filter(notNull).map(
                            catalogDto -> {
                                List<QualifiedName> databaseNames = getDatabaseNamesToRefresh(catalogDto);
                                return _processDatabases(catalogDto.getName(), databaseNames);
                            }).filter(notNull).collect(Collectors.toList());
                    return Futures.transform(Futures.successfulAsList(processCatalogFutures), Functions.constant(null));
                });
    }

    private List<QualifiedName> getDatabaseNamesToRefresh(CatalogDto catalogDto) {
        List<String> databasesToRefresh = catalogDto.getDatabases();
        if(!Strings.isNullOrEmpty(config.getElasticSearchRefreshIncludeDatabases())){
            List<String> refreshDatabaseNames = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(
                    config.getElasticSearchRefreshIncludeDatabases());
            databasesToRefresh = databasesToRefresh.stream()
                    .filter(refreshDatabaseNames::contains)
                    .collect(Collectors.toList());
        }
        if(!Strings.isNullOrEmpty(config.getElasticSearchRefreshExcludeDatabases())){
            List<String> excludeDatabaseNames = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(
                    config.getElasticSearchRefreshExcludeDatabases());
            databasesToRefresh = databasesToRefresh.stream()
                    .filter(databaseName -> !excludeDatabaseNames.contains(databaseName))
                    .collect(Collectors.toList());
        }
        return databasesToRefresh.stream()
                .map(s -> QualifiedName.ofDatabase(catalogDto.getName().getCatalogName(), s))
                .collect(Collectors.toList());
    }

    private List<String> getCatalogNamesToRefresh() {
        List<String> result = null;
        if(!Strings.isNullOrEmpty(config.getElasticSearchRefreshIncludeCatalogs())){
            result = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(config.getElasticSearchRefreshIncludeCatalogs());
        } else {
            result = getCatalogNames();
        }
        return result;
    }

    /**
     * Process the list of databases
     * @param catalogName catalog name
     * @param databaseNames database names
     * @return future
     */
    private ListenableFuture<Void> _processDatabases(QualifiedName catalogName, List<QualifiedName> databaseNames){
        ListenableFuture<Void> resultFuture = null;
        log.info("Full refresh of catalog {} for databases({}): {}", catalogName, databaseNames.size(), databaseNames);
        List<ListenableFuture<DatabaseDto>> getDatabaseFutures = databaseNames.stream()
                .map(databaseName -> service.submit(() -> {
                    DatabaseDto result = null;
                    try {
                        result = getDatabase( databaseName);
                    } catch (Exception e) {
                        log.error("Failed to retrieve database: {}", databaseName);
                        elasticSearchUtil.log("ElasticSearchMetacatRefresh.getDatabase", database.name(), databaseName.toString(), null, e.getMessage(), e, true);
                    }
                    return result;
                }))
                .collect(Collectors.toList());

        if( getDatabaseFutures != null && !getDatabaseFutures.isEmpty()) {
            resultFuture =  Futures.transform(Futures.successfulAsList(getDatabaseFutures),
                    (AsyncFunction<List<DatabaseDto>, Void>) input -> {
                        ListenableFuture<Void> processDatabaseFuture = indexDatabaseDtos(catalogName, input);
                        List<ListenableFuture<Void>> processDatabaseFutures = input.stream().filter(notNull)
                                .map(databaseDto -> {
                                    List<QualifiedName> tableNames = databaseDto.getTables().stream()
                                            .map(s -> QualifiedName.ofTable(databaseDto.getName().getCatalogName(),
                                                    databaseDto.getName().getDatabaseName(), s))
                                            .collect(Collectors.toList());
                                    log.info("Full refresh of database {} for tables({}): {}", databaseDto.getName().toString(), databaseDto.getTables().size(), databaseDto.getTables());
                                    return processTables(databaseDto.getName(), tableNames);
                                }).filter(notNull).collect(Collectors.toList());
                        processDatabaseFutures.add(processDatabaseFuture);
                        return Futures.transform(Futures.successfulAsList(processDatabaseFutures), Functions.constant(null));
                    });
         }

        return resultFuture;
    }

    /**
     * Saves all the given databases in the elastic search index.
     * @param catalogName catalog name
     * @param dtos database dtos
     * @return future
     */
    private ListenableFuture<Void> indexDatabaseDtos(QualifiedName catalogName, List<DatabaseDto> dtos){
        return esService.submit(() -> {
            List<ElasticSearchDoc> docs = dtos.stream()
                    .map( dto -> new ElasticSearchDoc( dto.getName().toString(), dto, "admin", false, refreshMarker))
                    .collect(Collectors.toList());
            log.info("Saving databases for catalog: {}", catalogName);
            elasticSearchUtil.save(database.name(), docs);
            return null;
        });
    }

    /**
     * Process the list of tables in batches
     * @param databaseName database name
     * @param tableNames table names
     * @return A future containing the tasks
     */
    private ListenableFuture<Void> processTables(QualifiedName databaseName, List<QualifiedName> tableNames){
        List<List<QualifiedName>> tableNamesBatches = Lists.partition( tableNames, 500);
        List<ListenableFuture<Void>> processTablesBatchFutures = tableNamesBatches.stream().map(
                subTableNames -> _processTables( databaseName, subTableNames)).collect(Collectors.toList());

        return Futures.transform(Futures.successfulAsList(processTablesBatchFutures), Functions.constant(null));
    }

    private ListenableFuture<Void> _processTables(QualifiedName databaseName, List<QualifiedName> tableNames){
        List<ListenableFuture<Optional<TableDto>>> getTableFutures = tableNames.stream()
                .map(tableName -> service.submit(() -> {
                    Optional<TableDto> result = null;
                    try {
                        result = getTable( tableName);
                    } catch (Exception e) {
                        log.error("Failed to retrieve table: {}", tableName);
                        elasticSearchUtil.log("ElasticSearchMetacatRefresh.getTable", table.name(), tableName.toString(), null, e.getMessage(), e, true);
                    }
                    return result;
                }))
                .collect(Collectors.toList());

        return Futures.transform(Futures.successfulAsList(getTableFutures),
                (AsyncFunction<List<Optional<TableDto>>, Void>) input -> indexTableDtos( databaseName, input));
    }

    /**
     * Saves all the given tables in the elastic search index.
     * @param databaseName database name
     * @param dtos table dtos
     * @return future
     */
    private ListenableFuture<Void> indexTableDtos(QualifiedName databaseName, List<Optional<TableDto>> dtos){
        return esService.submit(() -> {
            List<ElasticSearchDoc> docs = dtos.stream().filter(dto -> dto != null && dto.isPresent()).map(
                    tableDtoOptional -> {
                        TableDto dto = tableDtoOptional.get();
                        String userName = dto.getAudit() != null ? dto.getAudit().getCreatedBy()
                                : "admin";
                        return new ElasticSearchDoc(dto.getName().toString(), dto, userName, false, refreshMarker);
                    }).collect(Collectors.toList());
            log.info("Saving tables for database: {}", databaseName);
            elasticSearchUtil.save(table.name(), docs);
            return null;
        });
    }

    /**
     * Saves all the given partitions in the elastic search index.
     * @param tableName table name
     * @param dtos partition dtos
     * @return future
     */
    private ListenableFuture<Void> indexPartitionDtos(QualifiedName tableName, List<PartitionDto> dtos){
        return esService.submit(() -> {
            List<ElasticSearchDoc> docs = dtos.stream().filter(dto -> dto != null).map(
                    dto -> {
                        String userName = dto.getAudit() != null ? dto.getAudit().getCreatedBy()
                                : "admin";
                        return new ElasticSearchDoc(dto.getName().toString(), dto, userName, false, refreshMarker);
                    }).collect(Collectors.toList());
            log.info("Saving partitions for tableName: {}", tableName);
            elasticSearchUtil.save(partition.name(), docs);
            return null;
        });
    }

    protected List<String> getCatalogNames() {
        return catalogService.getCatalogNames().stream().map(CatalogMappingDto::getCatalogName).collect(
                Collectors.toList());
    }

    protected CatalogDto getCatalog(String catalogName) {
        return catalogService.get(QualifiedName.ofCatalog(catalogName));
    }

    protected DatabaseDto getDatabase(QualifiedName databaseName) {
        return databaseService.get(databaseName, true);
    }

    protected Optional<TableDto> getTable(QualifiedName tableName) {
        return tableService.get(tableName, true);
    }
}
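
Usage: a minimal sketch of how the refresher might be driven, assuming the Guice field
injection implied by the @Inject annotations above; the module name and the catalog and
database names below are hypothetical, not part of Metacat itself.

    // Hypothetical wiring; any Guice module that binds the injected services will do.
    Injector injector = Guice.createInjector(new MetacatServiceModule());
    ElasticSearchMetacatRefresh refresh = injector.getInstance(ElasticSearchMetacatRefresh.class);

    // Full refresh of every catalog returned by getCatalogNamesToRefresh(); unmarked
    // databases and tables are soft deleted afterwards (delete = true).
    refresh.process();

    // Refresh a subset of catalogs or databases ("prodhive" and "default" are examples).
    refresh.processCatalogs(Lists.newArrayList("prodhive"));
    refresh.processDatabases("prodhive", Lists.newArrayList("default"));

    // Partition refresh only covers catalogs listed in
    // config.getElasticSearchRefreshPartitionsIncludeCatalogs() and never deletes.
    refresh.processPartitions();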