All downloads are free. Search and download functionality uses the official Maven repository.

com.uber.hoodie.table.WorkloadProfile Maven / Gradle / Ivy

There is a newer version: 0.4.7
Show newest version
/*
 * Copyright (c) 2016 Uber Technologies, Inc. ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.table;


import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import scala.Option;
import scala.Tuple2;

/**
 * Information about incoming records for upsert/insert obtained either via sampling or
 * introspecting the data fully
 *
 * TODO(vc): Think about obtaining this directly from index.tagLocation
 */
public class WorkloadProfile implements Serializable {

    /**
     * Input workload
     */
    private final JavaRDD> taggedRecords;

    /**
     * Computed workload profile
     */
    private final HashMap partitionPathStatMap;


    private final WorkloadStat globalStat;


    public WorkloadProfile(JavaRDD> taggedRecords) {
        this.taggedRecords = taggedRecords;
        this.partitionPathStatMap = new HashMap<>();
        this.globalStat = new WorkloadStat();
        buildProfile();
    }

    private void buildProfile() {

        Map>, Long> partitionLocationCounts = taggedRecords
                .mapToPair(record ->
                        new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record))
                .countByKey();

        for (Map.Entry>, Long> e: partitionLocationCounts.entrySet()) {
            String partitionPath = e.getKey()._1();
            Long count = e.getValue();
            Option locOption = e.getKey()._2();

            if (!partitionPathStatMap.containsKey(partitionPath)){
                partitionPathStatMap.put(partitionPath, new WorkloadStat());
            }

            if (locOption.isDefined()) {
                // update
                partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
                globalStat.addUpdates(locOption.get(), count);
            } else {
                // insert
                partitionPathStatMap.get(partitionPath).addInserts(count);
                globalStat.addInserts(count);
            }
        }
    }

    public WorkloadStat getGlobalStat() {
        return globalStat;
    }

    public Set getPartitionPaths() {
        return partitionPathStatMap.keySet();
    }

    public WorkloadStat getWorkloadStat(String partitionPath){
        return partitionPathStatMap.get(partitionPath);
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("WorkloadProfile {");
        sb.append("globalStat=").append(globalStat).append(", ");
        sb.append("partitionStat=").append(partitionPathStatMap);
        sb.append('}');
        return sb.toString();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy