net.opentsdb.core.SaltScanner.orig

OpenTSDB is a distributed, scalable Time Series Database (TSDB) written on top of HBase. OpenTSDB was written to address a common need: store, index and serve metrics collected from computer systems (network gear, operating systems, applications) at a large scale, and make this data easily accessible and graphable.

// This file is part of OpenTSDB.
// Copyright (C) 2015  The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version.  This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
// General Public License for more details.  You should have received a copy
// of the GNU Lesser General Public License along with this program.  If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import net.opentsdb.meta.Annotation;
import net.opentsdb.query.filter.TagVFilter;
import net.opentsdb.rollup.RollupQuery;
import net.opentsdb.rollup.RollupSpan;
import net.opentsdb.stats.QueryStats;
import net.opentsdb.stats.QueryStats.QueryStat;
import net.opentsdb.uid.UniqueId;
import net.opentsdb.utils.DateTime;
import net.opentsdb.utils.JSON;

import org.hbase.async.Bytes.ByteMap;
import org.hbase.async.Bytes;
import org.hbase.async.DeleteRequest;
import org.hbase.async.KeyValue;
import org.hbase.async.Scanner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;

/**
 * A class that handles coordinating the various scanners created for each 
 * salt bucket when salting is enabled. Each scanner stores its results in 
 * local maps and once every scanner has reported in, the maps are parsed and
 * combined into a proper set of spans to return to the {@link TsdbQuery} class.
 * 
 * Note that if one or more of the scanners throws an exception, then that 
 * exception will be returned to the caller in the deferred. Unfortunately we
 * don't have a good way to cancel a scan in progress, so the first scanner
 * with an error stores it and fails the deferred immediately; the remaining
 * scanners are closed as they report back and their results are discarded.
 * 
 * Concurrency is important in this class as the scanners are executing
 * asynchronously and can modify variables at any time.
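 * 
 * <p>A minimal usage sketch (the variable names here are hypothetical; in
 * practice {@link TsdbQuery} builds one {@link Scanner} per salt bucket and a
 * span map ordered by {@link RowKey.SaltCmp}):
 * <pre>{@code
 * final TreeMap<byte[], Span> spans = 
 *     new TreeMap<byte[], Span>(new RowKey.SaltCmp());
 * final SaltScanner salt_scanner = 
 *     new SaltScanner(tsdb, metric_uid, scanners, spans, filters);
 * salt_scanner.scan().addCallback(new Callback<Object, TreeMap<byte[], Span>>() {
 *   public Object call(final TreeMap<byte[], Span> results) {
 *     // "results" is the same map as "spans", now populated
 *     return null;
 *   }
 * });
 * }</pre>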
 */
public class SaltScanner {
  private static final Logger LOG = LoggerFactory.getLogger(SaltScanner.class);
  
  /** This is a map that the caller must supply. We'll fill it with data.
   * WARNING: The salted row comparator should be applied to this map. */
  private final TreeMap<byte[], Span> spans;
  
  /** The list of pre-configured scanners. One scanner should be created per
   * salt bucket. */
  private final List<Scanner> scanners;
  
  /** Stores the compacted columns from each scanner as it completes. After all
   * scanners are done, we process this into the span map above. */
  private final Map<Integer, List<KeyValue>> kv_map = 
          new ConcurrentHashMap<Integer, List<KeyValue>>();
  
  /** Stores annotations from each scanner as it completes */
  private final Map<byte[], List<Annotation>> annotation_map = 
          Collections.synchronizedMap(
              new TreeMap<byte[], List<Annotation>>(new RowKey.SaltCmp()));
  
  /** A deferred to call with the spans on completion */
  private final Deferred<TreeMap<byte[], Span>> results = 
          new Deferred<TreeMap<byte[], Span>>();
  
  /** The metric this scanner set is dealing with. If a row comes in with a 
   * different metric we toss an exception. This shouldn't happen though. */
  private final byte[] metric;
  
  /** The TSDB to which we belong */
  private final TSDB tsdb;
  
  /** A stats object associated with the sub query used for storing stats
   * about scanner operations. */
  private final QueryStats query_stats;
  
  /** Index of the sub query in the main query list */
  private final int query_index;
  
  /** A counter used to determine how many scanners are still running */
  private AtomicInteger completed_tasks = new AtomicInteger();
  
  /** When the scanning started. We store the scan latency once all scanners
   * are done.*/
  private long start_time; // milliseconds.

  /** Whether or not to delete the queried data */
  private final boolean delete;
  
  /** A rollup query configuration if scanning for rolled up data. */
  private final RollupQuery rollup_query;
  
  /** A list of filters to iterate over when processing rows */
  private final List<TagVFilter> filters;
  
  /** A holder for storing the first exception thrown by a scanner if something
   * goes pear shaped. Make sure to synchronize on this object when checking
   * for null or assigning from a scanner's callback. */
  private volatile Exception exception;
  
  /**
   * Default ctor that performs some validation. Call {@link scan} after 
   * construction to actually start fetching data.
   * @param tsdb The TSDB to which we belong
   * @param metric The metric we're expecting to fetch
   * @param scanners A list of HBase scanners, one for each bucket
   * @param spans The span map to store results in
   * @param filters A list of filters for processing
   * @throws IllegalArgumentException if any required data was missing or
   * we had invalid parameters.
   */
  public SaltScanner(final TSDB tsdb, final byte[] metric, 
                                      final List<Scanner> scanners, 
                                      final TreeMap<byte[], Span> spans,
                                      final List<TagVFilter> filters) {
    this(tsdb, metric, scanners, spans, filters, false, null, null, 0);
  }
  
  /**
   * Full ctor that performs some validation. Call {@link scan} after 
   * construction to actually start fetching data.
   * @param tsdb The TSDB to which we belong
   * @param metric The metric we're expecting to fetch
   * @param scanners A list of HBase scanners, one for each bucket
   * @param spans The span map to store results in
   * @param delete Whether or not to delete the queried data
   * @param rollup_query An optional rollup query config. May be null.
   * @param filters A list of filters for processing
   * @param query_stats A stats object for tracking timing
   * @param query_index The index of the sub query in the main query list
   * @throws IllegalArgumentException if any required data was missing or
   * we had invalid parameters.
   */
  public SaltScanner(final TSDB tsdb, final byte[] metric, 
                                      final List<Scanner> scanners, 
                                      final TreeMap<byte[], Span> spans,
                                      final List<TagVFilter> filters,
                                      final boolean delete,
                                      final RollupQuery rollup_query,
                                      final QueryStats query_stats,
                                      final int query_index) {
    if (Const.SALT_WIDTH() < 1) {
      throw new IllegalArgumentException(
          "Salting is disabled. Use the regular scanner");
    }
    if (tsdb == null) {
      throw new IllegalArgumentException("The TSDB argument was null.");
    }
    if (spans == null) {
      throw new IllegalArgumentException("Span map cannot be null.");
    }
    if (!spans.isEmpty()) {
      throw new IllegalArgumentException("The span map should be empty.");
    }
    if (scanners == null || scanners.isEmpty()) {
      throw new IllegalArgumentException("Missing or empty scanners list. "
          + "Please provide a list of scanners for each salt.");
    }
    if (scanners.size() != Const.SALT_BUCKETS()) {
      throw new IllegalArgumentException("Not enough or too many scanners " + 
          scanners.size() + " when the salt bucket count is " + 
          Const.SALT_BUCKETS());
    }
    if (metric == null) {
      throw new IllegalArgumentException("The metric array was null.");
    }
    if (metric.length != TSDB.metrics_width()) {
      throw new IllegalArgumentException("The metric was too short. It must be " 
          + TSDB.metrics_width() + "bytes wide.");
    }
    
    this.scanners = scanners;
    this.spans = spans;
    this.metric = metric;
    this.tsdb = tsdb;
    this.filters = filters;
    this.delete = delete;
    this.rollup_query = rollup_query;
    this.query_stats = query_stats;
    this.query_index = query_index;
  }

  /**
   * Starts all of the scanners asynchronously and returns the data fetched
   * once all of the scanners have completed. Note that the result may be an
   * exception if one or more of the scanners encountered an exception. The 
   * first error will be returned, others will be logged. 
   * @return A deferred to wait on for results.
   */
  public Deferred<TreeMap<byte[], Span>> scan() {
    start_time = System.currentTimeMillis();
    int i = 0;
    for (final Scanner scanner: scanners) {
      new ScannerCB(scanner, i++).scan();
    }
    return results; 
  }

  /**
   * Called once all of the scanners have reported back in to record our
   * latency and merge the results into the spans map. If there was an exception
   * stored then we'll return that instead.
   */
  private void mergeAndReturnResults() {
    final long hbase_time = System.currentTimeMillis();
    TsdbQuery.scanlatency.add((int)(hbase_time - start_time));
    long rows = 0;

    if (exception != null) {
      LOG.error("After all of the scanners finished, at "
          + "least one threw an exception", exception);
      results.callback(exception);
      return;
    }
    
    // Merge sorted spans together
    final long merge_start = DateTime.nanoTime();
    for (final List<KeyValue> kvs : kv_map.values()) {
      if (kvs == null || kvs.isEmpty()) {
        LOG.warn("Found a key value list that was null or empty");
        continue;
      }
      
      for (final KeyValue kv : kvs) {
        if (kv == null) {
          LOG.warn("Found a key value item that was null");
          continue;
        }
        if (kv.key() == null) {
          LOG.warn("A key for a kv was null");
          continue;
        }

        Span datapoints = spans.get(kv.key());
        if (datapoints == null) {
          datapoints = RollupQuery.isValidQuery(rollup_query) ?
              new RollupSpan(tsdb, this.rollup_query) : new Span(tsdb);
          spans.put(kv.key(), datapoints);
        }

        if (annotation_map.containsKey(kv.key())) {
          for (final Annotation note: annotation_map.get(kv.key())) {
            datapoints.getAnnotations().add(note);
          }
          annotation_map.remove(kv.key());
        }
        try {  
          datapoints.addRow(kv);
          rows++;
        } catch (RuntimeException e) {
          LOG.error("Exception adding row to span", e);
          throw e;
        }
      }
    }
     
    kv_map.clear();

    for (final byte[] key : annotation_map.keySet()) {
      Span datapoints = spans.get(key);
      if (datapoints == null) {
        datapoints = new Span(tsdb);
        spans.put(key, datapoints);
      }

      for (final Annotation note: annotation_map.get(key)) {
        datapoints.getAnnotations().add(note);
      }
    }

    if (query_stats != null) {
      query_stats.addStat(query_index, QueryStat.SCANNER_MERGE_TIME, 
          (DateTime.nanoTime() - merge_start));
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Scanning completed in " + (hbase_time - start_time) + " ms, " +
            rows + " rows, and stored in " + spans.size() + " spans");
      LOG.debug("It took " + (System.currentTimeMillis() - hbase_time) + " ms, "
            + " to merge and sort the rows into a tree map");
    }

    results.callback(spans);
  }

  /**
  * Scanner callback executed recursively each time we get a set of data
  * from storage. This is responsible for determining what columns are
  * returned and issuing requests to load leaf objects.
  * When the scanner returns a null set of rows, the method initiates the
  * final callback.
  */
  final class ScannerCB implements Callback<Object, ArrayList<ArrayList<KeyValue>>> {
    private final Scanner scanner;
    private final int index;
    private final List<KeyValue> kvs = new ArrayList<KeyValue>();
    private final ByteMap<List<Annotation>> annotations = 
            new ByteMap<List<Annotation>>();
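    /** TSUID caches used to avoid re-resolving tag UIDs on every row: TSUIDs
     * whose tags failed a filter land in {@code skips}, matches in 
     * {@code keepers}. */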
    private final Set<String> skips = Collections.newSetFromMap(
        new ConcurrentHashMap<String, Boolean>());
    private final Set<String> keepers = Collections.newSetFromMap(
        new ConcurrentHashMap<String, Boolean>());
    
    private long scanner_start = -1;
    /** nanosecond timestamps */
    private long fetch_start = 0;      // reset each time we send an RPC to HBase
    private long fetch_time = 0;       // cumulation of time waiting on HBase
    private long uid_resolve_time = 0; // cumulation of time resolving UIDs
    private long uids_resolved = 0; 
    private long compaction_time = 0;  // cumulation of time compacting
    private long dps_pre_filter = 0;
    private long rows_pre_filter = 0;
    private long dps_post_filter = 0;
    private long rows_post_filter = 0;
    
    public ScannerCB(final Scanner scanner, final int index) {
      this.scanner = scanner;
      this.index = index;
      if (query_stats != null) {
        query_stats.addScannerId(query_index, index, scanner.toString());
      }
    }
    
    /** Error callback that will capture an exception from AsyncHBase and store
     * it so we can bubble it up to the caller.
     */
    class ErrorCb implements Callback<Object, Exception> {
      @Override
      public Object call(final Exception e) throws Exception {
        LOG.error("Scanner " + scanner + " threw an exception", e);
        close(false);
        handleException(e);
        return null;
      }
    }
    
    /**
    * Starts the scanner and is called recursively to fetch the next set of
    * rows from the scanner.
    * @return The map of spans if loaded successfully, null if no data was
    * found
    */
    public Object scan() {
      if (scanner_start < 0) {
        scanner_start = DateTime.nanoTime();
      }
      fetch_start = DateTime.nanoTime();
      return scanner.nextRows().addCallback(this).addErrback(new ErrorCb());
    }

    /**
    * Iterate through each row of the scanner results, parses out data
    * points (and optional meta data).
    * @return null if no rows were found, otherwise the TreeMap with spans
    */
    @Override
    public Object call(final ArrayList<ArrayList<KeyValue>> rows) 
            throws Exception {
      try {
        fetch_time += DateTime.nanoTime() - fetch_start;
        if (rows == null) {
          close(true);
          return null;
        } else if (exception != null) {
          close(false);
          // don't need to handleException here as it's already taken care of
          // due to the fact that exception was set.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Closing scanner as there was an exception: " + scanner);
          }
          return null;
        }

        // used for UID resolution if a filter is involved
        final List<Deferred<Object>> lookups = 
            filters != null && !filters.isEmpty() ? 
                new ArrayList<Deferred<Object>>(rows.size()) : null;
        
        rows_pre_filter += rows.size();
        for (final ArrayList<KeyValue> row : rows) {
          final byte[] key = row.get(0).key();
          if (RowKey.rowKeyContainsMetric(metric, key) != 0) {
            close(false);
            handleException(new IllegalDataException(
                   "HBase returned a row that doesn't match"
                   + " our scanner (" + scanner + ")! " + row + " does not start"
                   + " with " + Arrays.toString(metric) + " on scanner " + this));
            return null;
          }

          // calculate estimated data point count. We don't want to deserialize
          // the byte arrays so we'll just get a rough estimate of compacted
          // columns.
          for (final KeyValue kv : row) {
            if (kv.qualifier().length % 2 == 0) {
              if (kv.qualifier().length == 2 || kv.qualifier().length == 4) {
                ++dps_pre_filter;
              } else {
                // for now we'll assume that all compacted columns are of the 
                // same precision. This is likely incorrect.
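                // Second-precision offsets use 2-byte qualifiers while
                // millisecond-precision offsets use 4 bytes, hence the
                // length / 4 vs length / 2 estimates below.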
                if (Internal.inMilliseconds(kv.qualifier())) {
                  dps_pre_filter += (kv.qualifier().length / 4);
                } else {
                  dps_pre_filter += (kv.qualifier().length / 2);
                }
              }
            } else if (kv.qualifier()[0] == AppendDataPoints.APPEND_COLUMN_PREFIX) {
              // with appends we don't have a good rough estimate as the length
              // can vary widely with the value length variability. Therefore we
              // have to iterate.
              int idx = 0;
              int qlength = 0;
              while (idx < kv.value().length) {
                qlength = Internal.getQualifierLength(kv.value(), idx);
                idx += qlength + Internal.getValueLengthFromQualifier(kv.value(), idx);
                ++dps_pre_filter;
              }
            }
          }
          
          // If any filters have made it this far then we need to resolve
          // the row key UIDs to their names for string comparison. We'll
          // try to avoid the resolution with some sets but we may dupe
          // resolve a few times.
          // TODO - more efficient resolution
          // TODO - byte set instead of a string for the uid may be faster
          if (filters != null && !filters.isEmpty()) {
            lookups.clear();
            final String tsuid = 
                UniqueId.uidToString(UniqueId.getTSUIDFromKey(key, 
                TSDB.metrics_width(), Const.TIMESTAMP_BYTES));
            if (skips.contains(tsuid)) {
              continue;
            }
            if (!keepers.contains(tsuid)) {
              final long uid_start = DateTime.nanoTime();
              
              /** CB called after all of the UIDs have been resolved */
              class MatchCB implements Callback<Object, ArrayList<Boolean>> {
                @Override
                public Object call(final ArrayList<Boolean> matches) 
                    throws Exception {
                  for (final boolean matched : matches) {
                    if (!matched) {
                      skips.add(tsuid);
                      return null;
                    }
                  }
                  // matched all, good data
                  keepers.add(tsuid);
                  processRow(key, row);
                  return null;
                }
              }

              /** Resolves all of the row key UIDs to their strings for filtering */
              class GetTagsCB implements
                  Callback<Deferred<ArrayList<Boolean>>, Map<String, String>> {
                @Override
                public Deferred<ArrayList<Boolean>> call(
                    final Map<String, String> tags) throws Exception {
                  uid_resolve_time += (DateTime.nanoTime() - uid_start);
                  uids_resolved += tags.size();
                  final List<Deferred<Boolean>> matches =
                      new ArrayList<Deferred<Boolean>>(filters.size());

                  for (final TagVFilter filter : filters) {
                    matches.add(filter.match(tags));
                  }
                  
                  return Deferred.group(matches);
                }
              }
 
              lookups.add(Tags.getTagsAsync(tsdb, key)
                  .addCallbackDeferring(new GetTagsCB())
                  .addBoth(new MatchCB()));
            } else {
              processRow(key, row);
            }
          } else {
            processRow(key, row);
          }
        }
           
        // either we need to wait on the UID resolutions or we can go ahead
        // if we don't have filters.
        if (lookups != null && lookups.size() > 0) {
          class GroupCB implements Callback<Object, ArrayList<Object>> {
            @Override
            public Object call(final ArrayList<Object> group) throws Exception {
              return scan();
            }
          }
          return Deferred.group(lookups).addCallback(new GroupCB());
        } else {
          return scan();
        }
      } catch (final RuntimeException e) {
        LOG.error("Unexpected exception on scanner " + this, e);
        close(false);
        handleException(e);
        return null;
      }
    }
    
    /**
     * Finds or creates the span for this row, compacts it and stores it. Also
     * fires off a delete request for the row if told to.
     * @param key The row key to use for fetching the span
     * @param row The row to add
     */
    void processRow(final byte[] key, final ArrayList<KeyValue> row) {
      ++rows_post_filter;
      if (delete) {
        final DeleteRequest del = new DeleteRequest(tsdb.dataTable(), key);
        tsdb.getClient().delete(del);
      }
      
      //TODO rollup doesn't use the column qualifier prefix right now.
      //Please move this logic to the CompactionQueue.compact API if the 
      //qualifier prefix is ever set for rollups. Right now there is no way to
      //tell whether a cell belongs to the rollup table or the default data
      //table from the KeyValue/HBase cell object.
      if (RollupQuery.isValidQuery(rollup_query)) {
        //This is a rollup query result and rollup cells are never compacted,
        //so we don't need to worry about complex or trivial compactions. Each
        //cell can simply be treated as a separate key value.
        for (final KeyValue kv : row) {
          final byte[] qual = kv.qualifier();
          
          if (qual.length > 0) {
            // TODO: Bug! We shouldn't use only the first byte to determine the
            // type of this cell. Instead we should parse the qualifier to find
            // the suffix and determine the actual type.
            if (qual[0] == Annotation.PREFIX()) {
              // This could be a row with only an annotation in it
              final Annotation note = JSON.parseToObject(kv.value(),
                      Annotation.class);
              synchronized (annotations) {
                List<Annotation> map_notes = annotations.get(key);
                if (map_notes == null) {
                  map_notes = new ArrayList<Annotation>();
                  annotations.put(key, map_notes);
                }
                map_notes.add(note);
              }
            } else {
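              // AVG and DEV are computed at query time from the stored SUM and
              // COUNT rollup cells, so both must be collected here.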
              if (rollup_query.getRollupAgg() == Aggregators.AVG || 
                  rollup_query.getRollupAgg() == Aggregators.DEV) {
                if (Bytes.memcmp(RollupQuery.SUM, qual, 0, RollupQuery.SUM.length) == 0 ||
                    Bytes.memcmp(RollupQuery.COUNT, qual, 0, RollupQuery.COUNT.length) == 0) {
                  kvs.add(kv);
                }
              } else if (Bytes.memcmp(rollup_query.getRollupAggPrefix(), 
                  qual, 0, rollup_query.getRollupAggPrefix().length) == 0) {
                kvs.add(kv);
              }
            }
          }
        } // end for
      } else {
        // calculate estimated data point count. We don't want to deserialize
        // the byte arrays so we'll just get a rough estimate of compacted
        // columns.
        for (final KeyValue kv : row) {
          if (kv.qualifier().length % 2 == 0) {
            if (kv.qualifier().length == 2 || kv.qualifier().length == 4) {
              ++dps_post_filter;
            } else {
              // for now we'll assume that all compacted columns are of the 
              // same precision. This is likely incorrect.
              if (Internal.inMilliseconds(kv.qualifier())) {
                dps_post_filter += (kv.qualifier().length / 4);
              } else {
                dps_post_filter += (kv.qualifier().length / 2);
              }
            }
          } else if (kv.qualifier()[0] == AppendDataPoints.APPEND_COLUMN_PREFIX) {
            // with appends we don't have a good rough estimate as the length
            // can vary widely with the value length variability. Therefore we
            // have to iterate.
            int idx = 0;
            int qlength = 0;
            while (idx < kv.value().length) {
              qlength = Internal.getQualifierLength(kv.value(), idx);
              idx += qlength + Internal.getValueLengthFromQualifier(kv.value(), idx);
              ++dps_post_filter;
            }
          }
        }
        
        final KeyValue compacted;
        // let IllegalDataExceptions bubble up so the handler above can close
        // the scanner
        final long compaction_start = DateTime.nanoTime();
        try {
          final List<Annotation> notes = Lists.newArrayList();
          compacted = tsdb.compact(row, notes);
          if (!notes.isEmpty()) {
            synchronized (annotations) {
              List<Annotation> map_notes = annotations.get(key);
              if (map_notes == null) {
                annotations.put(key, notes);
              } else {
                map_notes.addAll(notes);
              }
            }
          }
        } catch (IllegalDataException idex) {
          compaction_time += (DateTime.nanoTime() - compaction_start);
          close(false);
          handleException(idex);
          return;
        }
        compaction_time += (DateTime.nanoTime() - compaction_start);
        if (compacted != null) { // Can be null if we ignored all KVs.
          kvs.add(compacted);
        }
      }
    }
  
    /**
     * Closes the scanner and sets the various stats after filtering
     * @param ok Whether the scanner closed cleanly (e.g. it ran out of data or
     * we wanted to stop it early) as opposed to closing due to an exception.
     */
    void close(final boolean ok) {
      scanner.close();
      
      if (query_stats != null) {
        query_stats.addScannerStat(query_index, index, QueryStat.SCANNER_TIME, 
            DateTime.nanoTime() - scanner_start);

        // Scanner Stats
        /* Uncomment when AsyncHBase has this feature:
        query_stats.addScannerStat(query_index, index, 
            QueryStat.ROWS_FROM_STORAGE, scanner.getRowsFetched());
        query_stats.addScannerStat(query_index, index, 
            QueryStat.COLUMNS_FROM_STORAGE, scanner.getColumnsFetched());
        query_stats.addScannerStat(query_index, index, 
            QueryStat.BYTES_FROM_STORAGE, scanner.getBytesFetched()); */
        query_stats.addScannerStat(query_index, index, 
            QueryStat.HBASE_TIME, fetch_time);
        query_stats.addScannerStat(query_index, index, 
            QueryStat.SUCCESSFUL_SCAN, ok ? 1 : 0);
        
        // Post Scan stats
        query_stats.addScannerStat(query_index, index, 
            QueryStat.ROWS_PRE_FILTER, rows_pre_filter);
        query_stats.addScannerStat(query_index, index,
            QueryStat.DPS_PRE_FILTER, dps_pre_filter);
        query_stats.addScannerStat(query_index, index, 
            QueryStat.ROWS_POST_FILTER, rows_post_filter);
        query_stats.addScannerStat(query_index, index,
            QueryStat.DPS_POST_FILTER, dps_post_filter);
        query_stats.addScannerStat(query_index, index, 
            QueryStat.SCANNER_UID_TO_STRING_TIME, uid_resolve_time);
        query_stats.addScannerStat(query_index, index, 
            QueryStat.UID_PAIRS_RESOLVED, uids_resolved);
        query_stats.addScannerStat(query_index, index, 
            QueryStat.COMPACTION_TIME, compaction_time);
      }
      
      if (ok && exception == null) {
        validateAndTriggerCallback(kvs, annotations);
      } else {
        completed_tasks.incrementAndGet();
      }
    }
  }
  
  /**
   * Called each time a scanner completes with valid or empty data.
   * @param kvs The compacted columns fetched by the scanner
   * @param annotations The annotations fetched by the scanners
   */
  private void validateAndTriggerCallback(final List<KeyValue> kvs, 
          final Map<byte[], List<Annotation>> annotations) {

    final int tasks = completed_tasks.incrementAndGet();
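    // The incremented count is unique to this scanner, so it doubles as the
    // key under which this scanner's compacted columns are stored.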
    if (kvs.size() > 0) {
      kv_map.put(tasks, kvs);
    }
    
    for (final byte[] key : annotations.keySet()) {
      final List<Annotation> notes = annotations.get(key);
      if (notes.size() > 0) {
        // Optimistic write, expecting unique row keys
        annotation_map.put(key, notes);
      }
    }
    
    if (tasks >= Const.SALT_BUCKETS()) {
      try {
        mergeAndReturnResults();
      } catch (final Exception ex) {
        results.callback(ex);
      }
    }
  }

  /**
   * If one or more of the scanners throws an exception then we should close it
   * and pass the exception here so that we can catch and return it to the
   * caller. If all of the scanners have finished, this will callback to the 
   * caller immediately.
   * @param e The exception to store.
   */
  private void handleException(final Exception e) {
    // make sure only one scanner can set the exception
    completed_tasks.incrementAndGet();
    if (exception == null) {
      synchronized (this) {
        if (exception == null) {
          exception = e;
          // fail once and fast on the first scanner to throw an exception
          try {
            mergeAndReturnResults();
          } catch (Exception ex) {
            LOG.error("Failed merging and returning results, "
                + "calling back with exception", ex);
            results.callback(ex);
          }
        } else {
          // TODO - it would be nice to close and cancel the other scanners but
          // for now we have to wait for them to finish and/or throw exceptions.
          LOG.error("Another scanner threw an exception", e);
        }
      }
    }
  }
}