All downloads are free. The search and download features use the official Maven repository.

org.apache.accumulo.manager.TabletGroupWatcher Maven / Gradle / Ivy

Go to download

The manager server for Apache Accumulo for load balancing and other system-wide operations.

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.accumulo.manager;

import static java.lang.Math.min;
import static org.apache.accumulo.core.util.UtilWaitThread.sleepUninterruptibly;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;

import org.apache.accumulo.core.client.AccumuloClient;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.client.RowIterator;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.gc.ReferenceFile;
import org.apache.accumulo.core.logging.TabletLogger;
import org.apache.accumulo.core.manager.state.tables.TableState;
import org.apache.accumulo.core.manager.thrift.ManagerState;
import org.apache.accumulo.core.master.thrift.TabletServerStatus;
import org.apache.accumulo.core.metadata.MetadataTable;
import org.apache.accumulo.core.metadata.RootTable;
import org.apache.accumulo.core.metadata.StoredTabletFile;
import org.apache.accumulo.core.metadata.TServerInstance;
import org.apache.accumulo.core.metadata.TabletLocationState;
import org.apache.accumulo.core.metadata.TabletLocationState.BadLocationStateException;
import org.apache.accumulo.core.metadata.TabletState;
import org.apache.accumulo.core.metadata.schema.Ample;
import org.apache.accumulo.core.metadata.schema.Ample.DataLevel;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ChoppedColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.CurrentLocationColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.DataFileColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ExternalCompactionColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.FutureLocationColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ServerColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.TabletColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataTime;
import org.apache.accumulo.core.metadata.schema.TabletMetadata.Location;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException;
import org.apache.accumulo.core.util.threads.Threads.AccumuloDaemonThread;
import org.apache.accumulo.manager.Manager.TabletGoalState;
import org.apache.accumulo.manager.state.MergeStats;
import org.apache.accumulo.manager.state.TableCounts;
import org.apache.accumulo.manager.state.TableStats;
import org.apache.accumulo.server.ServerContext;
import org.apache.accumulo.server.conf.TableConfiguration;
import org.apache.accumulo.server.gc.AllVolumesDirectory;
import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.log.WalStateManager.WalMarkerException;
import org.apache.accumulo.server.manager.LiveTServerSet.TServerConnection;
import org.apache.accumulo.server.manager.state.Assignment;
import org.apache.accumulo.server.manager.state.ClosableIterator;
import org.apache.accumulo.server.manager.state.DistributedStoreException;
import org.apache.accumulo.server.manager.state.MergeInfo;
import org.apache.accumulo.server.manager.state.MergeState;
import org.apache.accumulo.server.manager.state.TabletStateStore;
import org.apache.accumulo.server.manager.state.UnassignedTablet;
import org.apache.accumulo.server.tablets.TabletTime;
import org.apache.accumulo.server.util.MetadataTableUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.thrift.TException;

import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Iterators;

abstract class TabletGroupWatcher extends AccumuloDaemonThread {
  // Constants used to make sure assignment logging isn't excessive in quantity or size
  // NOTE(review): no constants are declared directly below this comment; it may be a leftover
  // from an earlier revision - confirm.

  // Manager on whose behalf this watcher works (source of tserver set, migrations, merges)
  private final Manager manager;
  // Store whose tablet location states are iterated by run()
  private final TabletStateStore store;
  // Another watcher whose stats are consulted while all tservers are being shut down; may be null
  private final TabletGroupWatcher dependentWatcher;
  // Per-table tablet state counts, updated by run() for every tablet it visits
  final TableStats stats = new TableStats();
  // Tserver set recorded by run(); read and written while holding this object's monitor
  private SortedSet lastScanServers = Collections.emptySortedSet();

  /**
   * Creates a watcher over the given store. The string {@code "Watching " + store.name()} is
   * handed to the superclass constructor.
   *
   * @param manager the manager this watcher works for
   * @param store the tablet state store this watcher scans
   * @param dependentWatcher another watcher consulted when every tablet server is being shut
   *        down; may be {@code null}
   */
  TabletGroupWatcher(Manager manager, TabletStateStore store, TabletGroupWatcher dependentWatcher) {
    super("Watching " + store.name());
    this.dependentWatcher = dependentWatcher;
    this.store = store;
    this.manager = manager;
  }

  /**
   * Should this {@code TabletGroupWatcher} suspend tablets?
   *
   * @return {@code true} if this watcher should suspend tablets, {@code false} otherwise
   */
  abstract boolean canSuspendTablets();

  /**
   * Returns whatever {@link TableStats#getLast()} currently reports for this watcher.
   *
   * @return the map handed back by the stats object
   */
  Map getStats() {
    final Map last = stats.getLast();
    return last;
  }

  /**
   * Returns whatever {@link TableStats#getLast(TableId)} currently reports for one table.
   *
   * @param tableId the table to look up
   * @return the counts handed back by the stats object for {@code tableId}
   */
  TableCounts getStats(TableId tableId) {
    final TableCounts countsForTable = stats.getLast(tableId);
    return countsForTable;
  }

  /**
   * Compares a set of live tservers with the tserver set most recently recorded by this watcher's
   * run loop. Runs while holding this object's monitor, the same monitor used when the recorded
   * set is replaced.
   *
   * @param candidates the tserver set to compare
   * @return {@code true} when {@code candidates} equals the recorded set
   */
  synchronized boolean isSameTserversAsLastScan(Set candidates) {
    final SortedSet recorded = lastScanServers;
    return candidates.equals(recorded);
  }

  /**
   * Collection of data structures used to track Tablet assignments
   */
  private static class TabletLists {
    private final List assignments = new ArrayList<>();
    private final List assigned = new ArrayList<>();
    private final List assignedToDeadServers = new ArrayList<>();
    private final List suspendedToGoneServers = new ArrayList<>();
    private final Map unassigned = new HashMap<>();
    private final Map> logsForDeadServers = new TreeMap<>();
    // read only list of tablet servers that are not shutting down
    private final SortedMap destinations;

    public TabletLists(Manager m, SortedMap curTServers) {
      var destinationsMod = new TreeMap<>(curTServers);
      destinationsMod.keySet().removeAll(m.serversToShutdown);
      this.destinations = Collections.unmodifiableSortedMap(destinationsMod);
    }

    public void reset() {
      assignments.clear();
      assigned.clear();
      assignedToDeadServers.clear();
      suspendedToGoneServers.clear();
      unassigned.clear();
    }
  }

  /**
   * Main loop of the watcher. While the manager reports it is still the manager, each iteration
   * walks every tablet in the store, compares the tablet's state with its goal state, and queues
   * or issues the work needed to move it toward that goal (assignments, unload requests, log
   * recovery, chop/split requests for merges). Queued changes are flushed in chunks, stats are
   * published, merge states are advanced, and the loop then waits for the next event or timeout.
   */
  @Override
  public void run() {
    // NOTE(review): several declarations in this method use raw types (ClosableIterator, Map,
    // SortedMap, Set); generic type arguments look like they were lost - confirm against the
    // upstream file.
    // per-state tablet counts from the previous pass, used to report only counts that changed
    int[] oldCounts = new int[TabletState.values().length];
    EventCoordinator.Listener eventListener = this.manager.nextEvent.getListener();

    WalStateManager wals = new WalStateManager(manager.getContext());

    while (manager.stillManager()) {
      // slow things down a little, otherwise we spam the logs when there are many wake-up events
      sleepUninterruptibly(100, TimeUnit.MILLISECONDS);

      final long waitTimeBetweenScans = manager.getConfiguration()
          .getTimeInMillis(Property.MANAGER_TABLET_GROUP_WATCHER_INTERVAL);

      // totalUnloaded counts unload requests for the whole pass; unloaded is reset at each flush
      int totalUnloaded = 0;
      int unloaded = 0;
      ClosableIterator iter = null;
      try {
        // mergeStatsCache holds the MergeStats actually used per table during this pass;
        // currentMerges holds one MergeStats per merge that has an extent
        Map mergeStatsCache = new HashMap<>();
        Map currentMerges = new HashMap<>();
        for (MergeInfo merge : manager.merges()) {
          if (merge.getExtent() != null) {
            currentMerges.put(merge.getExtent().tableId(), new MergeStats(merge));
          }
        }

        // Get the current status for the current list of tservers
        SortedMap currentTServers = new TreeMap<>();
        for (TServerInstance entry : manager.tserverSet.getCurrentServers()) {
          currentTServers.put(entry, manager.tserverStatus.get(entry));
        }

        // With no tservers there is nothing to work on: wait, record an empty set, and retry
        if (currentTServers.isEmpty()) {
          eventListener.waitForEvents(waitTimeBetweenScans);
          synchronized (this) {
            lastScanServers = Collections.emptySortedSet();
          }
          continue;
        }

        TabletLists tLists = new TabletLists(manager, currentTServers);

        ManagerState managerState = manager.getManagerState();
        int[] counts = new int[TabletState.values().length];
        stats.begin();
        // Walk through the tablets in our store, and work tablets
        // towards their goal
        iter = store.iterator();
        while (iter.hasNext()) {
          TabletLocationState tls = iter.next();
          if (tls == null) {
            continue;
          }

          // ignore entries for tables that do not exist in zookeeper
          if (manager.getTableManager().getTableState(tls.extent.tableId()) == null) {
            continue;
          }

          // Don't overwhelm the tablet servers with work
          if (tLists.unassigned.size() + unloaded
              > Manager.MAX_TSERVER_WORK_CHUNK * currentTServers.size()) {
            flushChanges(tLists, wals);
            tLists.reset();
            unloaded = 0;
            eventListener.waitForEvents(waitTimeBetweenScans);
          }
          TableId tableId = tls.extent.tableId();
          TableConfiguration tableConf = manager.getContext().getTableConfiguration(tableId);

          // Tables without a current merge get stats built around an empty MergeInfo
          MergeStats mergeStats = mergeStatsCache.computeIfAbsent(tableId, k -> {
            var mStats = currentMerges.get(k);
            return mStats != null ? mStats : new MergeStats(new MergeInfo());
          });
          TabletGoalState goal = manager.getGoalState(tls, mergeStats.getMergeInfo());
          Location location = tls.getLocation();
          TabletState state = tls.getState(currentTServers.keySet());

          TabletLogger.missassigned(tls.extent, goal.toString(), state.toString(),
              tls.getFutureServer(), tls.getCurrentServer(), tls.walogs.size());

          stats.update(tableId, state);
          mergeStats.update(tls.extent, state, tls.chopped, !tls.walogs.isEmpty());
          sendChopRequest(mergeStats.getMergeInfo(), state, tls);
          sendSplitRequest(mergeStats.getMergeInfo(), state, tls);

          // Always follow through with assignments
          if (state == TabletState.ASSIGNED) {
            goal = TabletGoalState.HOSTED;
          }
          if (Manager.log.isTraceEnabled()) {
            Manager.log.trace(
                "[{}] Shutting down all Tservers: {}, dependentCount: {} Extent: {}, state: {}, goal: {}",
                store.name(), manager.serversToShutdown.equals(currentTServers.keySet()),
                dependentWatcher == null ? "null" : dependentWatcher.assignedOrHosted(), tls.extent,
                state, goal);
          }

          // if we are shutting down all the tabletservers, we have to do it in order
          if ((goal == TabletGoalState.SUSPENDED && state == TabletState.HOSTED)
              && manager.serversToShutdown.equals(currentTServers.keySet())) {
            if (dependentWatcher != null) {
              // If the dependentWatcher is for the user tables, check to see
              // that user tables exist.
              DataLevel dependentLevel = dependentWatcher.store.getLevel();
              boolean userTablesExist = true;
              switch (dependentLevel) {
                case USER:
                  Set onlineTables = manager.onlineTables();
                  onlineTables.remove(RootTable.ID);
                  onlineTables.remove(MetadataTable.ID);
                  userTablesExist = !onlineTables.isEmpty();
                  break;
                case METADATA:
                case ROOT:
                default:
                  break;
              }
              // If the stats object in the dependentWatcher is empty, then it
              // currently does not have data about what is hosted or not. In
              // that case host these tablets until the dependent watcher can
              // gather some data.
              final Map stats = dependentWatcher.getStats();
              if (dependentLevel == DataLevel.USER) {
                if (userTablesExist
                    && (stats == null || stats.isEmpty() || assignedOrHosted(stats) > 0)) {
                  goal = TabletGoalState.HOSTED;
                }
              } else if (stats == null || stats.isEmpty() || assignedOrHosted(stats) > 0) {
                goal = TabletGoalState.HOSTED;
              }
            }
          }

          if (goal == TabletGoalState.HOSTED) {
            // A non-hosted tablet with walogs goes through recoverLogs first; when that call
            // returns true the tablet is skipped for this pass (and not counted below)
            if ((state != TabletState.HOSTED && !tls.walogs.isEmpty())
                && manager.recoveryManager.recoverLogs(tls.extent, tls.walogs)) {
              continue;
            }
            switch (state) {
              case HOSTED:
                // Tablet is hosted on the server its migration pointed at: drop the migration
                if (location.getServerInstance().equals(manager.migrations.get(tls.extent))) {
                  manager.migrations.remove(tls.extent);
                }
                break;
              case ASSIGNED_TO_DEAD_SERVER:
                hostDeadTablet(tLists, tls, location, wals);
                break;
              case SUSPENDED:
                hostSuspendedTablet(tLists, tls, location, tableConf);
                break;
              case UNASSIGNED:
                hostUnassignedTablet(tLists, tls.extent, new UnassignedTablet(location, tls.last));
                break;
              case ASSIGNED:
                // Send another reminder
                tLists.assigned.add(new Assignment(tls.extent, tls.getFutureServer(), tls.last));
                break;
            }
          } else {
            // Goal is something other than HOSTED: work the tablet away from its server
            switch (state) {
              case SUSPENDED:
                // Request a move to UNASSIGNED, so as to allow balancing to continue.
                tLists.suspendedToGoneServers.add(tls);
                cancelOfflineTableMigrations(tls.extent);
                break;
              case UNASSIGNED:
                cancelOfflineTableMigrations(tls.extent);
                break;
              case ASSIGNED_TO_DEAD_SERVER:
                unassignDeadTablet(tLists, tls, wals);
                break;
              case HOSTED:
                TServerConnection client =
                    manager.tserverSet.getConnection(location.getServerInstance());
                if (client != null) {
                  Manager.log.trace("[{}] Requesting TabletServer {} unload {} {}", store.name(),
                      location.getServerInstance(), tls.extent, goal.howUnload());
                  client.unloadTablet(manager.managerLock, tls.extent, goal.howUnload(),
                      manager.getSteadyTime());
                  unloaded++;
                  totalUnloaded++;
                } else {
                  Manager.log.warn("Could not connect to server {}", location);
                }
                break;
              case ASSIGNED:
                break;
            }
          }
          counts[state.ordinal()]++;
        }

        flushChanges(tLists, wals);

        // provide stats after flushing changes to avoid race conditions w/ delete table
        stats.end(managerState);
        Manager.log.trace("[{}] End stats collection: {}", store.name(), stats);

        // Report changes
        for (TabletState state : TabletState.values()) {
          int i = state.ordinal();
          if (counts[i] > 0 && counts[i] != oldCounts[i]) {
            manager.nextEvent.event("[%s]: %d tablets are %s", store.name(), counts[i],
                state.name());
          }
        }
        Manager.log.debug(String.format("[%s]: scan time %.2f seconds", store.name(),
            stats.getScanTime() / 1000.));
        oldCounts = counts;
        if (totalUnloaded > 0) {
          manager.nextEvent.event("[%s]: %d tablets unloaded", store.name(), totalUnloaded);
        }

        updateMergeState(mergeStatsCache);

        // Record the tserver set this pass worked with, for isSameTserversAsLastScan
        synchronized (this) {
          lastScanServers = ImmutableSortedSet.copyOf(currentTServers.keySet());
        }
        // Only wait when the live tserver set is unchanged; otherwise loop again right away
        if (manager.tserverSet.getCurrentServers().equals(currentTServers.keySet())) {
          Manager.log.debug(String.format("[%s] sleeping for %.2f seconds", store.name(),
              waitTimeBetweenScans / 1000.));
          eventListener.waitForEvents(waitTimeBetweenScans);
        } else {
          Manager.log.info("Detected change in current tserver set, re-running state machine.");
        }
      } catch (Exception ex) {
        Manager.log.error("Error processing table state for store " + store.name(), ex);
        // A BadLocationStateException cause triggers a metadata repair attempt; any other
        // failure backs off before the next pass
        if (ex.getCause() != null && ex.getCause() instanceof BadLocationStateException) {
          repairMetadata(((BadLocationStateException) ex.getCause()).getEncodedEndRow());
        } else {
          sleepUninterruptibly(Manager.WAIT_BETWEEN_ERRORS, TimeUnit.MILLISECONDS);
        }
      } finally {
        // The store iterator is closed on every exit path out of the try block
        if (iter != null) {
          try {
            iter.close();
          } catch (IOException ex) {
            Manager.log.warn("Error closing TabletLocationState iterator: " + ex, ex);
          }
        }
      }
    }
  }

  /**
   * Queues a tablet that is assigned to a dead server and, the first time that server is seen,
   * records the write-ahead logs reported as in use for it.
   *
   * @param tLists lists being built for the current pass
   * @param tls the tablet's location state
   * @param wals used to look up the logs in use by the dead server
   * @throws WalMarkerException if the log lookup fails
   */
  private void unassignDeadTablet(TabletLists tLists, TabletLocationState tls, WalStateManager wals)
      throws WalMarkerException {
    tLists.assignedToDeadServers.add(tls);
    final TServerInstance deadServer = tls.futureOrCurrentServer();
    if (tLists.logsForDeadServers.containsKey(deadServer)) {
      // logs for this server were already looked up
      return;
    }
    tLists.logsForDeadServers.put(deadServer, wals.getWalsInUse(deadServer));
  }

  /**
   * Decides what to do with an unassigned tablet that should be hosted. A tablet with a pending
   * migration whose destination is still available is assigned straight to that destination;
   * otherwise any stale migration is dropped and the tablet is queued as unassigned.
   *
   * @param tLists lists being built for the current pass
   * @param tablet extent of the tablet
   * @param unassignedTablet details of the unassigned tablet, including its last location
   */
  private void hostUnassignedTablet(TabletLists tLists, KeyExtent tablet,
      UnassignedTablet unassignedTablet) {
    // maybe it's a finishing migration
    final TServerInstance migrationTarget = manager.migrations.get(tablet);
    final boolean migrating = migrationTarget != null;

    if (migrating && tLists.destinations.containsKey(migrationTarget)) {
      // destination is still good, assign it
      tLists.assignments
          .add(new Assignment(tablet, migrationTarget, unassignedTablet.getLastLocation()));
      return;
    }

    if (migrating) {
      // destination is gone or shutting down: get rid of this migration
      manager.migrations.remove(tablet);
    }
    tLists.unassigned.put(tablet, unassignedTablet);
  }

  /**
   * Handles a suspended tablet whose goal is to be hosted. While the table's suspend duration has
   * not elapsed, the tablet is only reassigned if a destination server with the same host and port
   * as the suspended server is available; once the duration has elapsed it is queued as unassigned
   * so it can be given a new assignment.
   *
   * @param tLists lists being built for the current pass
   * @param tls the tablet's location state, including its suspension details
   * @param location the tablet's location, carried into the unassigned entry
   * @param tableConf configuration of the tablet's table, source of the suspend duration
   */
  private void hostSuspendedTablet(TabletLists tLists, TabletLocationState tls, Location location,
      TableConfiguration tableConf) {
    if (manager.getSteadyTime() - tls.suspend.suspensionTime
        < tableConf.getTimeInMillis(Property.TABLE_SUSPEND_DURATION)) {
      // Tablet is suspended. See if its tablet server is back.
      TServerInstance returnInstance = null;
      // Typed iterator: a raw Iterator would hand back Object and not compile below
      Iterator<TServerInstance> find = tLists.destinations
          .tailMap(new TServerInstance(tls.suspend.server, " ")).keySet().iterator();
      if (find.hasNext()) {
        // Only the first destination at or after the probe key is considered
        TServerInstance found = find.next();
        if (found.getHostAndPort().equals(tls.suspend.server)) {
          returnInstance = found;
        }
      }

      // Old tablet server is back. Return this tablet to its previous owner.
      if (returnInstance != null) {
        tLists.assignments.add(new Assignment(tls.extent, returnInstance, tls.last));
      }
      // else - tablet server not back. Don't ask for a new assignment right now.

    } else {
      // Treat as unassigned, ask for a new assignment.
      tLists.unassigned.put(tls.extent, new UnassignedTablet(location, tls.last));
    }
  }

  /**
   * Handles a tablet that should be hosted but is assigned to a dead server: queues it with the
   * other dead-server tablets, drops a migration that points at the tablet's current location,
   * and records the logs in use by the dead server the first time that server is seen.
   *
   * @param tLists lists being built for the current pass
   * @param tls the tablet's location state
   * @param location the tablet's location
   * @param wals used to look up the logs in use by the dead server
   * @throws WalMarkerException if the log lookup fails
   */
  private void hostDeadTablet(TabletLists tLists, TabletLocationState tls, Location location,
      WalStateManager wals) throws WalMarkerException {
    tLists.assignedToDeadServers.add(tls);

    final TServerInstance migrationTarget = manager.migrations.get(tls.extent);
    if (location.getServerInstance().equals(migrationTarget)) {
      manager.migrations.remove(tls.extent);
    }

    final TServerInstance deadServer = tls.futureOrCurrentServer();
    if (tLists.logsForDeadServers.containsKey(deadServer)) {
      // logs for this server were already looked up
      return;
    }
    tLists.logsForDeadServers.put(deadServer, wals.getWalsInUse(deadServer));
  }

  /**
   * Removes the pending migration for a tablet when one exists and the tablet's table is offline.
   *
   * @param extent extent of the tablet to check
   */
  private void cancelOfflineTableMigrations(KeyExtent extent) {
    final TServerInstance migrationTarget = manager.migrations.get(extent);
    final TableState tableState = manager.getTableManager().getTableState(extent.tableId());
    if (migrationTarget == null || tableState != TableState.OFFLINE) {
      return;
    }
    manager.migrations.remove(extent);
  }

  /**
   * Attempts to repair a tablet row that carries conflicting location entries. The current and
   * future location columns of the row are read; if they look inconsistent, the first entry whose
   * server is not found in the live tserver set is deleted. At most one entry is removed per call.
   * Every failure is logged rather than propagated.
   *
   * @param row metadata row of the tablet to repair
   */
  private void repairMetadata(Text row) {
    Manager.log.debug("Attempting repair on {}", row);
    // ACCUMULO-2261 if a dying tserver writes a location before its lock information propagates, it
    // may cause duplicate assignment.
    // Attempt to find the dead server entry and remove it.
    try {
      Map<Key,Value> future = new HashMap<>();
      Map<Key,Value> assigned = new HashMap<>();
      KeyExtent extent = KeyExtent.fromMetaRow(row);
      String table = MetadataTable.NAME;
      if (extent.isMeta()) {
        table = RootTable.NAME;
      }
      // try-with-resources so the scanner is released even if the scan fails part way
      try (Scanner scanner = manager.getContext().createScanner(table, Authorizations.EMPTY)) {
        scanner.fetchColumnFamily(CurrentLocationColumnFamily.NAME);
        scanner.fetchColumnFamily(FutureLocationColumnFamily.NAME);
        scanner.setRange(new Range(row));
        for (Entry<Key,Value> entry : scanner) {
          if (entry.getKey().getColumnFamily().equals(CurrentLocationColumnFamily.NAME)) {
            assigned.put(entry.getKey(), entry.getValue());
          } else if (entry.getKey().getColumnFamily().equals(FutureLocationColumnFamily.NAME)) {
            future.put(entry.getKey(), entry.getValue());
          }
        }
      }
      if (!future.isEmpty() && !assigned.isEmpty()) {
        Manager.log.warn("Found a tablet assigned and hosted, attempting to repair");
      } else if (future.size() > 1 && assigned.isEmpty()) {
        Manager.log.warn("Found a tablet assigned to multiple servers, attempting to repair");
      } else if (future.isEmpty() && assigned.size() > 1) {
        Manager.log.warn("Found a tablet hosted on multiple servers, attempting to repair");
      } else {
        Manager.log.info("Attempted a repair, but nothing seems to be obviously wrong. {} {}",
            assigned, future);
        return;
      }
      // Future entries are examined before current ones
      Iterator<Entry<Key,Value>> iter =
          Iterators.concat(future.entrySet().iterator(), assigned.entrySet().iterator());
      while (iter.hasNext()) {
        Entry<Key,Value> entry = iter.next();
        TServerInstance alive = manager.tserverSet.find(entry.getValue().toString());
        if (alive == null) {
          Manager.log.info("Removing entry  {}", entry);
          // try-with-resources so the writer is closed even if addMutation throws
          try (BatchWriter bw = manager.getContext().createBatchWriter(table)) {
            Mutation m = new Mutation(entry.getKey().getRow());
            m.putDelete(entry.getKey().getColumnFamily(), entry.getKey().getColumnQualifier());
            bw.addMutation(m);
          }
          return;
        }
      }
      Manager.log.error(
          "Metadata table is inconsistent at {} and all assigned/future tservers are still online.",
          row);
    } catch (Exception e) {
      Manager.log.error("Error attempting repair of metadata " + row + ": " + e, e);
    }
  }

  /**
   * Counts the assigned and hosted tablets in this watcher's most recent stats.
   *
   * @return the sum of assigned and hosted tablet counts over all tables
   */
  private int assignedOrHosted() {
    return assignedOrHosted(stats.getLast());
  }

  /**
   * Sums the assigned and hosted tablet counts over every table in the given stats.
   *
   * @param last per-table counts; must not be {@code null}
   * @return the total number of assigned plus hosted tablets
   */
  private int assignedOrHosted(Map<TableId,TableCounts> last) {
    int result = 0;
    for (TableCounts counts : last.values()) {
      result += counts.assigned() + counts.hosted();
    }
    return result;
  }

  /**
   * Asks the hosting tablet server to split a tablet at the end points of a delete range. Nothing
   * is sent unless the operation is in the SPLITTING state, is a delete, the tablet is hosted, and
   * the tablet overlaps the range. A split is requested for each end point that the tablet contains
   * and that is not already one of the tablet's own boundaries.
   *
   * @param info the merge/delete operation for the tablet's table
   * @param state the tablet's current state
   * @param tls the tablet's location state
   */
  private void sendSplitRequest(MergeInfo info, TabletState state, TabletLocationState tls) {
    // Only a delete that is still splitting, on an online tablet, needs a split request
    if (!info.getState().equals(MergeState.SPLITTING) || !info.isDelete()
        || !state.equals(TabletState.HOSTED)) {
      return;
    }
    // Does this extent cover the end points of the delete?
    final KeyExtent range = info.getExtent();
    if (!tls.extent.overlaps(range)) {
      return;
    }
    final Text[] endPoints = {range.prevEndRow(), range.endRow()};
    for (Text splitPoint : endPoints) {
      final boolean needsSplit = splitPoint != null && tls.extent.contains(splitPoint)
          && !splitPoint.equals(tls.extent.endRow())
          && !splitPoint.equals(tls.extent.prevEndRow());
      if (!needsSplit) {
        continue;
      }
      try {
        final TServerConnection conn = manager.tserverSet.getConnection(tls.getCurrentServer());
        if (conn == null) {
          Manager.log.warn("Not connected to server {}", tls.current);
        } else {
          Manager.log.info("Asking {} to split {} at {}", tls.current, tls.extent, splitPoint);
          conn.splitTablet(tls.extent, splitPoint);
        }
      } catch (NotServingTabletException e) {
        Manager.log.debug("Error asking tablet server to split a tablet: ", e);
      } catch (Exception e) {
        Manager.log.warn("Error asking tablet server to split a tablet: ", e);
      }
    }
  }

  /**
   * Asks the hosting tablet server to chop a tablet that takes part in a merge. Nothing is sent
   * unless the merge is waiting for chopped tablets, the tablet is hosted, it is not already
   * chopped, and the merge reports that the tablet needs to be chopped.
   *
   * @param info the merge operation for the tablet's table
   * @param state the tablet's current state
   * @param tls the tablet's location state
   */
  private void sendChopRequest(MergeInfo info, TabletState state, TabletLocationState tls) {
    // Don't bother if we're in the wrong state
    if (!info.getState().equals(MergeState.WAITING_FOR_CHOPPED)) {
      return;
    }
    // Tablet must be online
    if (!state.equals(TabletState.HOSTED)) {
      return;
    }
    // Tablet isn't already chopped
    if (tls.chopped) {
      return;
    }
    // Tablet ranges intersect
    if (info.needsToBeChopped(tls.extent)) {
      TServerConnection conn;
      try {
        conn = manager.tserverSet.getConnection(tls.getCurrentServer());
        if (conn != null) {
          Manager.log.info("Asking {} to chop {}", tls.current, tls.extent);
          conn.chop(manager.managerLock, tls.extent);
        } else {
          Manager.log.warn("Could not connect to server {}", tls.current);
        }
      } catch (TException e) {
        // Log the exception itself so the cause of the failure is not lost
        Manager.log.warn("Communications error asking tablet server to chop a tablet", e);
      }
    }
  }

  /**
   * Advances the merge state machine for every table seen during the last pass. For each entry the
   * next merge state is computed and persisted if it changed; when the next state is MERGING the
   * merge (or delete) is carried out on the metadata and the state is then set to COMPLETE.
   * Failures are logged per merge and do not stop the remaining merges from being processed.
   *
   * @param mergeStatsCache merge statistics gathered during the pass, keyed by table
   */
  private void updateMergeState(Map<TableId,MergeStats> mergeStatsCache) {
    // Loop variable deliberately not named "stats" to avoid shadowing the field of that name
    for (MergeStats mergeStats : mergeStatsCache.values()) {
      try {
        MergeState update = mergeStats.nextMergeState(manager.getContext(), manager);
        // when next state is MERGING, its important to persist this before
        // starting the merge... the verification check that is done before
        // moving into the merging state could fail if merge starts but does
        // not finish
        if (update == MergeState.COMPLETE) {
          update = MergeState.NONE;
        }
        if (update != mergeStats.getMergeInfo().getState()) {
          manager.setMergeState(mergeStats.getMergeInfo(), update);
        }

        if (update == MergeState.MERGING) {
          try {
            if (mergeStats.getMergeInfo().isDelete()) {
              deleteTablets(mergeStats.getMergeInfo());
            } else {
              mergeMetadataRecords(mergeStats.getMergeInfo());
            }
            update = MergeState.COMPLETE;
            manager.setMergeState(mergeStats.getMergeInfo(), update);
          } catch (Exception ex) {
            Manager.log.error("Unable merge metadata table records", ex);
          }
        }
      } catch (Exception ex) {
        Manager.log.error(
            "Unable to update merge state for merge " + mergeStats.getMergeInfo().getExtent(), ex);
      }
    }
  }

  /**
   * Carries out the metadata side of a delete-range operation. The files and directories of every
   * tablet in the range are queued as garbage collection candidates, the tablets' metadata is
   * deleted, and then either the tablet following the range has its previous end row moved back to
   * the start of the range, or (when the range reaches the end of the table) a new default tablet
   * is created.
   *
   * @param info the delete operation; its extent is the range being deleted
   * @throws AccumuloException if the metadata cannot be read or updated, or a tablet in the range
   *         still has a current location
   */
  private void deleteTablets(MergeInfo info) throws AccumuloException {
    KeyExtent extent = info.getExtent();
    String targetSystemTable = extent.isMeta() ? RootTable.NAME : MetadataTable.NAME;
    Manager.log.debug("Deleting tablets for {}", extent);
    MetadataTime metadataTime = null;
    KeyExtent followingTablet = null;
    if (extent.endRow() != null) {
      Key nextExtent = new Key(extent.endRow()).followingKey(PartialKey.ROW);
      followingTablet =
          getHighTablet(new KeyExtent(extent.tableId(), nextExtent.getRow(), extent.endRow()));
      Manager.log.debug("Found following tablet {}", followingTablet);
    }
    try {
      AccumuloClient client = manager.getContext();
      ServerContext context = manager.getContext();
      Ample ample = context.getAmple();
      Text start = extent.prevEndRow();
      if (start == null) {
        start = new Text();
      }
      Manager.log.debug("Making file deletion entries for {}", extent);
      Range deleteRange = new Range(TabletsSection.encodeRow(extent.tableId(), start), false,
          TabletsSection.encodeRow(extent.tableId(), extent.endRow()), true);
      // NOTE(review): element type assumed from the ReferenceFile instances added below - confirm
      Set<ReferenceFile> datafilesAndDirs = new TreeSet<>();
      // try-with-resources so the scanner is released even if the scan throws
      try (Scanner scanner = client.createScanner(targetSystemTable, Authorizations.EMPTY)) {
        scanner.setRange(deleteRange);
        ServerColumnFamily.DIRECTORY_COLUMN.fetch(scanner);
        ServerColumnFamily.TIME_COLUMN.fetch(scanner);
        scanner.fetchColumnFamily(DataFileColumnFamily.NAME);
        scanner.fetchColumnFamily(CurrentLocationColumnFamily.NAME);
        for (Entry<Key,Value> entry : scanner) {
          Key key = entry.getKey();
          if (key.compareColumnFamily(DataFileColumnFamily.NAME) == 0) {
            var stf = new StoredTabletFile(key.getColumnQualifierData().toString());
            datafilesAndDirs.add(new ReferenceFile(stf.getTableId(), stf.getMetaUpdateDelete()));
            // write candidates out in batches so the set does not grow without bound
            if (datafilesAndDirs.size() > 1000) {
              ample.putGcFileAndDirCandidates(extent.tableId(), datafilesAndDirs);
              datafilesAndDirs.clear();
            }
          } else if (ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
            metadataTime = MetadataTime.parse(entry.getValue().toString());
          } else if (key.compareColumnFamily(CurrentLocationColumnFamily.NAME) == 0) {
            throw new IllegalStateException(
                "Tablet " + key.getRow() + " is assigned during a merge!");
          } else if (ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
            var allVolumesDirectory =
                new AllVolumesDirectory(extent.tableId(), entry.getValue().toString());
            datafilesAndDirs.add(allVolumesDirectory);
            if (datafilesAndDirs.size() > 1000) {
              ample.putGcFileAndDirCandidates(extent.tableId(), datafilesAndDirs);
              datafilesAndDirs.clear();
            }
          }
        }
      }
      ample.putGcFileAndDirCandidates(extent.tableId(), datafilesAndDirs);
      try (BatchWriter bw = client.createBatchWriter(targetSystemTable)) {
        deleteTablets(info, deleteRange, bw, client);
      }

      if (followingTablet != null) {
        Manager.log.debug("Updating prevRow of {} to {}", followingTablet, extent.prevEndRow());
        try (BatchWriter bw = client.createBatchWriter(targetSystemTable)) {
          Mutation m = new Mutation(followingTablet.toMetaRow());
          TabletColumnFamily.PREV_ROW_COLUMN.put(m,
              TabletColumnFamily.encodePrevEndRow(extent.prevEndRow()));
          ChoppedColumnFamily.CHOPPED_COLUMN.putDelete(m);
          bw.addMutation(m);
          bw.flush();
        }
      } else {
        // The time type comes from the scanned tablets; fail clearly if none was found
        // instead of letting a bare NullPointerException surface
        if (metadataTime == null) {
          throw new IllegalStateException("No time column found for tablets in " + extent
              + ", unable to recreate the default tablet");
        }
        // Recreate the default tablet to hold the end of the table
        MetadataTableUtil.addTablet(new KeyExtent(extent.tableId(), null, extent.prevEndRow()),
            ServerColumnFamily.DEFAULT_TABLET_DIR_NAME, manager.getContext(),
            metadataTime.getType(), manager.managerLock);
      }
    } catch (RuntimeException | TableNotFoundException ex) {
      throw new AccumuloException(ex);
    }
  }

  /**
   * Folds the metadata of every tablet in the merge range into the highest tablet of that range.
   *
   * <p>
   * The rows between the range's prevEndRow and the highest tablet's row (both exclusive) are
   * scanned: their data file entries are copied onto the highest tablet's row, the first prevRow
   * value seen is remembered, the maximum time value is tracked, and for each directory entry the
   * mutation returned by {@code Ample.createDeleteMutation} is queued. The highest tablet's own
   * row is then read for its time and external compaction entries. Finally the highest tablet's
   * prevRow is rewritten, the scanned rows are deleted, and the chopped marker is removed from the
   * highest tablet's row.
   *
   * @param info describes the merge; only its extent is read directly here
   * @throws AccumuloException if the highest tablet cannot be found, or wrapping any exception
   *         raised while reading or writing the system table
   */
  private void mergeMetadataRecords(MergeInfo info) throws AccumuloException {
    KeyExtent range = info.getExtent();
    Manager.log.debug("Merging metadata for {}", range);
    KeyExtent stop = getHighTablet(range);
    Manager.log.debug("Highest tablet is {}", stop);
    Value firstPrevRowValue = null;
    Text stopRow = stop.toMetaRow();
    Text start = range.prevEndRow();
    if (start == null) {
      start = new Text();
    }
    // both ends are exclusive: the highest tablet's row is handled separately below
    Range scanRange =
        new Range(TabletsSection.encodeRow(range.tableId(), start), false, stopRow, false);
    // tablets of the metadata table are tracked in the root table
    String targetSystemTable = range.isMeta() ? RootTable.NAME : MetadataTable.NAME;

    AccumuloClient client = manager.getContext();

    try (BatchWriter bw = client.createBatchWriter(targetSystemTable)) {
      long fileCount = 0;
      // collects everything that has to be written to the highest tablet's row
      Mutation m = new Mutation(stopRow);
      MetadataTime maxLogicalTime = null;

      // Make file entries in highest tablet
      try (Scanner scanner = client.createScanner(targetSystemTable, Authorizations.EMPTY)) {
        scanner.setRange(scanRange);
        TabletColumnFamily.PREV_ROW_COLUMN.fetch(scanner);
        ServerColumnFamily.TIME_COLUMN.fetch(scanner);
        ServerColumnFamily.DIRECTORY_COLUMN.fetch(scanner);
        scanner.fetchColumnFamily(DataFileColumnFamily.NAME);
        for (Entry<Key,Value> entry : scanner) {
          Key key = entry.getKey();
          Value value = entry.getValue();
          if (key.getColumnFamily().equals(DataFileColumnFamily.NAME)) {
            m.put(key.getColumnFamily(), key.getColumnQualifier(), value);
            fileCount++;
          } else if (TabletColumnFamily.PREV_ROW_COLUMN.hasColumns(key)
              && firstPrevRowValue == null) {
            // only the first prevRow encountered in the scan is kept
            Manager.log.debug("prevRow entry for lowest tablet is {}", value);
            firstPrevRowValue = new Value(value);
          } else if (ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
            maxLogicalTime =
                TabletTime.maxMetadataTime(maxLogicalTime, MetadataTime.parse(value.toString()));
          } else if (ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
            var allVolumesDir = new AllVolumesDirectory(range.tableId(), value.toString());
            bw.addMutation(manager.getContext().getAmple().createDeleteMutation(allVolumesDir));
          }
        }
      }

      // read the logical time from the last tablet in the merge range, it is not included in
      // the loop above
      Set<String> extCompIds = new HashSet<>();
      try (Scanner scanner = client.createScanner(targetSystemTable, Authorizations.EMPTY)) {
        scanner.setRange(new Range(stopRow));
        ServerColumnFamily.TIME_COLUMN.fetch(scanner);
        scanner.fetchColumnFamily(ExternalCompactionColumnFamily.NAME);
        for (Entry<Key,Value> entry : scanner) {
          if (ServerColumnFamily.TIME_COLUMN.hasColumns(entry.getKey())) {
            maxLogicalTime = TabletTime.maxMetadataTime(maxLogicalTime,
                MetadataTime.parse(entry.getValue().toString()));
          } else if (ExternalCompactionColumnFamily.NAME
              .equals(entry.getKey().getColumnFamily())) {
            extCompIds.add(entry.getKey().getColumnQualifierData().toString());
          }
        }
      }

      if (maxLogicalTime != null) {
        ServerColumnFamily.TIME_COLUMN.put(m, new Value(maxLogicalTime.encode()));
      }

      // delete any entries for external compactions
      extCompIds.forEach(ecid -> m.putDelete(ExternalCompactionColumnFamily.STR_NAME, ecid));

      if (!m.getUpdates().isEmpty()) {
        bw.addMutation(m);
      }

      bw.flush();

      Manager.log.debug("Moved {} files to {}", fileCount, stop);

      // no prevRow was found in the scanned range, so there is nothing left to fold in
      if (firstPrevRowValue == null) {
        Manager.log.debug("tablet already merged");
        return;
      }

      // the highest tablet now starts where the first scanned tablet started
      stop = new KeyExtent(stop.tableId(), stop.endRow(),
          TabletColumnFamily.decodePrevEndRow(firstPrevRowValue));
      Mutation updatePrevRow = TabletColumnFamily.createPrevRowMutation(stop);
      Manager.log.debug("Setting the prevRow for last tablet: {}", stop);
      bw.addMutation(updatePrevRow);
      bw.flush();

      deleteTablets(info, scanRange, bw, client);

      // Clean-up the last chopped marker
      var m2 = new Mutation(stopRow);
      ChoppedColumnFamily.CHOPPED_COLUMN.putDelete(m2);
      bw.addMutation(m2);
      bw.flush();

    } catch (Exception ex) {
      throw new AccumuloException(ex);
    }
  }

  /**
   * Deletes every entry found in {@code scanRange} of the system table that holds the merge
   * extent (the root table for a metadata extent, otherwise the metadata table).
   *
   * <p>
   * All deletes for one row are grouped into a single mutation so that a tablet either disappears
   * entirely or not at all, which matters if the process terminates part way through the loop.
   *
   * @param info the merge being processed; used only to select the system table
   * @param scanRange the range of rows whose entries are deleted
   * @param bw the writer the delete mutations are added to; it is flushed but not closed here
   * @param client the client used to create the scanner
   * @throws TableNotFoundException if the scanner cannot be created for the system table
   * @throws MutationsRejectedException if adding or flushing the delete mutations fails
   */
  private void deleteTablets(MergeInfo info, Range scanRange, BatchWriter bw, AccumuloClient client)
      throws TableNotFoundException, MutationsRejectedException {
    String systemTable = info.getExtent().isMeta() ? RootTable.NAME : MetadataTable.NAME;
    try (Scanner scanner = client.createScanner(systemTable, Authorizations.EMPTY)) {
      Manager.log.debug("Deleting range {}", scanRange);
      scanner.setRange(scanRange);
      RowIterator rowIter = new RowIterator(scanner);
      while (rowIter.hasNext()) {
        Iterator<Entry<Key,Value>> row = rowIter.next();
        // one mutation per row, created from the first entry of that row
        Mutation m = null;
        while (row.hasNext()) {
          Key key = row.next().getKey();

          if (m == null) {
            m = new Mutation(key.getRow());
          }

          m.putDelete(key.getColumnFamily(), key.getColumnQualifier());
          Manager.log.debug("deleting entry {}", key);
        }
        bw.addMutation(m);
      }
    }

    bw.flush();
  }

  /**
   * Finds the tablet that contains the end row of {@code range} by reading the first prevRow
   * entry at or after that end row's metadata row.
   *
   * @param range the merge range; the root table is scanned when it is a metadata extent,
   *        otherwise the metadata table
   * @return the extent built from the first prevRow entry found
   * @throws AccumuloException if no entry is found, if the entry belongs to a different table, or
   *         wrapping any other exception raised during the lookup. The two "No last tablet"
   *         exceptions are themselves caught and wrapped by the outer handler.
   */
  private KeyExtent getHighTablet(KeyExtent range) throws AccumuloException {
    String systemTable = range.isMeta() ? RootTable.NAME : MetadataTable.NAME;
    try (Scanner scanner =
        manager.getContext().createScanner(systemTable, Authorizations.EMPTY)) {
      TabletColumnFamily.PREV_ROW_COLUMN.fetch(scanner);
      KeyExtent start = new KeyExtent(range.tableId(), range.endRow(), null);
      // open-ended scan starting at the metadata row for the range's end row
      scanner.setRange(new Range(start.toMetaRow(), null));
      Iterator<Entry<Key,Value>> iterator = scanner.iterator();
      if (!iterator.hasNext()) {
        throw new AccumuloException("No last tablet for a merge " + range);
      }
      Entry<Key,Value> entry = iterator.next();
      KeyExtent highTablet = KeyExtent.fromMetaPrevRow(entry);
      // the open-ended scan may have run into the next table's tablets
      if (!highTablet.tableId().equals(range.tableId())) {
        throw new AccumuloException("No last tablet for merge " + range + " " + highTablet);
      }
      return highTablet;
    } catch (Exception ex) {
      throw new AccumuloException("Unexpected failure finding the last tablet for a merge " + range,
          ex);
    }
  }

  /**
   * Processes the tablets collected in {@code tLists} whose servers are no longer available.
   *
   * <p>
   * Tablets assigned to dead servers are suspended when {@code canSuspendTablets()} allows it and
   * unassigned otherwise; the logs of those servers are then marked closed and an event is
   * raised. Tablets listed in {@code suspendedToGoneServers} are unsuspended.
   *
   * @param tLists the tablet lists gathered for this pass
   * @param wals used to mark the dead servers' logs as closed
   * @throws WalMarkerException if marking a log as closed fails
   * @throws DistributedStoreException if updating the tablet state store fails
   */
  private void handleDeadTablets(TabletLists tLists, WalStateManager wals)
      throws WalMarkerException, DistributedStoreException {
    var deadTablets = tLists.assignedToDeadServers;
    var deadLogs = tLists.logsForDeadServers;

    if (!deadTablets.isEmpty()) {
      // cap the number of tablets written to the log
      int maxServersToShow = min(deadTablets.size(), 100);
      Manager.log.debug("{} assigned to dead servers: {}...", deadTablets.size(),
          deadTablets.subList(0, maxServersToShow));
      Manager.log.debug("logs for dead servers: {}", deadLogs);
      if (canSuspendTablets()) {
        store.suspend(deadTablets, deadLogs, manager.getSteadyTime());
      } else {
        store.unassign(deadTablets, deadLogs);
      }
      markDeadServerLogsAsClosed(wals, deadLogs);
      manager.nextEvent.event(
          "Marked %d tablets as suspended because they don't have current servers",
          deadTablets.size());
    }

    var suspendedToGone = tLists.suspendedToGoneServers;
    if (!suspendedToGone.isEmpty()) {
      // report the suspended tablets themselves, not the dead-server list handled above
      int maxServersToShow = min(suspendedToGone.size(), 100);
      Manager.log.debug("{} suspended to gone servers: {}...", suspendedToGone.size(),
          suspendedToGone.subList(0, maxServersToShow));
      store.unsuspend(suspendedToGone);
    }
  }

  /**
   * Asks the manager for assignments of the unassigned tablets and records the acceptable ones in
   * {@code tLists.assignments}.
   *
   * <p>
   * Nothing is done when there are no destination servers. A returned assignment is skipped with
   * a warning when its tablet was not in {@code unassigned} or when its server is not one of the
   * destinations; an assignment with a {@code null} server is skipped silently.
   *
   * @param tLists the tablet lists gathered for this pass; its {@code destinations} are offered
   *        as candidate servers and its {@code assignments} list receives the results
   * @param unassigned the tablets that need a location, keyed by extent
   */
  private void getAssignmentsFromBalancer(TabletLists tLists,
      Map<KeyExtent,UnassignedTablet> unassigned) {
    if (tLists.destinations.isEmpty()) {
      return;
    }

    Map<KeyExtent,TServerInstance> assignedOut = new HashMap<>();
    manager.getAssignments(tLists.destinations, unassigned, assignedOut);
    for (Entry<KeyExtent,TServerInstance> assignment : assignedOut.entrySet()) {
      KeyExtent extent = assignment.getKey();
      TServerInstance server = assignment.getValue();

      if (!unassigned.containsKey(extent)) {
        Manager.log.warn(
            "{} load balancer assigning tablet that was not nominated for assignment {}",
            store.name(), extent);
        continue;
      }
      if (server == null) {
        continue;
      }
      if (!tLists.destinations.containsKey(server)) {
        Manager.log.warn(
            "balancer assigned {} to a tablet server that is not current {} ignoring", extent,
            server);
        continue;
      }

      // the map may hold a null value for the extent, so the lookup is still null-checked
      final UnassignedTablet unassignedTablet = unassigned.get(extent);
      tLists.assignments.add(new Assignment(extent, server,
          unassignedTablet != null ? unassignedTablet.getLastLocation() : null));
    }

    if (!unassigned.isEmpty() && assignedOut.isEmpty()) {
      Manager.log.warn("Load balancer failed to assign any tablets");
    }
  }

  /**
   * Applies the changes collected in {@code tLists}: handles tablets on dead or gone servers,
   * obtains new assignments, stores them as future locations, and then asks each target server
   * to load its tablets.
   *
   * @param tLists the tablet lists gathered for this pass
   * @param wals passed through to the dead-tablet handling
   * @throws DistributedStoreException if updating the tablet state store fails
   * @throws TException if the request to a tablet server fails
   * @throws WalMarkerException if marking a dead server's log as closed fails
   */
  private void flushChanges(TabletLists tLists, WalStateManager wals)
      throws DistributedStoreException, TException, WalMarkerException {
    // read-only view of the unassigned tablets for the assignment step
    final var readOnlyUnassigned = Collections.unmodifiableMap(tLists.unassigned);

    handleDeadTablets(tLists, wals);
    getAssignmentsFromBalancer(tLists, readOnlyUnassigned);

    final var pending = tLists.assignments;
    if (!pending.isEmpty()) {
      Manager.log.info(String.format("Assigning %d tablets", pending.size()));
      store.setFutureLocations(pending);
    }

    // tablets in tLists.assigned are sent to their servers too, but not stored again above
    pending.addAll(tLists.assigned);
    for (Assignment assignment : pending) {
      final TServerConnection connection = manager.tserverSet.getConnection(assignment.server);
      if (connection == null) {
        Manager.log.warn("Could not connect to server {}", assignment.server);
      } else {
        connection.assignTablet(manager.managerLock, assignment.tablet);
      }
      // the manager is notified whether or not a connection was available
      manager.assignedTablet(assignment.tablet);
    }
  }

  private static void markDeadServerLogsAsClosed(WalStateManager mgr,
      Map> logsForDeadServers) throws WalMarkerException {
    for (Entry> server : logsForDeadServers.entrySet()) {
      for (Path path : server.getValue()) {
        mgr.closeWal(server.getKey(), path);
      }
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy