/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.jdbc;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.framework.recipes.cache.NodeCache;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.curator.utils.ZKPaths;
import org.apache.hadoop.hbase.util.PairOfSameType;
import org.apache.phoenix.exception.SQLExceptionCode;
import org.apache.phoenix.exception.SQLExceptionInfo;
import org.apache.phoenix.jdbc.ClusterRoleRecord.ClusterRole;
import org.apache.phoenix.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.phoenix.thirdparty.com.google.common.base.Preconditions;
import org.apache.phoenix.thirdparty.com.google.common.base.Strings;
import org.apache.phoenix.thirdparty.com.google.common.cache.Cache;
import org.apache.phoenix.thirdparty.com.google.common.cache.CacheBuilder;
import org.apache.phoenix.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.phoenix.util.JDBCUtil;
import org.apache.phoenix.util.PhoenixRuntime;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.umd.cs.findbugs.annotations.NonNull;

import java.io.IOException;
import java.sql.Connection;
import java.sql.Driver;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;

import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_CLIENT_CONNECTION_CACHE_MAX_DURATION;

/**
 * A high availability (HA) group is an association between a pair of HBase clusters, a group of
 * clients, and an HA policy.
 * <p>
 * This class is thread safe. Multiple threads may access an instance of this class, including
 * multiple clients that call init in order to create a connection, and two cluster role managers
 * that watch node changes in ZooKeeper.
 * <p>
 * The lifecycle of an HA group is confined in the global cache, meaning clients can get an
 * instance from the cache but cannot construct or close an HA group instance. The reason is that
 * an HA group is a resource shared by many clients. Closing it intentionally or accidentally by a
 * client will impact other connections in this group with unexpected behavior.
 */
@SuppressWarnings("UnstableApiUsage")
public class HighAvailabilityGroup {
    public static final String PHOENIX_HA_ATTR_PREFIX = "phoenix.ha.";
    public static final String PHOENIX_HA_GROUP_ATTR = PHOENIX_HA_ATTR_PREFIX + "group.name";
    /**
     * Should we fall back to a single cluster when the cluster role record is missing?
     */
    public static final String PHOENIX_HA_SHOULD_FALLBACK_WHEN_MISSING_CRR_KEY =
            PHOENIX_HA_ATTR_PREFIX + "fallback.enabled";
    public static final String PHOENIX_HA_SHOULD_FALLBACK_WHEN_MISSING_CRR_DEFAULT =
            String.valueOf(Boolean.TRUE);
    /**
     * The single-cluster connection URL to use when it needs to fall back.
     */
    public static final String PHOENIX_HA_FALLBACK_CLUSTER_KEY =
            PHOENIX_HA_ATTR_PREFIX + "fallback.cluster";
    public static final String PHOENIX_HA_ZOOKEEPER_ZNODE_NAMESPACE =
            "phoenix" + ZKPaths.PATH_SEPARATOR + "ha";
    public static final String PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_KEY =
            PHOENIX_HA_ATTR_PREFIX + "zk.connection.timeout.ms";
    public static final int PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_DEFAULT = 4_000;
    public static final String PHOENIX_HA_ZK_SESSION_TIMEOUT_MS_KEY =
            PHOENIX_HA_ATTR_PREFIX + "zk.session.timeout.ms";
    public static final int PHOENIX_HA_ZK_SESSION_TIMEOUT_MS_DEFAULT = 4_000;
    public static final String PHOENIX_HA_ZK_RETRY_BASE_SLEEP_MS_KEY =
            PHOENIX_HA_ATTR_PREFIX + "zk.retry.base.sleep.ms";
    public static final int PHOENIX_HA_ZK_RETRY_BASE_SLEEP_MS_DEFAULT = 1000;
    public static final String PHOENIX_HA_ZK_RETRY_MAX_KEY =
            PHOENIX_HA_ATTR_PREFIX + "zk.retry.max";
    public static final int PHOENIX_HA_ZK_RETRY_MAX_DEFAULT = 5;
    public static final String PHOENIX_HA_ZK_RETRY_MAX_SLEEP_MS_KEY =
            PHOENIX_HA_ATTR_PREFIX + "zk.retry.max.sleep.ms";
    public static final int PHOENIX_HA_ZK_RETRY_MAX_SLEEP_MS_DEFAULT = 10_000;
    public static final RetryPolicy RETRY_POLICY = new ExponentialBackoffRetry(
            PHOENIX_HA_ZK_RETRY_BASE_SLEEP_MS_DEFAULT,
            PHOENIX_HA_ZK_RETRY_MAX_DEFAULT,
            PHOENIX_HA_ZK_RETRY_MAX_SLEEP_MS_DEFAULT);
    public static final String PHOENIX_HA_TRANSITION_TIMEOUT_MS_KEY =
            PHOENIX_HA_ATTR_PREFIX + "transition.timeout.ms";
    public static final long PHOENIX_HA_TRANSITION_TIMEOUT_MS_DEFAULT = 5 * 60 * 1000; // 5 mins

    static final Logger LOG = LoggerFactory.getLogger(HighAvailabilityGroup.class);
    @VisibleForTesting
    static final Map<HAGroupInfo, HighAvailabilityGroup> GROUPS = new ConcurrentHashMap<>();
    @VisibleForTesting
    static final Cache<HAGroupInfo, Boolean> MISSING_CRR_GROUPS_CACHE = CacheBuilder.newBuilder()
            .expireAfterWrite(PHOENIX_HA_TRANSITION_TIMEOUT_MS_DEFAULT, TimeUnit.MILLISECONDS)
            .build();
    /**
     * The Curator client cache, one client instance per cluster.
     */
    @VisibleForTesting
    static final Cache<String, CuratorFramework> CURATOR_CACHE = CacheBuilder.newBuilder()
            .expireAfterAccess(DEFAULT_CLIENT_CONNECTION_CACHE_MAX_DURATION, TimeUnit.MILLISECONDS)
            .removalListener((notification) ->
                    ((CuratorFramework) Objects.requireNonNull(notification.getValue())).close())
            .build();

    /**
     * High availability group info.
     */
    private final HAGroupInfo info;
    /**
     * Client properties used to initialize this HA group.
     */
    private final Properties properties;
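    // Illustrative client-side configuration for the constants above; a minimal sketch with
    // hypothetical values, not part of the original source:
    //
    //   Properties props = new Properties();
    //   props.setProperty(PHOENIX_HA_GROUP_ATTR, "group1");                         // required
    //   props.setProperty(PHOENIX_HA_SHOULD_FALLBACK_WHEN_MISSING_CRR_KEY, "true"); // the default
    //   props.setProperty(PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_KEY, "10000");        // 10 seconds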
    /**
     * Executor service for the two role managers.
     */
    private final ExecutorService roleManagerExecutor = Executors.newFixedThreadPool(2,
            new ThreadFactoryBuilder().setDaemon(true).setNameFormat("phoenixHAGroup-%d").build());
    /**
     * The count down latch to make sure at least one role manager has pulled data from ZK.
     */
    private final CountDownLatch roleManagerLatch = new CountDownLatch(1);
    /**
     * Pair of role managers for watching cluster role records from the two ZK clusters.
     */
    private final AtomicReference<PairOfSameType<HAClusterRoleManager>> roleManagers =
            new AtomicReference<>();
    /**
     * Executor for applying the cluster role to this HA group.
     */
    private final ExecutorService nodeChangedExecutor = Executors.newFixedThreadPool(1);
    /**
     * Current cluster role record for this HA group.
     */
    private volatile ClusterRoleRecord roleRecord;
    /**
     * State of this HA group.
     */
    private volatile State state = State.UNINITIALIZED;

    /**
     * Private constructor.
     * <p>
     * To get an instance, please call {@link HighAvailabilityGroup#get(String, Properties)}.
     */
    private HighAvailabilityGroup(HAGroupInfo info, Properties properties) {
        this.info = info;
        this.properties = properties;
    }

    /**
     * This is for test usage only. In production, the record should be retrieved from ZooKeeper.
     */
    @VisibleForTesting
    HighAvailabilityGroup(HAGroupInfo info, Properties properties, ClusterRoleRecord record,
                          State state) {
        this.info = info;
        this.properties = properties;
        this.roleRecord = record;
        this.state = state;
    }

    public static HAGroupInfo getHAGroupInfo(String url, Properties properties)
            throws SQLException {
        if (url.startsWith(PhoenixRuntime.JDBC_PROTOCOL)) {
            url = url.substring(PhoenixRuntime.JDBC_PROTOCOL.length() + 1);
        }
        if (!(url.contains("[") && url.contains("|") && url.contains("]"))) {
            throw new SQLExceptionInfo.Builder(SQLExceptionCode.MALFORMED_CONNECTION_URL)
                    .setMessage(String.format("URL %s is not a valid HA connection string", url))
                    .build()
                    .buildException();
        }
        String additionalJDBCParams = null;
        int idx = url.indexOf("]");
        int extraIdx = url.indexOf(PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR, idx + 1);
        if (extraIdx != -1) {
            // skip the JDBC_PROTOCOL_SEPARATOR
            additionalJDBCParams = url.substring(extraIdx + 1);
        }
        url = url.substring(url.indexOf("[") + 1, url.indexOf("]"));
        String[] urls = url.split("\\|");
        String name = properties.getProperty(PHOENIX_HA_GROUP_ATTR);
        if (StringUtils.isEmpty(name)) {
            throw new SQLExceptionInfo.Builder(SQLExceptionCode.HA_INVALID_PROPERTIES)
                    .setMessage(String.format("HA group name can not be empty for HA URL %s", url))
                    .build()
                    .buildException();
        }
        return new HAGroupInfo(name, urls[0], urls[1], additionalJDBCParams);
    }
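    // A minimal sketch of what getHAGroupInfo() extracts from a hypothetical HA URL, assuming
    // the properties carry PHOENIX_HA_GROUP_ATTR = "group1":
    //
    //   jdbc:phoenix:[zk1:2181:/hbase|zk2:2181:/hbase]:extra/params
    //
    //   -> name                 = "group1"
    //   -> url1, url2           = "zk1:2181:/hbase", "zk2:2181:/hbase" (split on '|')
    //   -> additionalJDBCParams = "extra/params" (text after the ':' following ']')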

    /**
     * Get an instance of HA group given the HA connecting URL (with "|") and client properties.
     * <p>
     * The HA group does not have a public constructor. This method is the only public one for
     * getting an HA group instance. The reason is that an HA group is considered expensive to
     * create and maintain; caching it makes it reusable for all connection requests to this group.
     * <p>
     * It will return the cached instance, if any, for the target HA group. The HA group creation
     * and initialization are blocking operations. Upon initialization failure, the HA group
     * information may be saved in a negative cache iff the cause is missing cluster role records.
     * In presence of an empty (not null or exception) return value, the client may choose to fall
     * back to a single cluster connection to compensate for the missing cluster role records.
     *
     * @return Optional of target HA group (initialized), or empty if missing cluster role records
     * @throws SQLException fails to get or initialize an HA group
     */
    public static Optional<HighAvailabilityGroup> get(String url, Properties properties)
            throws SQLException {
        HAGroupInfo info = getHAGroupInfo(url, properties);
        if (MISSING_CRR_GROUPS_CACHE.getIfPresent(info) != null) {
            return Optional.empty();
        }
        HighAvailabilityGroup haGroup = GROUPS.computeIfAbsent(info,
                haGroupInfo -> new HighAvailabilityGroup(haGroupInfo, properties));
        try {
            haGroup.init();
        } catch (Exception e) {
            GROUPS.remove(info);
            haGroup.close();
            try {
                CuratorFramework curator1 = CURATOR_CACHE.getIfPresent(info.getUrl1());
                CuratorFramework curator2 = CURATOR_CACHE.getIfPresent(info.getUrl2());
                if (curator1 != null && curator2 != null) {
                    Stat node1 = curator1.checkExists().forPath(info.getZkPath());
                    Stat node2 = curator2.checkExists().forPath(info.getZkPath());
                    if (node1 == null && node2 == null) {
                        // The HA group fails to initialize due to missing cluster role records on
                        // both ZK clusters. We will put this HA group into the negative cache.
                        MISSING_CRR_GROUPS_CACHE.put(info, true);
                        return Optional.empty();
                    }
                }
            } catch (Exception e2) {
                LOG.error("HA group {} failed to initialize. Got exception when checking if znode"
                        + " exists on the two ZK clusters.", info, e2);
            }
            throw new SQLExceptionInfo.Builder(SQLExceptionCode.CANNOT_ESTABLISH_CONNECTION)
                    .setMessage(String.format("Cannot start HA group %s for URL %s", haGroup, url))
                    .setRootCause(e)
                    .build()
                    .buildException();
        }
        return Optional.of(haGroup);
    }
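    // Illustrative call sequence (a sketch; url and props as in the example above):
    //
    //   Optional<HighAvailabilityGroup> haGroup = HighAvailabilityGroup.get(url, props);
    //   if (haGroup.isPresent()) {
    //       Connection conn = haGroup.get().connect(props);  // served per the HA policy
    //   } else {
    //       // Cluster role records are missing from both ZK clusters; the caller may fall
    //       // back to a single cluster, see getFallbackCluster(url, props).
    //   }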

    /**
     * This method helps the client get the single cluster to fall back to.
     * <p>
     * When getting an HA group using {@link #get(String, Properties)}, it may return an empty
     * (not null or exception) value. In that case the client may choose to fall back to a single
     * cluster connection to compensate for missing cluster role records instead of throwing
     * errors.
     *
     * @param url        The HA connection url
     * @param properties The client connection properties
     * @return The connection url of the single cluster to fall back to; empty optional if the
     *         properties disable fallback
     * @throws SQLException if it fails to get HA information and/or invalid properties are seen
     */
    static Optional<String> getFallbackCluster(String url, Properties properties)
            throws SQLException {
        HAGroupInfo haGroupInfo = getHAGroupInfo(url, properties);
        String fallback = properties.getProperty(PHOENIX_HA_SHOULD_FALLBACK_WHEN_MISSING_CRR_KEY,
                PHOENIX_HA_SHOULD_FALLBACK_WHEN_MISSING_CRR_DEFAULT);
        if (!Boolean.parseBoolean(fallback)) {
            LOG.info("Fallback to single cluster not enabled for the HA group {} per"
                    + " configuration. HA url: '{}'.", haGroupInfo.getName(), url);
            return Optional.empty();
        }
        String fallbackCluster = properties.getProperty(PHOENIX_HA_FALLBACK_CLUSTER_KEY);
        if (StringUtils.isEmpty(fallbackCluster)) {
            fallbackCluster = haGroupInfo.getUrl1();
        }
        LOG.info("Falling back to single cluster '{}' for the HA group {} to serve HA connection "
                + "request against url '{}'.", fallbackCluster, haGroupInfo.getName(), url);
        return Optional.of(fallbackCluster);
    }
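    // Sketch (hypothetical values): with fallback left enabled and no explicit
    // PHOENIX_HA_FALLBACK_CLUSTER_KEY configured, url1 of the HA group is chosen:
    //
    //   Optional<String> cluster = getFallbackCluster(url, props);
    //   // -> typically Optional.of(<url1 of the group>), e.g. "zk1:2181:/hbase" for the
    //   //    sample URL above (subject to formatZookeeperUrl normalization)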

    /**
     * Get an active curator ZK client for the given properties and ZK endpoint.
     * <p>
     * This can be a cached object since Curator should be shared per cluster.
     *
     * @param jdbcUrl    the ZK endpoint host:port or the JDBC connection String host:port:/hbase
     * @param properties the properties defining timeout values and retry count
     * @return an active (possibly cached) Curator framework client
     */
    @SuppressWarnings("UnstableApiUsage")
    public static CuratorFramework getCurator(String jdbcUrl, Properties properties)
            throws IOException {
        try {
            return CURATOR_CACHE.get(jdbcUrl, () -> {
                CuratorFramework curator = createCurator(jdbcUrl, properties);
                if (!curator.blockUntilConnected(PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_DEFAULT,
                        TimeUnit.MILLISECONDS)) {
                    throw new RuntimeException("Failed to connect to the CuratorFramework in "
                            + "timeout " + PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_DEFAULT + " ms");
                }
                return curator;
            });
        } catch (Exception e) {
            LOG.error("Fail to get an active curator for url {}", jdbcUrl, e);
            // invalidate the cache when getting/creating throws exception
            CURATOR_CACHE.invalidate(jdbcUrl);
            throw new IOException(e);
        }
    }
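    // Sketch of the caching behavior above: for the same endpoint the same client instance is
    // returned until it is evicted from CURATOR_CACHE (host is hypothetical):
    //
    //   CuratorFramework c1 = getCurator("zk1:2181", props);
    //   CuratorFramework c2 = getCurator("zk1:2181", props);
    //   // c1 == c2, and the cache's removal listener closes the client upon eviction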

    /**
     * Create a curator ZK client for the given properties and ZK endpoint.
     * <p>
     * Unless the caller needs a new curator, it should use
     * {@link #getCurator(String, Properties)} instead.
     */
    private static CuratorFramework createCurator(String jdbcUrl, Properties properties) {
        // Get the ZK endpoint in host:port format by removing JDBC protocol and HBase root node
        final String zkUrl;
        if (jdbcUrl.startsWith(PhoenixRuntime.JDBC_PROTOCOL)) {
            jdbcUrl = jdbcUrl.substring(PhoenixRuntime.JDBC_PROTOCOL.length() + 1);
        }
        Preconditions.checkArgument(!StringUtils.isEmpty(jdbcUrl), "JDBC url is empty!");
        jdbcUrl = jdbcUrl.replaceAll("\\\\:", "=");
        String[] parts = jdbcUrl.split(":");
        if (parts.length == 0 || parts.length > 3) {
            throw new IllegalArgumentException("Invalid JDBC url!" + jdbcUrl);
        }
        // The URL is already normalised
        zkUrl = parts[0].replaceAll("=", ":");

        // Get timeout and retry counts
        String connectionTimeoutMsProp = properties.getProperty(
                PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_KEY);
        final int connectionTimeoutMs = !StringUtils.isEmpty(connectionTimeoutMsProp)
                ? Integer.parseInt(connectionTimeoutMsProp)
                : PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_DEFAULT;
        String sessionTimeoutMsProps = properties.getProperty(
                PHOENIX_HA_ZK_SESSION_TIMEOUT_MS_KEY);
        final int sessionTimeoutMs = !StringUtils.isEmpty(sessionTimeoutMsProps)
                ? Integer.parseInt(sessionTimeoutMsProps)
                : PHOENIX_HA_ZK_SESSION_TIMEOUT_MS_DEFAULT;
        final RetryPolicy retryPolicy = createRetryPolicy(properties);

        CuratorFramework curator = CuratorFrameworkFactory
                .builder()
                .connectString(zkUrl)
                .namespace(PHOENIX_HA_ZOOKEEPER_ZNODE_NAMESPACE)
                .connectionTimeoutMs(connectionTimeoutMs)
                .sessionTimeoutMs(sessionTimeoutMs)
                .retryPolicy(retryPolicy)
                .canBeReadOnly(true)
                .build();
        curator.start();
        return curator;
    }
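    // Sketch of the endpoint normalization above on a hypothetical input. Escaped colons ("\:")
    // inside the host list are masked as '=' so that splitting on ':' only separates the host
    // list, port, and root-node sections:
    //
    //   "host1\:2181,host2\:2181:/hbase"  --mask-->  "host1=2181,host2=2181:/hbase"
    //   split(":")                        -->         ["host1=2181,host2=2181", "/hbase"]
    //   parts[0] unmasked                 -->         "host1:2181,host2:2181" (connect string)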

    /**
     * Create a Curator retry policy from properties.
     * <p>
     * If properties is null, return a default retry policy.
     *
     * @param properties properties defining timeout and max retries
     * @return a retry policy which can be used for Curator operations
     */
    public static RetryPolicy createRetryPolicy(Properties properties) {
        if (properties == null) {
            return RETRY_POLICY;
        }
        String baseSleepTimeMsProp = properties.getProperty(
                PHOENIX_HA_ZK_RETRY_BASE_SLEEP_MS_KEY);
        int baseSleepTimeMs = StringUtils.isNotEmpty(baseSleepTimeMsProp)
                ? Integer.parseInt(baseSleepTimeMsProp)
                : PHOENIX_HA_ZK_RETRY_BASE_SLEEP_MS_DEFAULT;
        String maxRetriesProp = properties.getProperty(PHOENIX_HA_ZK_RETRY_MAX_KEY);
        int maxRetries = StringUtils.isNotEmpty(maxRetriesProp)
                ? Integer.parseInt(maxRetriesProp)
                : PHOENIX_HA_ZK_RETRY_MAX_DEFAULT;
        String maxSleepTimeMsProp = properties.getProperty(PHOENIX_HA_ZK_RETRY_MAX_SLEEP_MS_KEY);
        int maxSleepTimeMs = StringUtils.isNotEmpty(maxSleepTimeMsProp)
                ? Integer.parseInt(maxSleepTimeMsProp)
                : PHOENIX_HA_ZK_RETRY_MAX_SLEEP_MS_DEFAULT;
        return new ExponentialBackoffRetry(baseSleepTimeMs, maxRetries, maxSleepTimeMs);
    }
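    // Sketch: overriding part of the default backoff via properties (values hypothetical):
    //
    //   Properties p = new Properties();
    //   p.setProperty(PHOENIX_HA_ZK_RETRY_BASE_SLEEP_MS_KEY, "500");
    //   p.setProperty(PHOENIX_HA_ZK_RETRY_MAX_KEY, "3");
    //   RetryPolicy policy = createRetryPolicy(p);
    //   // -> exponential backoff with 500ms base, 3 retries, and the default 10s max sleep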

    /**
     * Initialize this HA group by registering ZK watchers and getting the initial cluster role
     * record.
     * <p>
     * If this is already initialized, calling this method is a no-op. This method is lock free as
     * the current thread will either return fast or wait for the in-progress initialization or
     * time out.
     */
    public void init() throws IOException {
        if (state != State.UNINITIALIZED) {
            return;
        }
        PairOfSameType<HAClusterRoleManager> newRoleManagers = new PairOfSameType<>(
                new HAClusterRoleManager(info.urls.getFirst(), properties),
                new HAClusterRoleManager(info.urls.getSecond(), properties));
        if (!roleManagers.compareAndSet(null, newRoleManagers)) {
            LOG.info("Someone already started role managers; waiting for that one...");
            waitForInitialization(properties);
            return;
        }
        Future<?> f1 = roleManagerExecutor.submit(newRoleManagers.getFirst());
        Future<?> f2 = roleManagerExecutor.submit(newRoleManagers.getSecond());
        try {
            waitForInitialization(properties);
        } catch (IOException e) {
            // An HA group that fails to initialize will not be kept in the global cache.
            // The next connection request will create and initialize a new HA group.
            // Before rethrowing the exception, the following code cancels the futures.
            f1.cancel(true);
            f2.cancel(true);
            throw e;
        }
        assert roleRecord != null;
        LOG.info("Initial cluster role for HA group {} is {}", info, roleRecord);
    }

    /**
     * Helper method that will block the current thread until the HA group is initialized.
     * <p>
     * After returning, the HA group might not be in READY state. That is possible when a new ZK
     * node change is detected, triggering the HA group to become IN_TRANSITION state.
     *
     * @param properties the connection properties
     * @throws IOException when the current HA group is not initialized before timeout
     */
    private void waitForInitialization(Properties properties) throws IOException {
        String connectionTimeoutMsProp = properties.getProperty(
                PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_KEY);
        int timeout = !StringUtils.isEmpty(connectionTimeoutMsProp)
                ? Integer.parseInt(connectionTimeoutMsProp)
                : PHOENIX_HA_ZK_CONNECTION_TIMEOUT_MS_DEFAULT;
        boolean started = false;
        try {
            started = roleManagerLatch.await(timeout, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            LOG.warn("Got interrupted when waiting for cluster role managers to start", e);
            Thread.currentThread().interrupt();
        }
        if (!started) {
            LOG.warn("Timed out {}ms waiting for HA group '{}' to be initialized.", timeout, info);
            throw new IOException("Fail to initialize HA group " + info);
        }
    }

    /**
     * Create a JDBC connection in this high availability group.
     *
     * @param properties connection properties
     * @return a JDBC connection implementation
     * @throws SQLException if it fails to create a JDBC connection
     */
    public Connection connect(Properties properties) throws SQLException {
        if (state != State.READY) {
            throw new SQLExceptionInfo
                    .Builder(SQLExceptionCode.CANNOT_ESTABLISH_CONNECTION)
                    .setMessage("HA group is not ready!")
                    .setHaGroupInfo(info.toString())
                    .build()
                    .buildException();
        }
        return roleRecord.getPolicy().provide(this, properties);
    }

    /**
     * Get a Phoenix connection against the current active HBase cluster.
     * <p>
     * If there is no active cluster, it will throw an exception instead of blocking or retrying.
     *
     * @param properties connection properties
     * @return a Phoenix connection to the current active HBase cluster
     * @throws SQLException if it fails to get a connection
     */
    PhoenixConnection connectActive(final Properties properties) throws SQLException {
        try {
            Optional<String> url = roleRecord.getActiveUrl();
            if (state == State.READY && url.isPresent()) {
                PhoenixConnection conn = connectToOneCluster(url.get(), properties);
                // After the connection is created, double check if the cluster is still ACTIVE.
                // This is to make sure the newly created connection will not be returned to the
                // client if the target cluster is not active any more, which can happen during
                // failover.
                boolean isActive;
                try {
                    isActive = isActive(conn);
                } catch (Exception e) {
                    conn.close();
                    throw e;
                }
                if (state == State.READY && isActive) {
                    return conn;
                } else {
                    conn.close();
                    throw new SQLExceptionInfo
                            .Builder(SQLExceptionCode.HA_CLOSED_AFTER_FAILOVER)
                            .setMessage("Cluster is not active any more in HA group. Please retry.")
                            .setHaGroupInfo(info.toString())
                            .build()
                            .buildException();
                }
            } else {
                LOG.error("Not able to connect to active cluster, state: {}, active exists: {}",
                        state, url.isPresent());
                throw new SQLExceptionInfo
                        .Builder(SQLExceptionCode.HA_NO_ACTIVE_CLUSTER)
                        .setMessage("Cannot connect to HA group because it has no active cluster")
                        .setHaGroupInfo(info.toString())
                        .build()
                        .buildException();
            }
        } catch (SQLException e) {
            LOG.error("Failed to connect to active cluster in HA group {}, record: {}",
                    info, roleRecord, e);
            throw new SQLExceptionInfo
                    .Builder(SQLExceptionCode.CANNOT_ESTABLISH_CONNECTION)
                    .setMessage("Failed to connect to active cluster in HA group")
                    .setHaGroupInfo(info.toString())
                    .setRootCause(e)
                    .build()
                    .buildException();
        }
    }

    /**
     * @return true if the given phoenix connection points to the ACTIVE cluster, else false
     */
    boolean isActive(PhoenixConnection connection) {
        if (state != State.READY || connection == null) {
            return false;
        }
        return roleRecord.getActiveUrl()
                .equals(Optional.of(JDBCUtil.formatZookeeperUrl(connection.getURL())));
    }
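    // Why the double check in connectActive() matters -- a sketch of the race it guards:
    //   t0: roleRecord says cluster A is ACTIVE; connectToOneCluster(A) starts
    //   t1: a ZK watcher applies a newer record; A becomes STANDBY, B becomes ACTIVE
    //   t2: the connection to A finishes creating; isActive(conn) is now false, so the
    //       connection is closed and HA_CLOSED_AFTER_FAILOVER is thrown for the client to retry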

    /**
     * Connect to an HBase cluster in this HA group with the given url and client properties.
     * <p>
     * The URL should belong to one of the two ZK clusters in this HA group. It returns the Phoenix
     * connection to the given cluster without checking the context of the cluster's role. Please
     * use {@link #connectActive(Properties)} to connect to the ACTIVE cluster.
     */
    PhoenixConnection connectToOneCluster(String url, Properties properties) throws SQLException {
        Preconditions.checkNotNull(url);
        if (url.startsWith(PhoenixRuntime.JDBC_PROTOCOL)) {
            Preconditions.checkArgument(url.length() > PhoenixRuntime.JDBC_PROTOCOL.length(),
                    "The URL '" + url + "' is not a valid Phoenix connection string");
        }
        url = JDBCUtil.formatZookeeperUrl(url);
        Preconditions.checkArgument(url.equals(info.getUrl1()) || url.equals(info.getUrl2()),
                "The URL '" + url + "' does not belong to this HA group " + info);
        String jdbcString = info.getJDBCUrl(url);

        ClusterRole role = roleRecord.getRole(url);
        if (!role.canConnect()) {
            throw new SQLExceptionInfo.Builder(SQLExceptionCode.HA_CLUSTER_CAN_NOT_CONNECT)
                    .setMessage("Can not connect to cluster '" + url + "' in '" + role + "' role")
                    .build()
                    .buildException();
        }
        // Get the driver instead of using PhoenixDriver.INSTANCE since it can be a test or
        // mocked driver
        Driver driver = DriverManager.getDriver(jdbcString);
        Preconditions.checkArgument(driver instanceof PhoenixEmbeddedDriver,
                "No JDBC driver is registered for Phoenix high availability (HA) framework");
        return ((PhoenixEmbeddedDriver) driver).getConnectionQueryServices(jdbcString, properties)
                .connect(jdbcString, properties);
    }

    @VisibleForTesting
    HAGroupInfo getGroupInfo() {
        return info;
    }

    Properties getProperties() {
        return properties;
    }

    public ClusterRoleRecord getRoleRecord() {
        return roleRecord;
    }

    /**
     * Package private close method.
     * <p>
     * Once this HA group is closed, it can not be re-opened again. Use a new object if necessary.
     * This method is package private because we do not want to expose the lifecycle management
     * methods to the public; the constructor is also private (or package-private, visible for
     * testing). The lifecycle management is confined to this class because an HA group is a
     * shared resource: someone calling close on it would make it unusable for everyone, since the
     * state would become CLOSED.
     */
    void close() {
        roleManagerExecutor.shutdownNow();
        try {
            // TODO: Parameterize and set in future work item for pluggable
            if (!roleManagerExecutor.awaitTermination(PHOENIX_HA_ZK_SESSION_TIMEOUT_MS_DEFAULT,
                    TimeUnit.MILLISECONDS)) {
                LOG.error("Fail to shut down role managers service for HA group: {}", info);
            }
        } catch (InterruptedException e) {
            LOG.warn("HA group {} close() got interrupted when closing role managers", info, e);
            // (Re-)Cancel if the current thread was also interrupted
            roleManagerExecutor.shutdownNow();
            // Preserve interrupt status
            Thread.currentThread().interrupt();
        }
        state = State.CLOSED;
    }

    @Override
    public String toString() {
        return roleRecord == null
                ? "HighAvailabilityGroup{roleRecord=null, info=" + info + ", state=" + state + "}"
                : "HighAvailabilityGroup{roleRecord=" + roleRecord + ", state=" + state + "}";
    }

    /**
     * Set the new cluster role record for this HA group.
     * <p>
     * Calling this method will put the HA group in a transition state where no request can be
     * served. The new record may come from either of the two clusters as seen by the ZK watcher.
     *
     * @param newRoleRecord the new cluster role record to set
     * @return true if the new record is set as the current one; false otherwise
     */
    private synchronized boolean applyClusterRoleRecord(@NonNull ClusterRoleRecord newRoleRecord) {
        if (roleRecord == null) {
            roleRecord = newRoleRecord;
            state = State.READY;
            LOG.info("HA group {} is now in {} state after getting initial V{} role record: {}",
                    info, state, roleRecord.getVersion(), roleRecord);
            LOG.debug("HA group {} is ready", this);
            return true;
        }
        if (!newRoleRecord.isNewerThan(roleRecord)) {
            LOG.warn("Does not apply new cluster role record as it does not have higher version. "
                    + "Existing record: {}, new record: {}", roleRecord, newRoleRecord);
            return false;
        }
        if (!roleRecord.hasSameInfo(newRoleRecord)) {
            LOG.error("New record {} has different HA group information from old record {}",
                    newRoleRecord, roleRecord);
            return false;
        }

        final ClusterRoleRecord oldRecord = roleRecord;
        state = State.IN_TRANSITION;
        LOG.info("HA group {} is in {} to set V{} record", info, state,
                newRoleRecord.getVersion());
        Future<?> future = nodeChangedExecutor.submit(() -> {
            try {
                roleRecord.getPolicy().transitClusterRole(this, roleRecord, newRoleRecord);
            } catch (SQLException e) {
                throw new CompletionException(e);
            }
        });
        // TODO: save timeout in the HA group info (aka cluster role record) instead of properties
        String transitionTimeoutProp = properties.getProperty(
                PHOENIX_HA_TRANSITION_TIMEOUT_MS_KEY);
        long maxTransitionTimeMs = StringUtils.isNotEmpty(transitionTimeoutProp)
                ? Long.parseLong(transitionTimeoutProp)
                : PHOENIX_HA_TRANSITION_TIMEOUT_MS_DEFAULT;
        try {
            future.get(maxTransitionTimeMs, TimeUnit.MILLISECONDS);
        } catch (InterruptedException ie) {
            LOG.error("Got interrupted when transiting cluster roles for HA group {}", info, ie);
            future.cancel(true);
            Thread.currentThread().interrupt();
            return false;
        } catch (ExecutionException | TimeoutException e) {
            LOG.error("HA group {} failed to transit cluster roles per policy {} to new record {}",
                    info, roleRecord.getPolicy(), newRoleRecord, e);
            // Calling back the HA policy function for cluster switch is conducted with best
            // effort. The HA group continues transition even when its HA policy fails to deal
            // with the context switch (e.g. to close existing connections). The goal here is to
            // gain higher availability even though existing resources against the previous
            // ACTIVE cluster may not have been closed cleanly.
        }

        roleRecord = newRoleRecord;
        state = State.READY;
        LOG.info("HA group {} is in {} state after applying V{} role record. Old: {}, new: {}",
                info, state, roleRecord.getVersion(), oldRecord, roleRecord);
        LOG.debug("HA group is ready: {}", this);
        return true;
    }
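    // Sketch of the record gating above (record contents hypothetical):
    //   current V2 (A=ACTIVE, B=STANDBY), new V3 (A=STANDBY, B=ACTIVE)  -> applied
    //   current V2,                       new V2 or V1                  -> rejected (not newer)
    //   current V2,                       new V3 for a different group  -> rejected (hasSameInfo)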

    /**
     * Local state of this HA group object, which transits upon an explicit call (e.g. init) or
     * when a cluster role change is detected.
     * <p>
     * - UNINITIALIZED is the state when this HA group has not been initialized. Once the HA group
     *   is initialized, it will never go to this state again.
     * - READY is the state when this HA group can serve client requests. There is not necessarily
     *   an active HBase cluster since a standby cluster may be sufficient per HA policy.
     * - IN_TRANSITION is the state where the HA group is dealing with cluster role changes and
     *   all client connection requests are rejected.
     * - CLOSED is the state where the HA group is closed. Once the HA group is closed, it will
     *   never leave this state.
     */
    enum State {UNINITIALIZED, READY, IN_TRANSITION, CLOSED}
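    // Transition sketch for the states above:
    //   UNINITIALIZED -> READY            (first role record applied)
    //   READY -> IN_TRANSITION -> READY   (newer role record detected and applied)
    //   any state -> CLOSED               (close(); terminal)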

    /**
     * An HAGroupInfo contains information of an HA group.
     * <p>
     * It is constructed based on client input, including the JDBC connection string and
     * properties. Objects of this class are used as the keys of the HA group cache
     * {@link #GROUPS}.
     * <p>
     * This class is immutable.
     */
    @VisibleForTesting
    static final class HAGroupInfo {
        private final String name;
        private final PairOfSameType<String> urls;
        private final String additionalJDBCParams;

        HAGroupInfo(String name, String url1, String url2, String additionalJDBCParams) {
            Preconditions.checkNotNull(name);
            Preconditions.checkNotNull(url1);
            Preconditions.checkNotNull(url2);
            this.name = name;
            url1 = JDBCUtil.formatZookeeperUrl(url1);
            url2 = JDBCUtil.formatZookeeperUrl(url2);
            Preconditions.checkArgument(!url1.equals(url2), "Two clusters have the same ZK!");
            // Ignore the given order of url1 and url2, and reorder for equals comparison.
            if (url1.compareTo(url2) > 0) {
                this.urls = new PairOfSameType<>(url2, url1);
            } else {
                this.urls = new PairOfSameType<>(url1, url2);
            }
            this.additionalJDBCParams = additionalJDBCParams;
        }

        HAGroupInfo(String name, String url1, String url2) {
            this(name, url1, url2, null);
        }

        public String getName() {
            return name;
        }

        public String getUrl1() {
            return urls.getFirst();
        }

        public String getUrl2() {
            return urls.getSecond();
        }

        public String getJDBCUrl(String zkUrl) {
            Preconditions.checkArgument(zkUrl.equals(getUrl1()) || zkUrl.equals(getUrl2()),
                    "The URL '" + zkUrl + "' does not belong to this HA group " + this);
            StringBuilder sb = new StringBuilder();
            sb.append(PhoenixRuntime.JDBC_PROTOCOL_ZK);
            sb.append(PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR);
            sb.append(zkUrl);
            if (!Strings.isNullOrEmpty(additionalJDBCParams)) {
                sb.append(PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR);
                sb.append(additionalJDBCParams);
            }
            return sb.toString();
        }

        public String getJDBCUrl1() {
            return getJDBCUrl(getUrl1());
        }

        public String getJDBCUrl2() {
            return getJDBCUrl(getUrl2());
        }

        /**
         * Helper method to return the znode path in the Phoenix HA namespace.
         */
        String getZkPath() {
            return ZKPaths.PATH_SEPARATOR + name;
        }

        @Override
        public String toString() {
            return String.format("%s[%s|%s]", name, urls.getFirst(), urls.getSecond());
        }

        @Override
        public boolean equals(Object other) {
            if (other == null) {
                return false;
            }
            if (other == this) {
                return true;
            }
            if (other.getClass() != getClass()) {
                return false;
            }
            HAGroupInfo otherInfo = (HAGroupInfo) other;
            return new EqualsBuilder()
                    .append(name, otherInfo.name)
                    .append(urls, otherInfo.urls)
                    .isEquals();
        }

        @Override
        public int hashCode() {
            return new HashCodeBuilder(17, 37)
                    .append(name)
                    .append(urls)
                    .hashCode();
        }
    }
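    // Sketch: the constructor orders the two URLs, so equality is insensitive to the order the
    // client lists the clusters in (hosts hypothetical):
    //
    //   new HAGroupInfo("g1", "zkB:2181:/hbase", "zkA:2181:/hbase")
    //       .equals(new HAGroupInfo("g1", "zkA:2181:/hbase", "zkB:2181:/hbase"))   // true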

    /**
     * Maintains the client view of cluster roles for the HA group using data retrieved from
     * one ZK.
     * <p>
     * It is a runnable that keeps setting up the curator and the node cache. It also registers
     * the node watcher so any znode data change will trigger a callback function updating the
     * HA group.
     */
    private final class HAClusterRoleManager implements Runnable {
        private final String jdbcUrl;
        private final Properties properties;
        private NodeCache cache;

        /**
         * Constructor which creates and starts the ZK watcher.
         *
         * @param jdbcUrl    JDBC url without the jdbc:phoenix prefix, which may be in
         *                   host:port:/hbase format
         * @param properties The properties defining ZK client timeouts and retries
         */
        HAClusterRoleManager(String jdbcUrl, Properties properties) {
            this.jdbcUrl = jdbcUrl;
            this.properties = properties;
        }

        @Override
        public void run() {
            final String zpath = info.getZkPath();
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    cache = new NodeCache(getCurator(jdbcUrl, properties), zpath);
                    cache.getListenable().addListener(this::nodeChanged);
                    cache.start();
                    return; // return after building the initial node cache
                } catch (InterruptedException e) {
                    LOG.warn("HA cluster role manager thread for '{}' is interrupted, exiting",
                            jdbcUrl, e);
                    break;
                } catch (Throwable t) {
                    LOG.warn("Fail to start node cache on '{}' for '{}'. Retry",
                            jdbcUrl, zpath, t);
                    try {
                        // TODO: do better than fixed time sleep
                        Thread.sleep(1_000);
                    } catch (InterruptedException e) {
                        LOG.warn("HA cluster role manager thread for '{}' is interrupted, exiting",
                                jdbcUrl, e);
                        break;
                    }
                }
            }
        }

        /**
         * Callback function invoked when a cluster role change is notified by this ZK cluster.
         */
        private void nodeChanged() {
            byte[] data = cache.getCurrentData().getData();
            Optional<ClusterRoleRecord> newRecordOptional = ClusterRoleRecord.fromJson(data);
            if (!newRecordOptional.isPresent()) {
                LOG.error("Fail to deserialize new record; keep current record {}", roleRecord);
                return;
            }
            ClusterRoleRecord newRecord = newRecordOptional.get();
            LOG.info("HA group {} got a record from cluster {}: {}",
                    info.name, jdbcUrl, newRecord);

            if (applyClusterRoleRecord(newRecord)) {
                LOG.info("Successfully applied new cluster role record from cluster '{}', "
                        + "new record: {}", jdbcUrl, newRecord);
                roleManagerLatch.countDown();
            }
        }
    }

}
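// End-to-end usage sketch (hypothetical hosts; assumes the Phoenix HA driver is registered
// with DriverManager and accepts the bracketed HA URL directly):
//
//   Properties props = new Properties();
//   props.setProperty(HighAvailabilityGroup.PHOENIX_HA_GROUP_ATTR, "group1");
//   String url = "jdbc:phoenix:[zk1:2181:/hbase|zk2:2181:/hbase]";
//   try (Connection conn = DriverManager.getConnection(url, props)) {
//       // queries are served by whichever cluster the current role record and HA policy select
//   }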




