
1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.client;
20  
21  import java.io.Closeable;
22  import java.io.IOException;
23  import java.io.InterruptedIOException;
24  import java.lang.reflect.Constructor;
25  import java.lang.reflect.UndeclaredThrowableException;
26  import java.net.SocketException;
27  import java.util.ArrayList;
28  import java.util.Date;
29  import java.util.HashSet;
30  import java.util.LinkedHashMap;
31  import java.util.List;
32  import java.util.Map;
33  import java.util.Map.Entry;
34  import java.util.NavigableMap;
35  import java.util.Set;
36  import java.util.concurrent.ConcurrentHashMap;
37  import java.util.concurrent.ConcurrentMap;
38  import java.util.concurrent.ConcurrentSkipListMap;
39  import java.util.concurrent.ConcurrentSkipListSet;
40  import java.util.concurrent.CopyOnWriteArraySet;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.LinkedBlockingQueue;
43  import java.util.concurrent.ThreadPoolExecutor;
44  import java.util.concurrent.TimeUnit;
45  import java.util.concurrent.atomic.AtomicBoolean;
46  import java.util.concurrent.atomic.AtomicInteger;
47  
48  import org.apache.commons.logging.Log;
49  import org.apache.commons.logging.LogFactory;
50  import org.apache.hadoop.hbase.classification.InterfaceAudience;
51  import org.apache.hadoop.hbase.classification.InterfaceStability;
52  import org.apache.hadoop.conf.Configuration;
53  import org.apache.hadoop.hbase.Chore;
54  import org.apache.hadoop.hbase.HBaseConfiguration;
55  import org.apache.hadoop.hbase.HConstants;
56  import org.apache.hadoop.hbase.HRegionInfo;
57  import org.apache.hadoop.hbase.HRegionLocation;
58  import org.apache.hadoop.hbase.HTableDescriptor;
59  import org.apache.hadoop.hbase.MasterNotRunningException;
60  import org.apache.hadoop.hbase.RegionTooBusyException;
61  import org.apache.hadoop.hbase.ServerName;
62  import org.apache.hadoop.hbase.Stoppable;
63  import org.apache.hadoop.hbase.TableName;
64  import org.apache.hadoop.hbase.TableNotEnabledException;
65  import org.apache.hadoop.hbase.TableNotFoundException;
66  import org.apache.hadoop.hbase.ZooKeeperConnectionException;
67  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
68  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
69  import org.apache.hadoop.hbase.client.coprocessor.Batch;
70  import org.apache.hadoop.hbase.exceptions.RegionMovedException;
71  import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
72  import org.apache.hadoop.hbase.ipc.RpcClient;
73  import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
74  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
75  import org.apache.hadoop.hbase.protobuf.RequestConverter;
76  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
77  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService;
78  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
79  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
80  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
81  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
82  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
83  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
84  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
85  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
86  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
87  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
88  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
89  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
90  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
91  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
92  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
93  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
94  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
95  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
96  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
97  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
98  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
99  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
100 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
101 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
102 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
103 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
104 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
105 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
106 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
107 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
108 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
109 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
110 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
111 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
112 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
113 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
114 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.*;
117 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
118 import org.apache.hadoop.hbase.security.User;
119 import org.apache.hadoop.hbase.security.UserProvider;
120 import org.apache.hadoop.hbase.util.Bytes;
121 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
122 import org.apache.hadoop.hbase.util.ExceptionUtil;
123 import org.apache.hadoop.hbase.util.Threads;
124 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
125 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
126 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
127 import org.apache.hadoop.ipc.RemoteException;
128 import org.apache.zookeeper.KeeperException;
129 
130 import com.google.common.annotations.VisibleForTesting;
131 import com.google.protobuf.BlockingRpcChannel;
132 import com.google.protobuf.RpcController;
133 import com.google.protobuf.ServiceException;
134 
135 /**
136  * A non-instantiable class that manages creation of {@link HConnection}s.
137  * <p>The simplest way to use this class is by using {@link #createConnection(Configuration)}.
138  * This creates a new {@link HConnection} to the cluster that is managed by the caller.
139  * From this {@link HConnection} {@link HTableInterface} implementations are retrieved
140  * with {@link HConnection#getTable(byte[])}. Example:
141  * <pre>
142  * {@code
143  * HConnection connection = HConnectionManager.createConnection(config);
144  * HTableInterface table = connection.getTable("table1");
145  * try {
146  *   // Use the table as needed, for a single operation and a single thread
147  * } finally {
148  *   table.close();
149  *   connection.close();
150  * }
151  * }</pre>
152  * <p>This class has a static Map of {@link HConnection} instances keyed by
153  * {@link HConnectionKey}; an {@link HConnectionKey} is identified by a set of
154  * {@link Configuration} properties. Invocations of {@link #getConnection(Configuration)}
155  * that pass the same {@link Configuration} instance will return the same
156  * {@link HConnection} instance ONLY WHEN the set of properties is the same
157  * (i.e. if you change properties in your {@link Configuration} instance, such as the RPC
158  * timeout or the codec used, HBase will create a new {@link HConnection} instance; for
159  * details on how this is done see {@link HConnectionKey}).
160  * <p>Sharing {@link HConnection} instances is usually what you want; all clients
161  * of the {@link HConnection} instances share the HConnection's cache of Region
162  * locations rather than each having to discover for itself the location of meta, etc.
163  * But sharing connections makes clean up of {@link HConnection} instances a little awkward.
164  * Currently, clients clean up by calling {@link #deleteConnection(Configuration)}. This will
165  * shut down the zookeeper connection the HConnection was using and clean up all
166  * HConnection resources as well as stopping proxies to servers out on the
167  * cluster. Not running the cleanup will not end the world; it'll
168  * just stall the closeup some and spew some zookeeper connection failed
169  * messages into the log.  Running the cleanup on a {@link HConnection} that is
170  * subsequently used by another client will cause breakage, so be careful about
171  * running cleanup.
172  * <p>To create a {@link HConnection} that is not shared by others, you can
173  * set property "hbase.client.instance.id" to a unique value for your {@link Configuration}
174  * instance, like the following:
175  * <pre>
176  * {@code
177  * conf.set("hbase.client.instance.id", "12345");
178  * HConnection connection = HConnectionManager.getConnection(conf);
179  * // Use the connection to your heart's delight and then when done...
180  * HConnectionManager.deleteConnection(conf);
182  * }
183  * </pre>
184  * <p>Cleanup used to be done inside a shutdown hook.  On startup we'd
185  * register a shutdown hook that called {@link #deleteAllConnections()}
186  * on its way out, but the order in which shutdown hooks run is not defined, which
187  * was problematic for clients of HConnection that wanted to register their own
188  * shutdown hooks. So we removed ours, though this shifts the onus for
189  * cleanup to the client.
190  */
191 @SuppressWarnings("serial")
192 @InterfaceAudience.Public
193 @InterfaceStability.Evolving
194 public class HConnectionManager {
195   static final Log LOG = LogFactory.getLog(HConnectionManager.class);
196 
197   public static final String RETRIES_BY_SERVER_KEY = "hbase.client.retries.by.server";
198   private static final String CLIENT_NONCES_ENABLED_KEY = "hbase.client.nonces.enabled";
199 
200   // An LRU Map of HConnectionKey -> HConnection (TableServer).  All
201   // access must be synchronized.  This map is not private because tests
202   // need to be able to tinker with it.
203   static final Map<HConnectionKey, HConnectionImplementation> CONNECTION_INSTANCES;
204 
205   public static final int MAX_CACHED_CONNECTION_INSTANCES;
206 
207   /**
208    * Global nonceGenerator shared per client process. Currently there's no reason to limit its scope.
209    * Once it's set under nonceGeneratorCreateLock, it is never unset or changed.
210    */
211   private static volatile NonceGenerator nonceGenerator = null;
212   /** The nonce generator lock. Only taken when creating HConnection, which gets a private copy. */
213   private static final Object nonceGeneratorCreateLock = new Object();
214 
215   static {
216     // We set instances to one more than the value specified for {@link
217     // HConstants#ZOOKEEPER_MAX_CLIENT_CNXNS}. By default, the maximum number of
218     // connections from one client to the zk ensemble is 30, so in that case we
219     // should run into zk issues before the LRU hits this value of 31.
220     MAX_CACHED_CONNECTION_INSTANCES = HBaseConfiguration.create().getInt(
221       HConstants.ZOOKEEPER_MAX_CLIENT_CNXNS, HConstants.DEFAULT_ZOOKEPER_MAX_CLIENT_CNXNS) + 1;
222     CONNECTION_INSTANCES = new LinkedHashMap<HConnectionKey, HConnectionImplementation>(
223         (int) (MAX_CACHED_CONNECTION_INSTANCES / 0.75F) + 1, 0.75F, true) {
224       @Override
225       protected boolean removeEldestEntry(
226           Map.Entry<HConnectionKey, HConnectionImplementation> eldest) {
227          return size() > MAX_CACHED_CONNECTION_INSTANCES;
228        }
229     };
230   }
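      // For illustration, the access-ordered LRU idiom used above, in isolation
      // (a minimal sketch; CAPACITY is a hypothetical bound, not a field of this class):
      //
      //   Map<String, String> lru =
      //       new LinkedHashMap<String, String>(16, 0.75F, true /* access order */) {
      //         @Override
      //         protected boolean removeEldestEntry(Map.Entry<String, String> eldest) {
      //           return size() > CAPACITY;  // evict the least-recently-used entry
      //         }
      //       };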
231 
232   /*
233    * Non-instantiable.
234    */
235   private HConnectionManager() {
236     super();
237   }
238 
239   /**
240    * @param conn The connection for which to replace the generator.
241    * @param cnm Replaces the nonce generator used, for testing.
242    * @return old nonce generator.
243    */
244   @VisibleForTesting
245   public static NonceGenerator injectNonceGeneratorForTesting(
246       HConnection conn, NonceGenerator cnm) {
247     NonceGenerator ng = conn.getNonceGenerator();
248     LOG.warn("Nonce generator is being replaced by test code for " + cnm.getClass().getName());
249     ((HConnectionImplementation)conn).nonceGenerator = cnm;
250     return ng;
251   }
252 
253   /**
254    * Get the connection that goes with the passed <code>conf</code> configuration instance.
255    * If no current connection exists, method creates a new connection and keys it using
256    * connection-specific properties from the passed {@link Configuration}; see
257    * {@link HConnectionKey}.
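       * <p>A minimal sketch of the reference-counted life cycle:
       * <pre>
       * {@code
       * HConnection c1 = HConnectionManager.getConnection(conf);
       * HConnection c2 = HConnectionManager.getConnection(conf); // same cached instance as c1
       * HConnectionManager.deleteConnection(conf); // decrements the reference count
       * HConnectionManager.deleteConnection(conf); // count hits zero; the connection is closed
       * }</pre>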
258    * @param conf configuration
259    * @return HConnection object for <code>conf</code>
260    * @throws ZooKeeperConnectionException if connecting to zookeeper fails
       * @deprecated Use {@link #createConnection(Configuration)} instead.
261    */
262   @Deprecated
263   public static HConnection getConnection(final Configuration conf)
264   throws IOException {
265     HConnectionKey connectionKey = new HConnectionKey(conf);
266     synchronized (CONNECTION_INSTANCES) {
267       HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
268       if (connection == null) {
269         connection = (HConnectionImplementation)createConnection(conf, true);
270         CONNECTION_INSTANCES.put(connectionKey, connection);
271       } else if (connection.isClosed()) {
272         HConnectionManager.deleteConnection(connectionKey, true);
273         connection = (HConnectionImplementation)createConnection(conf, true);
274         CONNECTION_INSTANCES.put(connectionKey, connection);
275       }
276       connection.incCount();
277       return connection;
278     }
279   }
280 
281   /**
282    * Create a new HConnection instance using the passed <code>conf</code> instance.
283    * <p>Note: This bypasses the usual HConnection life cycle management done by
284    * {@link #getConnection(Configuration)}. The caller is responsible for
285    * calling {@link HConnection#close()} on the returned connection instance.
286    *
287    * This is the recommended way to create HConnections.
288    * <pre>{@code
289    * HConnection connection = HConnectionManager.createConnection(conf);
290    * HTableInterface table = connection.getTable("mytable");
291    * table.get(...);
292    * ...
293    * table.close();
294    * connection.close();
295    * }</pre>
296    *
297    * @param conf configuration
298    * @return HConnection object for <code>conf</code>
299    * @throws ZooKeeperConnectionException
300    */
301   public static HConnection createConnection(Configuration conf)
302   throws IOException {
303     UserProvider provider = UserProvider.instantiate(conf);
304     return createConnection(conf, false, null, provider.getCurrent());
305   }
306 
307   /**
308    * Create a new HConnection instance using the passed <code>conf</code> instance.
309    * <p>Note: This bypasses the usual HConnection life cycle management done by
310    * {@link #getConnection(Configuration)}. The caller is responsible for
311    * calling {@link HConnection#close()} on the returned connection instance.
312    * This is the recommended way to create HConnections.
313    * <pre>{@code
314    * ExecutorService pool = ...;
315    * HConnection connection = HConnectionManager.createConnection(conf, pool);
316    * HTableInterface table = connection.getTable("mytable");
317    * table.get(...);
318    * ...
319    * table.close();
320    * connection.close();
321    * }</pre>
322    * @param conf configuration
323    * @param pool the thread pool to use for batch operation in HTables used via this HConnection
324    * @return HConnection object for <code>conf</code>
325    * @throws ZooKeeperConnectionException
326    */
327   public static HConnection createConnection(Configuration conf, ExecutorService pool)
328   throws IOException {
329     UserProvider provider = UserProvider.instantiate(conf);
330     return createConnection(conf, false, pool, provider.getCurrent());
331   }
332 
333   /**
334    * Create a new HConnection instance using the passed <code>conf</code> instance.
335    * <p>Note: This bypasses the usual HConnection life cycle management done by
336    * {@link #getConnection(Configuration)}. The caller is responsible for
337    * calling {@link HConnection#close()} on the returned connection instance.
338    * This is the recommended way to create HConnections.
339    * <pre>{@code
340    * HConnection connection = HConnectionManager.createConnection(conf, user);
342    * HTableInterface table = connection.getTable("mytable");
343    * table.get(...);
344    * ...
345    * table.close();
346    * connection.close();
347    * }</pre>
348    * @param conf configuration
349    * @param user the user the connection is for
350    * @return HConnection object for <code>conf</code>
351    * @throws ZooKeeperConnectionException
352    */
353   public static HConnection createConnection(Configuration conf, User user)
354   throws IOException {
355     return createConnection(conf, false, null, user);
356   }
357 
358   /**
359    * Create a new HConnection instance using the passed <code>conf</code> instance.
360    * <p>Note: This bypasses the usual HConnection life cycle management done by
361    * {@link #getConnection(Configuration)}. The caller is responsible for
362    * calling {@link HConnection#close()} on the returned connection instance.
363    * This is the recommended way to create HConnections.
364    * <pre>{@code
365    * ExecutorService pool = ...;
366    * HConnection connection = HConnectionManager.createConnection(conf, pool, user);
367    * HTableInterface table = connection.getTable("mytable");
368    * table.get(...);
369    * ...
370    * table.close();
371    * connection.close();
372    * }</pre>
373    * @param conf configuration
374    * @param pool the thread pool to use for batch operation in HTables used via this HConnection
375    * @param user the user the connection is for
376    * @return HConnection object for <code>conf</code>
377    * @throws ZooKeeperConnectionException
378    */
379   public static HConnection createConnection(Configuration conf, ExecutorService pool, User user)
380   throws IOException {
381     return createConnection(conf, false, pool, user);
382   }
383 
384   @Deprecated
385   static HConnection createConnection(final Configuration conf, final boolean managed)
386       throws IOException {
387     UserProvider provider = UserProvider.instantiate(conf);
388     return createConnection(conf, managed, null, provider.getCurrent());
389   }
390 
391   @Deprecated
392   static HConnection createConnection(final Configuration conf, final boolean managed,
393       final ExecutorService pool, final User user)
394   throws IOException {
395     String className = conf.get("hbase.client.connection.impl",
396       HConnectionManager.HConnectionImplementation.class.getName());
397     Class<?> clazz = null;
398     try {
399       clazz = Class.forName(className);
400     } catch (ClassNotFoundException e) {
401       throw new IOException(e);
402     }
403     try {
404       // Default HCM#HCI is not accessible; make it so before invoking.
405       Constructor<?> constructor =
406         clazz.getDeclaredConstructor(Configuration.class,
407           boolean.class, ExecutorService.class, User.class);
408       constructor.setAccessible(true);
409       return (HConnection) constructor.newInstance(conf, managed, pool, user);
410     } catch (Exception e) {
411       throw new IOException(e);
412     }
413   }
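      // For example, tests can plug in an alternate connection implementation through the
      // "hbase.client.connection.impl" key read above (the class name here is hypothetical):
      //   conf.set("hbase.client.connection.impl", MyTestHConnection.class.getName());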
414 
415   /**
416    * Delete connection information for the instance specified by passed configuration.
417    * If there are no more references to the designated connection, this method will
418    * then close connection to the zookeeper ensemble and let go of all associated resources.
419    *
420    * @param conf configuration whose identity is used to find {@link HConnection} instance.
421    * @deprecated
422    */
423   public static void deleteConnection(Configuration conf) {
424     deleteConnection(new HConnectionKey(conf), false);
425   }
426 
427   /**
428    * Cleanup a known stale connection.
429    * This will then close connection to the zookeeper ensemble and let go of all resources.
430    *
431    * @param connection the known stale connection
432    * @deprecated
433    */
434   public static void deleteStaleConnection(HConnection connection) {
435     deleteConnection(connection, true);
436   }
437 
438   /**
439    * Delete information for all connections. Whether each connection is actually closed
440    * depends on the staleConnection flag and its reference count. You should usually
441    * call this with staleConnection set to true.
442    * @deprecated
443    */
444   public static void deleteAllConnections(boolean staleConnection) {
445     synchronized (CONNECTION_INSTANCES) {
446       Set<HConnectionKey> connectionKeys = new HashSet<HConnectionKey>();
447       connectionKeys.addAll(CONNECTION_INSTANCES.keySet());
448       for (HConnectionKey connectionKey : connectionKeys) {
449         deleteConnection(connectionKey, staleConnection);
450       }
451       CONNECTION_INSTANCES.clear();
452     }
453   }
454 
455   /**
456    * Delete information for all connections.
457    * @deprecated kept for backward compatibility, but the behavior is broken. HBASE-8983
458    */
459   @Deprecated
460   public static void deleteAllConnections() {
461     deleteAllConnections(false);
462   }
463 
464 
465   @Deprecated
466   private static void deleteConnection(HConnection connection, boolean staleConnection) {
467     synchronized (CONNECTION_INSTANCES) {
468       for (Entry<HConnectionKey, HConnectionImplementation> e: CONNECTION_INSTANCES.entrySet()) {
469         if (e.getValue() == connection) {
470           deleteConnection(e.getKey(), staleConnection);
471           break;
472         }
473       }
474     }
475   }
476 
477   @Deprecated
478   private static void deleteConnection(HConnectionKey connectionKey, boolean staleConnection) {
479     synchronized (CONNECTION_INSTANCES) {
480       HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
481       if (connection != null) {
482         connection.decCount();
483         if (connection.isZeroReference() || staleConnection) {
484           CONNECTION_INSTANCES.remove(connectionKey);
485           connection.internalClose();
486         }
487       } else {
488         LOG.error("Connection not found in the list, can't delete it " +
489           "(connection key=" + connectionKey + "). Maybe the key was modified?", new Exception());
490       }
491     }
492   }
493 
494   /**
495    * It is provided for unit test cases which verify the behavior of region
496    * location cache prefetch.
497    * @return Number of cached regions for the table.
498    * @throws ZooKeeperConnectionException
499    */
500   static int getCachedRegionCount(Configuration conf, final TableName tableName)
501   throws IOException {
502     return execute(new HConnectable<Integer>(conf) {
503       @Override
504       public Integer connect(HConnection connection) {
505         return ((HConnectionImplementation)connection).getNumberOfCachedRegionLocations(tableName);
506       }
507     });
508   }
509 
510   /**
511    * This convenience method invokes the given {@link HConnectable#connect}
512    * implementation using a {@link HConnection} instance that lasts just for the
513    * duration of the invocation.
514    *
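       * <p>For example, a minimal sketch mirroring the in-file usage in
       * {@link #getCachedRegionCount}:
       * <pre>
       * {@code
       * Boolean running = HConnectionManager.execute(new HConnectable<Boolean>(conf) {
       *   public Boolean connect(HConnection connection) throws IOException {
       *     return connection.isMasterRunning();
       *   }
       * });
       * }</pre>
       *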
515    * @param <T> the return type of the connect method
516    * @param connectable the {@link HConnectable} instance
517    * @return the value returned by the connect method
518    * @throws IOException
519    */
520   @InterfaceAudience.Private
521   public static <T> T execute(HConnectable<T> connectable) throws IOException {
522     if (connectable == null || connectable.conf == null) {
523       return null;
524     }
525     Configuration conf = connectable.conf;
526     HConnection connection = HConnectionManager.getConnection(conf);
527     boolean connectSucceeded = false;
528     try {
529       T returnValue = connectable.connect(connection);
530       connectSucceeded = true;
531       return returnValue;
532     } finally {
533       try {
534         connection.close();
535       } catch (Exception e) {
536         ExceptionUtil.rethrowIfInterrupt(e);
537         if (connectSucceeded) {
538           throw new IOException("The connection to " + connection
539               + " could not be closed.", e);
540         }
541       }
542     }
543   }
544 
545   /** Encapsulates connection to zookeeper and regionservers.*/
546   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
547       value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
548       justification="Access to the concurrent hash map is under a lock so should be fine.")
549   public static class HConnectionImplementation implements HConnection, Closeable {
550     static final Log LOG = LogFactory.getLog(HConnectionImplementation.class);
551     private final long pause;
552     private final int numTries;
553     final int rpcTimeout;
554     private NonceGenerator nonceGenerator = null;
555     private final boolean usePrefetch;
556     private final int prefetchRegionLimit;
557 
558     private volatile boolean closed;
559     private volatile boolean aborted;
560 
561     // package protected for the tests
562     ClusterStatusListener clusterStatusListener;
563 
564     private final Object userRegionLock = new Object();
565 
566     // We have a single lock for master & zk to prevent deadlocks. Having
567     //  one lock for ZK and one lock for master is not possible:
568     //  When creating a connection to master, we need a connection to ZK to get
569     //  its address. But another thread could have taken the ZK lock, and could
570     //  be waiting for the master lock => deadlock.
571     private final Object masterAndZKLock = new Object();
572 
573     private long keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
574     private final DelayedClosing delayedClosing =
575       DelayedClosing.createAndStart(this);
576 
577     // thread executor shared by all HTableInterface instances created
578     // by this connection
579     private volatile ExecutorService batchPool = null;
580     private volatile boolean cleanupPool = false;
581 
582     private final Configuration conf;
583 
584     // Client rpc instance.
585     private RpcClient rpcClient;
586 
587     /**
588      * Map of table to table {@link HRegionLocation}s.
589      */
590     private final ConcurrentMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>
591         cachedRegionLocations =
592       new ConcurrentHashMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>();
593 
594     // The presence of a server in the map implies it's likely that there is an
595     // entry in cachedRegionLocations that maps to this server; but the absence
596     // of a server in this map guarantees that there is no entry in the cache that
597     // maps to the absent server.
598     // The access to this attribute must be protected by a lock on cachedRegionLocations
599     private final Set<ServerName> cachedServers = new ConcurrentSkipListSet<ServerName>();
600 
601     // region cache prefetch is enabled by default. this set contains all
602     // tables whose region cache prefetch is disabled.
603     private final Set<Integer> regionCachePrefetchDisabledTables =
604       new CopyOnWriteArraySet<Integer>();
605 
606     private int refCount;
607 
608     // indicates whether this connection's life cycle is managed (by us)
609     private boolean managed;
610 
611     private User user;
612 
613     /**
614      * Cluster registry of basic info such as clusterid and meta region location.
615      */
616     Registry registry;
617 
618     HConnectionImplementation(Configuration conf, boolean managed) throws IOException {
619       this(conf, managed, null, null);
620     }
621 
622     /**
623      * Constructor.
624      * @param conf Configuration object
625      * @param managed If true, does not do full shutdown on close; i.e. cleanup of connection
626      * to zk and shutdown of all services; we just close down the resources this connection was
627      * responsible for and decrement usage counters.  It is up to the caller to do the full
628      * cleanup.  It is set when we want to have connection sharing going on -- reuse of the zk
629      * connection, cached region locations, established regionserver connections, etc.  When
630      * connections are shared, we have reference counting going on and will only do full
631      * cleanup when there are no more users of an HConnectionImplementation instance.
         * @param pool the thread pool to use for batch operations run via this connection
         * @param user the user this connection is for
632      */
633     HConnectionImplementation(Configuration conf, boolean managed,
634         ExecutorService pool, User user) throws IOException {
635       this(conf);
636       this.user = user;
637       this.batchPool = pool;
638       this.managed = managed;
639       this.registry = setupRegistry();
640       retrieveClusterId();
641 
642       this.rpcClient = new RpcClient(this.conf, this.clusterId);
643 
644       // Do we publish the status?
645       boolean shouldListen = conf.getBoolean(HConstants.STATUS_PUBLISHED,
646           HConstants.STATUS_PUBLISHED_DEFAULT);
647       Class<? extends ClusterStatusListener.Listener> listenerClass =
648           conf.getClass(ClusterStatusListener.STATUS_LISTENER_CLASS,
649               ClusterStatusListener.DEFAULT_STATUS_LISTENER_CLASS,
650               ClusterStatusListener.Listener.class);
651       if (shouldListen) {
652         if (listenerClass == null) {
653           LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
654               ClusterStatusListener.STATUS_LISTENER_CLASS + " is not set - not listening for status updates");
655         } else {
656           clusterStatusListener = new ClusterStatusListener(
657               new ClusterStatusListener.DeadServerHandler() {
658                 @Override
659                 public void newDead(ServerName sn) {
660                   clearCaches(sn);
661                   rpcClient.cancelConnections(sn.getHostname(), sn.getPort(),
662                       new SocketException(sn.getServerName() +
663                           " is dead: closing its connection."));
664                 }
665               }, conf, listenerClass);
666         }
667       }
668     }
669 
670     /** Dummy nonce generator for disabled nonces. */
671     private static class NoNonceGenerator implements NonceGenerator {
672       @Override
673       public long getNonceGroup() {
674         return HConstants.NO_NONCE;
675       }
676       @Override
677       public long newNonce() {
678         return HConstants.NO_NONCE;
679       }
680     }
681 
682     /**
683      * For tests.
684      */
685     protected HConnectionImplementation(Configuration conf) {
686       this.conf = conf;
687       this.closed = false;
688       this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
689           HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
690       this.numTries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
691           HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
692       this.rpcTimeout = conf.getInt(
693           HConstants.HBASE_RPC_TIMEOUT_KEY,
694           HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
695       if (conf.getBoolean(CLIENT_NONCES_ENABLED_KEY, true)) {
696         synchronized (HConnectionManager.nonceGeneratorCreateLock) {
697           if (HConnectionManager.nonceGenerator == null) {
698             HConnectionManager.nonceGenerator = new PerClientRandomNonceGenerator();
699           }
700           this.nonceGenerator = HConnectionManager.nonceGenerator;
701         }
702       } else {
703         this.nonceGenerator = new NoNonceGenerator();
704       }
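          // For example, a client can opt out of nonces entirely through the key read
          // above (illustrative):
          //   conf.setBoolean("hbase.client.nonces.enabled", false);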
705 
706       this.usePrefetch = conf.getBoolean(HConstants.HBASE_CLIENT_PREFETCH,
707           HConstants.DEFAULT_HBASE_CLIENT_PREFETCH);
708       this.prefetchRegionLimit = conf.getInt(
709           HConstants.HBASE_CLIENT_PREFETCH_LIMIT,
710           HConstants.DEFAULT_HBASE_CLIENT_PREFETCH_LIMIT);
711     }
712 
713     @Override
714     public HTableInterface getTable(String tableName) throws IOException {
715       return getTable(TableName.valueOf(tableName));
716     }
717 
718     @Override
719     public HTableInterface getTable(byte[] tableName) throws IOException {
720       return getTable(TableName.valueOf(tableName));
721     }
722 
723     @Override
724     public HTableInterface getTable(TableName tableName) throws IOException {
725       return getTable(tableName, getBatchPool());
726     }
727 
728     @Override
729     public HTableInterface getTable(String tableName, ExecutorService pool) throws IOException {
730       return getTable(TableName.valueOf(tableName), pool);
731     }
732 
733     @Override
734     public HTableInterface getTable(byte[] tableName, ExecutorService pool) throws IOException {
735       return getTable(TableName.valueOf(tableName), pool);
736     }
737 
738     @Override
739     public HTableInterface getTable(TableName tableName, ExecutorService pool) throws IOException {
740       if (managed) {
741         throw new IOException("The connection has to be unmanaged.");
742       }
743       return new HTable(tableName, this, pool);
744     }
745 
746     private ExecutorService getBatchPool() {
747       if (batchPool == null) {
748         // shared HTable thread executor not yet initialized
749         synchronized (this) {
750           if (batchPool == null) {
751             int maxThreads = conf.getInt("hbase.hconnection.threads.max", 256);
752             int coreThreads = conf.getInt("hbase.hconnection.threads.core", 256);
753             if (maxThreads == 0) {
754               maxThreads = Runtime.getRuntime().availableProcessors() * 8;
755             }
756             if (coreThreads == 0) {
757               coreThreads = Runtime.getRuntime().availableProcessors() * 8;
758             }
759             long keepAliveTime = conf.getLong("hbase.hconnection.threads.keepalivetime", 60);
760             LinkedBlockingQueue<Runnable> workQueue =
761               new LinkedBlockingQueue<Runnable>(maxThreads *
762                 conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
763                   HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
764             ThreadPoolExecutor tpe = new ThreadPoolExecutor(
765                 coreThreads,
766                 maxThreads,
767                 keepAliveTime,
768                 TimeUnit.SECONDS,
769                 workQueue,
770                 Threads.newDaemonThreadFactory(toString() + "-shared-"));
771             tpe.allowCoreThreadTimeOut(true);
772             this.batchPool = tpe;
773           }
774           this.cleanupPool = true;
775         }
776       }
777       return this.batchPool;
778     }
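        // For example, the shared pool can be bounded through configuration before the
        // connection is created (illustrative values; the keys are the ones read above):
        //   conf.setInt("hbase.hconnection.threads.max", 64);
        //   conf.setInt("hbase.hconnection.threads.core", 8);
        //   conf.setLong("hbase.hconnection.threads.keepalivetime", 30);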
779 
780     protected ExecutorService getCurrentBatchPool() {
781       return batchPool;
782     }
783 
784     private void shutdownBatchPool() {
785       if (this.cleanupPool && this.batchPool != null && !this.batchPool.isShutdown()) {
786         this.batchPool.shutdown();
787         try {
788           if (!this.batchPool.awaitTermination(10, TimeUnit.SECONDS)) {
789             this.batchPool.shutdownNow();
790           }
791         } catch (InterruptedException e) {
792           this.batchPool.shutdownNow();
793         }
794       }
795     }
796 
797     /**
798      * @return The cluster registry implementation to use.
799      * @throws IOException
800      */
801     private Registry setupRegistry() throws IOException {
802       String registryClass = this.conf.get("hbase.client.registry.impl",
803         ZooKeeperRegistry.class.getName());
804       Registry registry = null;
805       try {
806         registry = (Registry)Class.forName(registryClass).newInstance();
807       } catch (Throwable t) {
808         throw new IOException(t);
809       }
810       registry.init(this);
811       return registry;
812     }
813 
814     /**
815      * For tests only.
816      * @param rpcClient Client we should use instead.
817      * @return Previous rpcClient
818      */
819     RpcClient setRpcClient(final RpcClient rpcClient) {
820       RpcClient oldRpcClient = this.rpcClient;
821       this.rpcClient = rpcClient;
822       return oldRpcClient;
823     }
824 
825     /**
826      * An identifier that will remain the same for a given connection.
827      * @return a stable identifier for this connection
828      */
        @Override
829     public String toString() {
830       return "hconnection-0x" + Integer.toHexString(hashCode());
831     }
832 
833     protected String clusterId = null;
834 
835     void retrieveClusterId() {
836       if (clusterId != null) return;
837       this.clusterId = this.registry.getClusterId();
838       if (clusterId == null) {
839         clusterId = HConstants.CLUSTER_ID_DEFAULT;
840         LOG.debug("clusterid came back null, using default " + clusterId);
841       }
842     }
843 
844     @Override
845     public Configuration getConfiguration() {
846       return this.conf;
847     }
848 
849     private void checkIfBaseNodeAvailable(ZooKeeperWatcher zkw)
850       throws MasterNotRunningException {
851       String errorMsg;
852       try {
853         if (ZKUtil.checkExists(zkw, zkw.baseZNode) == -1) {
854           errorMsg = "The node " + zkw.baseZNode + " is not in ZooKeeper. "
855             + "It should have been written by the master. "
856             + "Check the value configured in 'zookeeper.znode.parent'. "
857             + "There could be a mismatch with the one configured in the master.";
858           LOG.error(errorMsg);
859           throw new MasterNotRunningException(errorMsg);
860         }
861       } catch (KeeperException e) {
862         errorMsg = "Can't get connection to ZooKeeper: " + e.getMessage();
863         LOG.error(errorMsg);
864         throw new MasterNotRunningException(errorMsg, e);
865       }
866     }
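        // For example, if the master was started with a non-default parent znode, the
        // client configuration must be set to match it (illustrative value):
        //   conf.set("zookeeper.znode.parent", "/hbase-secure");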
867 
868     /**
869      * @return true if the master is running, throws an exception otherwise
870      * @throws MasterNotRunningException - if the master is not running
871      * @throws ZooKeeperConnectionException
872      */
873     @Override
874     public boolean isMasterRunning()
875     throws MasterNotRunningException, ZooKeeperConnectionException {
876       // When getting the master connection, we check it's running,
877       // so if there is no exception, it means we've been able to get a
878       // connection on a running master
879       MasterKeepAliveConnection m = getKeepAliveMasterService();
880       m.close();
881       return true;
882     }
883 
884     @Override
885     public HRegionLocation getRegionLocation(final TableName tableName,
886         final byte [] row, boolean reload)
887     throws IOException {
888       return reload? relocateRegion(tableName, row): locateRegion(tableName, row);
889     }
890 
891     @Override
892     public HRegionLocation getRegionLocation(final byte[] tableName,
893         final byte [] row, boolean reload)
894     throws IOException {
895       return getRegionLocation(TableName.valueOf(tableName), row, reload);
896     }
897 
898     @Override
899     public boolean isTableEnabled(TableName tableName) throws IOException {
900       return this.registry.isTableOnlineState(tableName, true);
901     }
902 
903     @Override
904     public boolean isTableEnabled(byte[] tableName) throws IOException {
905       return isTableEnabled(TableName.valueOf(tableName));
906     }
907 
908     @Override
909     public boolean isTableDisabled(TableName tableName) throws IOException {
910       return this.registry.isTableOnlineState(tableName, false);
911     }
912 
913     @Override
914     public boolean isTableDisabled(byte[] tableName) throws IOException {
915       return isTableDisabled(TableName.valueOf(tableName));
916     }
917 
918     @Override
919     public boolean isTableAvailable(final TableName tableName) throws IOException {
920       final AtomicBoolean available = new AtomicBoolean(true);
921       final AtomicInteger regionCount = new AtomicInteger(0);
922       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
923         @Override
924         public boolean processRow(Result row) throws IOException {
925           HRegionInfo info = MetaScanner.getHRegionInfo(row);
926           if (info != null && !info.isSplitParent()) {
927             if (tableName.equals(info.getTable())) {
928               ServerName server = HRegionInfo.getServerName(row);
929               if (server == null) {
930                 available.set(false);
931                 return false;
932               }
933               regionCount.incrementAndGet();
934             } else if (tableName.compareTo(info.getTable()) < 0) {
935               // Return if we are done with the current table
936               return false;
937             }
938           }
939           return true;
940         }
941       };
942       MetaScanner.metaScan(conf, this, visitor, tableName);
943       return available.get() && (regionCount.get() > 0);
944     }
945 
946     @Override
947     public boolean isTableAvailable(final byte[] tableName) throws IOException {
948       return isTableAvailable(TableName.valueOf(tableName));
949     }
950 
951     @Override
952     public boolean isTableAvailable(final TableName tableName, final byte[][] splitKeys)
953         throws IOException {
954       final AtomicBoolean available = new AtomicBoolean(true);
955       final AtomicInteger regionCount = new AtomicInteger(0);
956       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
957         @Override
958         public boolean processRow(Result row) throws IOException {
959           HRegionInfo info = MetaScanner.getHRegionInfo(row);
960           if (info != null && !info.isSplitParent()) {
961             if (tableName.equals(info.getTable())) {
962               ServerName server = HRegionInfo.getServerName(row);
963               if (server == null) {
964                 available.set(false);
965                 return false;
966               }
967               if (!Bytes.equals(info.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
968                 for (byte[] splitKey : splitKeys) {
969                   // Just check if the splitkey is available
970                   if (Bytes.equals(info.getStartKey(), splitKey)) {
971                     regionCount.incrementAndGet();
972                     break;
973                   }
974                 }
975               } else {
976                 // Always empty start row should be counted
977                 regionCount.incrementAndGet();
978               }
979             } else if (tableName.compareTo(info.getTable()) < 0) {
980               // Return if we are done with the current table
981               return false;
982             }
983           }
984           return true;
985         }
986       };
987       MetaScanner.metaScan(conf, this, visitor, tableName);
988       // +1 needs to be added so that the empty start row is also taken into account
989       return available.get() && (regionCount.get() == splitKeys.length + 1);
990     }
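        // For example (illustrative): a table pre-split with splitKeys {"b", "c"} has
        // regions ["", "b"), ["b", "c") and ["c", ""), i.e. splitKeys.length + 1 == 3
        // regions, which is exactly what the check above requires.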
991 
992     @Override
993     public boolean isTableAvailable(final byte[] tableName, final byte[][] splitKeys)
994         throws IOException {
995       return isTableAvailable(TableName.valueOf(tableName), splitKeys);
996     }
997 
998     @Override
999     public HRegionLocation locateRegion(final byte[] regionName) throws IOException {
1000       return locateRegion(HRegionInfo.getTable(regionName),
1001           HRegionInfo.getStartKey(regionName), false, true);
1002     }
1003 
1004     @Override
1005     public boolean isDeadServer(ServerName sn) {
1006       if (clusterStatusListener == null) {
1007         return false;
1008       } else {
1009         return clusterStatusListener.isDeadServer(sn);
1010       }
1011     }
1012 
1013     @Override
1014     public List<HRegionLocation> locateRegions(final TableName tableName)
1015     throws IOException {
1016       return locateRegions(tableName, false, true);
1017     }
1018 
1019     @Override
1020     public List<HRegionLocation> locateRegions(final byte[] tableName)
1021     throws IOException {
1022       return locateRegions(TableName.valueOf(tableName));
1023     }
1024 
1025     @Override
1026     public List<HRegionLocation> locateRegions(final TableName tableName,
1027         final boolean useCache, final boolean offlined) throws IOException {
1028       NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(conf, this,
1029           tableName, offlined);
1030       final List<HRegionLocation> locations = new ArrayList<HRegionLocation>();
1031       for (HRegionInfo regionInfo : regions.keySet()) {
1032         locations.add(locateRegion(tableName, regionInfo.getStartKey(), useCache, true));
1033       }
1034       return locations;
1035     }
1036 
1037     @Override
1038     public List<HRegionLocation> locateRegions(final byte[] tableName,
1039        final boolean useCache, final boolean offlined) throws IOException {
1040       return locateRegions(TableName.valueOf(tableName), useCache, offlined);
1041     }
1042 
1043     @Override
1044     public HRegionLocation locateRegion(final TableName tableName,
1045         final byte [] row)
1046     throws IOException{
1047       return locateRegion(tableName, row, true, true);
1048     }
1049 
1050     @Override
1051     public HRegionLocation locateRegion(final byte[] tableName,
1052         final byte [] row)
1053     throws IOException{
1054       return locateRegion(TableName.valueOf(tableName), row);
1055     }
1056 
1057     @Override
1058     public HRegionLocation relocateRegion(final TableName tableName,
1059         final byte [] row) throws IOException{
1060       // Since this is an explicit request not to use any caching, finding
1061       // disabled tables should not be desirable.  This will ensure that an exception is thrown
1062       // the first time a disabled table is interacted with.
1063       if (isTableDisabled(tableName)) {
1064         throw new TableNotEnabledException(tableName.getNameAsString() + " is disabled.");
1065       }
1066 
1067       return locateRegion(tableName, row, false, true);
1068     }
1069 
1070     @Override
1071     public HRegionLocation relocateRegion(final byte[] tableName,
1072         final byte [] row) throws IOException {
1073       return relocateRegion(TableName.valueOf(tableName), row);
1074     }
1075 
1076 
1077     private HRegionLocation locateRegion(final TableName tableName,
1078       final byte [] row, boolean useCache, boolean retry)
1079     throws IOException {
1080       if (this.closed) throw new IOException(toString() + " closed");
1081       if (tableName == null || tableName.getName().length == 0) {
1082         throw new IllegalArgumentException(
1083             "table name cannot be null or zero length");
1084       }
1085 
1086       if (tableName.equals(TableName.META_TABLE_NAME)) {
1087         return this.registry.getMetaRegionLocation();
1088       } else {
1089         // Region not in the cache - have to go to the meta RS
1090         return locateRegionInMeta(TableName.META_TABLE_NAME, tableName, row,
1091           useCache, userRegionLock, retry);
1092       }
1093     }
1094 
1095     /*
1096      * Search hbase:meta for the HRegionLocation info that contains the table and
1097      * row we're seeking. It will prefetch a certain number of region infos and
1098      * save them in the global region cache.
1099      */
1100     private void prefetchRegionCache(final TableName tableName,
1101         final byte[] row) {
1102       // Implement a new visitor for MetaScanner, and use it to walk through
1103       // the hbase:meta
1104       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
            @Override
1105         public boolean processRow(Result result) throws IOException {
1106           try {
1107             HRegionInfo regionInfo = MetaScanner.getHRegionInfo(result);
1108             if (regionInfo == null) {
1109               return true;
1110             }
1111 
1112             // possible we got a region of a different table...
1113             if (!regionInfo.getTable().equals(tableName)) {
1114               return false; // stop scanning
1115             }
1116             if (regionInfo.isOffline()) {
1117               // don't cache offline regions
1118               return true;
1119             }
1120 
1121             ServerName serverName = HRegionInfo.getServerName(result);
1122             if (serverName == null) {
1123               return true; // don't cache it
1124             }
1125             // instantiate the location
1126             long seqNum = HRegionInfo.getSeqNumDuringOpen(result);
1127             HRegionLocation loc = new HRegionLocation(regionInfo, serverName, seqNum);
1128             // cache this meta entry
1129             cacheLocation(tableName, null, loc);
1130             return true;
1131           } catch (RuntimeException e) {
1132             throw new IOException(e);
1133           }
1134         }
1135       };
1136       try {
1137         // prefetch a certain number of region infos into the region cache.
1138         MetaScanner.metaScan(conf, this, visitor, tableName, row,
1139             this.prefetchRegionLimit, TableName.META_TABLE_NAME);
1140       } catch (IOException e) {
1141         if (ExceptionUtil.isInterrupt(e)) {
1142           Thread.currentThread().interrupt();
1143         }
1144       }
1145     }
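        // Note: the number of rows prefetched per scan is capped by prefetchRegionLimit,
        // which the constructor reads from HConstants.HBASE_CLIENT_PREFETCH_LIMIT.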
1146 
1147     /*
1148      * Search the hbase:meta table for the HRegionLocation
1149      * info that contains the table and row we're seeking.
1150      */
1151     private HRegionLocation locateRegionInMeta(final TableName parentTable,
1152       final TableName tableName, final byte [] row, boolean useCache,
1153       Object regionLockObject, boolean retry)
1154     throws IOException {
1155       HRegionLocation location;
1156       // If we are supposed to be using the cache, look in the cache to see if
1157       // we already have the region.
1158       if (useCache) {
1159         location = getCachedLocation(tableName, row);
1160         if (location != null) {
1161           return location;
1162         }
1163       }
1164       int localNumRetries = retry ? numTries : 1;
1165       // Build the row key we will look up in the parent (meta) table.
1166       // The extra 9's on the end are necessary to allow "exact" matches
1167       // without knowing the precise region names.
1168       byte [] metaKey = HRegionInfo.createRegionName(tableName, row,
1169         HConstants.NINES, false);
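          // For example (illustrative), for table "t1" and row "r1" this builds a key like
          // "t1,r1,99999999999999", which sorts at or after the name of the region that
          // actually contains the row, so a getRowOrBefore() lookup can find it.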
1170       for (int tries = 0; true; tries++) {
1171         if (tries >= localNumRetries) {
1172           throw new NoServerForRegionException("Unable to find region for "
1173             + Bytes.toStringBinary(row) + " after " + localNumRetries + " tries.");
1174         }
1175 
1176         HRegionLocation metaLocation = null;
1177         try {
1178           // locate the meta region
1179           metaLocation = locateRegion(parentTable, metaKey, true, false);
1180           // If null still, go around again.
1181           if (metaLocation == null) continue;
1182           ClientService.BlockingInterface service = getClient(metaLocation.getServerName());
1183 
1184           Result regionInfoRow;
1185           // This block guards against two threads trying to load the meta
1186           // region at the same time. The first will load the meta region and
1187           // the second will use the value that the first one found.
1188           if (useCache) {
1189             if (TableName.META_TABLE_NAME.equals(parentTable) && usePrefetch &&
1190                 getRegionCachePrefetch(tableName)) {
1191               synchronized (regionLockObject) {
1192                 // Check the cache again for a hit in case some other thread made the
1193                 // same query while we were waiting on the lock.
1194                 location = getCachedLocation(tableName, row);
1195                 if (location != null) {
1196                   return location;
1197                 }
1198                 // If the parent table is META, we may want to pre-fetch some
1199                 // region info into the global region cache for this table.
1200                 prefetchRegionCache(tableName, row);
1201               }
1202             }
1203             location = getCachedLocation(tableName, row);
1204             if (location != null) {
1205               return location;
1206             }
1207           } else {
1208             // If we are not supposed to be using the cache, delete any existing cached location
1209             // so it won't interfere.
1210             forceDeleteCachedLocation(tableName, row);
1211           }
1212 
1213           // Query the parent (meta) region for the location of the region containing our row
1214           regionInfoRow =
1215               ProtobufUtil.getRowOrBefore(service, metaLocation.getRegionInfo().getRegionName(),
1216                 metaKey, HConstants.CATALOG_FAMILY);
1217 
1218           if (regionInfoRow == null) {
1219             throw new TableNotFoundException(tableName);
1220           }
1221 
1222           // convert the row result into the HRegionLocation we need!
1223           HRegionInfo regionInfo = MetaScanner.getHRegionInfo(regionInfoRow);
1224           if (regionInfo == null) {
1225             throw new IOException("HRegionInfo was null or empty in " +
1226               parentTable + ", row=" + regionInfoRow);
1227           }
1228 
1229           // possible we got a region of a different table...
1230           if (!regionInfo.getTable().equals(tableName)) {
1231             throw new TableNotFoundException(
1232                   "Table '" + tableName + "' was not found, got: " +
1233                   regionInfo.getTable() + ".");
1234           }
1235           if (regionInfo.isSplit()) {
1236             throw new RegionOfflineException("the only available region for" +
1237               " the required row is a split parent," +
1238               " the daughters should be online soon: " +
1239               regionInfo.getRegionNameAsString());
1240           }
1241           if (regionInfo.isOffline()) {
1242             throw new RegionOfflineException("the region is offline, could" +
1243               " be caused by a disable table call: " +
1244               regionInfo.getRegionNameAsString());
1245           }
1246 
1247           ServerName serverName = HRegionInfo.getServerName(regionInfoRow);
1248           if (serverName == null) {
1249             throw new NoServerForRegionException("No server address listed " +
1250               "in " + parentTable + " for region " +
1251               regionInfo.getRegionNameAsString() + " containing row " +
1252               Bytes.toStringBinary(row));
1253           }
1254 
1255           if (isDeadServer(serverName)){
1256             throw new RegionServerStoppedException("hbase:meta says the region "+
1257                 regionInfo.getRegionNameAsString()+" is managed by the server " + serverName +
1258                 ", but it is dead.");
1259           }
1260 
1261           // Instantiate the location
1262           location = new HRegionLocation(regionInfo, serverName,
1263             HRegionInfo.getSeqNumDuringOpen(regionInfoRow));
1264           cacheLocation(tableName, null, location);
1265           return location;
1266         } catch (TableNotFoundException e) {
1267           // If we got this error, it probably means the table just plain doesn't
1268           // exist. Rethrow the error immediately; this should always be coming
1269           // from the HTable constructor.
1270           throw e;
1271         } catch (IOException e) {
1272           ExceptionUtil.rethrowIfInterrupt(e);
1273 
1274           if (e instanceof RemoteException) {
1275             e = ((RemoteException)e).unwrapRemoteException();
1276           }
1277           if (tries < numTries - 1) {
1278             if (LOG.isDebugEnabled()) {
1279               LOG.debug("locateRegionInMeta parentTable=" +
1280                 parentTable + ", metaLocation=" +
1281                 ((metaLocation == null)? "null": "{" + metaLocation + "}") +
1282                 ", attempt=" + tries + " of " +
1283                 this.numTries + " failed; retrying after sleep of " +
1284                 ConnectionUtils.getPauseTime(this.pause, tries) + " because: " + e.getMessage());
1285             }
1286           } else {
1287             throw e;
1288           }
1289           // Only relocate the parent region if necessary
1290           if(!(e instanceof RegionOfflineException ||
1291               e instanceof NoServerForRegionException)) {
1292             relocateRegion(parentTable, metaKey);
1293           }
1294         }
1295         try {
1296           Thread.sleep(ConnectionUtils.getPauseTime(this.pause, tries));
1297         } catch (InterruptedException e) {
1298           throw new InterruptedIOException("Giving up trying to locate region in " +
1299             "meta: thread is interrupted.");
1300         }
1301       }
1302     }
1303 
1304     /*
1305      * Search the cache for a location that fits our table and row key.
1306      * Return null if no suitable region is located.
1307      *
1308      * @param tableName
1309      * @param row
1310      * @return Null or region location found in cache.
1311      */
1312     HRegionLocation getCachedLocation(final TableName tableName,
1313         final byte [] row) {
1314       ConcurrentSkipListMap<byte[], HRegionLocation> tableLocations =
1315         getTableLocations(tableName);
1316 
1317       Entry<byte[], HRegionLocation> e = tableLocations.floorEntry(row);
1318       if (e == null) {
1319         return null;
1320       }
1321       HRegionLocation possibleRegion = e.getValue();
1322 
1323       // make sure that the end key is greater than the row we're looking
1324       // for, otherwise the row actually belongs in the next region, not
1325       // this one. the exception case is when the endkey is
1326       // HConstants.EMPTY_END_ROW, signifying that the region we're
1327       // checking is actually the last region in the table.
1328       byte[] endKey = possibleRegion.getRegionInfo().getEndKey();
1329       if (Bytes.equals(endKey, HConstants.EMPTY_END_ROW) ||
1330           tableName.getRowComparator().compareRows(
1331               endKey, 0, endKey.length, row, 0, row.length) > 0) {
1332         return possibleRegion;
1333       }
1334 
1335       // Passed all the way through, so we got nothing - complete cache miss
1336       return null;
1337     }
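         // Illustrative sketch (editorial; keys assumed) of the floorEntry check above,
         // with two cached regions [ "a", "c" ) and [ "c", "" ), the empty end key marking
         // the last region of the table:
         //   row "b" -> floorEntry start key "a"; endKey "c" > "b", so it is a cache hit.
         //   row "d" -> floorEntry start key "c"; endKey is EMPTY_END_ROW, so it is a hit.
         //   a row sorting before "a" has no floor entry at all and is a clean cache miss.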
1338 
1339     /**
1340      * Delete a cached location, no matter what it is. Called when we were told not to use the cache.
1341      * @param tableName tableName
1342      * @param row
1343      */
1344     void forceDeleteCachedLocation(final TableName tableName, final byte [] row) {
1345       HRegionLocation rl = null;
1346       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1347       // start to examine the cache. we can only do cache actions
1348       // if there's something in the cache for this table.
1349       rl = getCachedLocation(tableName, row);
1350       if (rl != null) {
1351         tableLocations.remove(rl.getRegionInfo().getStartKey());
1352       }
1353       if ((rl != null) && LOG.isDebugEnabled()) {
1354         LOG.debug("Removed " + rl.getHostname() + ":" + rl.getPort()
1355           + " as a location of " + rl.getRegionInfo().getRegionNameAsString() +
1356           " for tableName=" + tableName + " from cache");
1357       }
1358     }
1359 
1360     /*
1361      * Delete all cached entries of a table that maps to a specific location.
1362      */
1363     @Override
1364     public void clearCaches(final ServerName serverName) {
1365       if (!this.cachedServers.contains(serverName)) {
1366         return;
1367       }
1368 
1369       boolean deletedSomething = false;
1370       synchronized (this.cachedServers) {
1371         // We block here, because if there is an error on a server, it's likely that multiple
1372         //  threads will get the error simultaneously. If there are hundreds of thousands of
1373         //  region locations to check, it's better to do this only once. A better pattern would
1374         //  be to check if the server is dead when we get the region location.
1375         if (!this.cachedServers.contains(serverName)) {
1376           return;
1377         }
1378         for (Map<byte[], HRegionLocation> tableLocations : cachedRegionLocations.values()) {
1379           for (Entry<byte[], HRegionLocation> e : tableLocations.entrySet()) {
1380             HRegionLocation value = e.getValue();
1381             if (value != null
1382                 && serverName.equals(value.getServerName())) {
1383               tableLocations.remove(e.getKey());
1384               deletedSomething = true;
1385             }
1386           }
1387         }
1388         this.cachedServers.remove(serverName);
1389       }
1390       if (deletedSomething && LOG.isDebugEnabled()) {
1391         LOG.debug("Removed all cached region locations that map to " + serverName);
1392       }
1393     }
1394 
1395     /*
1396      * @param tableName
1397      * @return Map of cached locations for passed <code>tableName</code>
1398      */
1399     private ConcurrentSkipListMap<byte[], HRegionLocation> getTableLocations(
1400         final TableName tableName) {
1401       // find the map of cached locations for this table
1402       ConcurrentSkipListMap<byte[], HRegionLocation> result;
1403       result = this.cachedRegionLocations.get(tableName);
1404       // if tableLocations for this table isn't built yet, make one
1405       if (result == null) {
1406         result = new ConcurrentSkipListMap<byte[], HRegionLocation>(Bytes.BYTES_COMPARATOR);
1407         ConcurrentSkipListMap<byte[], HRegionLocation> old =
1408             this.cachedRegionLocations.putIfAbsent(tableName, result);
1409         if (old != null) {
1410           return old;
1411         }
1412       }
1413       return result;
1414     }
1415 
1416     @Override
1417     public void clearRegionCache() {
1418       this.cachedRegionLocations.clear();
1419       this.cachedServers.clear();
1420     }
1421 
1422     @Override
1423     public void clearRegionCache(final TableName tableName) {
1424       this.cachedRegionLocations.remove(tableName);
1425     }
1426 
1427     @Override
1428     public void clearRegionCache(final byte[] tableName) {
1429       clearRegionCache(TableName.valueOf(tableName));
1430     }
1431 
1432     /**
1433      * Put a newly discovered HRegionLocation into the cache.
1434      * @param tableName The table name.
1435      * @param source the source of the new location, if it's not coming from meta
1436      * @param location the new location
1437      */
1438     private void cacheLocation(final TableName tableName, final HRegionLocation source,
1439         final HRegionLocation location) {
1440       boolean isFromMeta = (source == null);
1441       byte [] startKey = location.getRegionInfo().getStartKey();
1442       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1443       HRegionLocation oldLocation = tableLocations.putIfAbsent(startKey, location);
1444       boolean isNewCacheEntry = (oldLocation == null);
1445       if (isNewCacheEntry) {
1446         cachedServers.add(location.getServerName());
1447         return;
1448       }
1449       boolean updateCache;
1450       // If the server in cache sends us a redirect, assume it's always valid.
1451       if (oldLocation.equals(source)) {
1452         updateCache = true;
1453       } else {
1454         long newLocationSeqNum = location.getSeqNum();
1455         // Meta record is stale - some (probably the same) server has closed the region
1456         // with later seqNum and told us about the new location.
1457         boolean isStaleMetaRecord = isFromMeta && (oldLocation.getSeqNum() > newLocationSeqNum);
1458         // Same as above for a redirect. However, in this case, if the seqNum is equal to the
1459         // previous record's, the most common case is that the region was first closed with that
1460         // seqNum and then reopened with the same seqNum; hence we will ignore the redirect.
1461         // There are so many corner cases with various combinations of opens and closes that
1462         // an additional counter on top of seqNum would be necessary to handle them all.
1463         boolean isStaleRedirect = !isFromMeta && (oldLocation.getSeqNum() >= newLocationSeqNum);
1464         boolean isStaleUpdate = (isStaleMetaRecord || isStaleRedirect);
1465         updateCache = (!isStaleUpdate);
1466       }
1467       if (updateCache) {
1468         tableLocations.replace(startKey, oldLocation, location);
1469         cachedServers.add(location.getServerName());
1470       }
1471     }
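         // Illustrative sketch (editorial; seqNums assumed) of the staleness rules above,
         // for a cached location with seqNum 10:
         //   a meta record carrying seqNum 8  -> stale meta record, cache kept as-is.
         //   a redirect carrying seqNum 10    -> treated as stale (close then reopen), kept.
         //   a redirect carrying seqNum 12    -> newer, so the cache entry is replaced.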
1472 
1473     // Map keyed by service name + regionserver to service stub implementation
1474     private final ConcurrentHashMap<String, Object> stubs =
1475       new ConcurrentHashMap<String, Object>();
1476     // Map of locks used creating service stubs per regionserver.
1477     private final ConcurrentHashMap<String, String> connectionLock =
1478       new ConcurrentHashMap<String, String>();
1479 
1480     /**
1481      * State of the MasterService connection/setup.
1482      */
1483     static class MasterServiceState {
1484       HConnection connection;
1485       MasterService.BlockingInterface stub;
1486       int userCount;
1487       long keepAliveUntil = Long.MAX_VALUE;
1488 
1489       MasterServiceState (final HConnection connection) {
1490         super();
1491         this.connection = connection;
1492       }
1493 
1494       @Override
1495       public String toString() {
1496         return "MasterService";
1497       }
1498 
1499       Object getStub() {
1500         return this.stub;
1501       }
1502 
1503       void clearStub() {
1504         this.stub = null;
1505       }
1506 
1507       boolean isMasterRunning() throws ServiceException {
1508         IsMasterRunningResponse response =
1509           this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1510         return response != null? response.getIsMasterRunning(): false;
1511       }
1512     }
1513 
1514     /**
1515      * Makes a client-side stub for master services. Sub-class to specialize.
1516      * Depends on hosting class so not static.  Exists so we avoid duplicating a bunch of code
1517      * when setting up the MasterMonitorService and MasterAdminService.
1518      */
1519     abstract class StubMaker {
1520       /**
1521        * Returns the name of the service stub being created.
1522        */
1523       protected abstract String getServiceName();
1524 
1525       /**
1526        * Make the stub and cache it internally so it can be used later for the isMasterRunning call.
1527        * @param channel
1528        */
1529       protected abstract Object makeStub(final BlockingRpcChannel channel);
1530 
1531       /**
1532        * Once set up, check that it works by doing an isMasterRunning check.
1533        * @throws ServiceException
1534        */
1535       protected abstract void isMasterRunning() throws ServiceException;
1536 
1537       /**
1538        * Create a stub. Try once only.  It is not typed because there is no common type
1539        * among protobuf services or their interfaces.  Let the caller do appropriate casting.
1540        * @return A stub for master services.
1541        * @throws IOException
1542        * @throws KeeperException
1543        * @throws ServiceException
1544        */
1545       private Object makeStubNoRetries() throws IOException, KeeperException, ServiceException {
1546         ZooKeeperKeepAliveConnection zkw;
1547         try {
1548           zkw = getKeepAliveZooKeeperWatcher();
1549         } catch (IOException e) {
1550           ExceptionUtil.rethrowIfInterrupt(e);
1551           throw new ZooKeeperConnectionException("Can't connect to ZooKeeper", e);
1552         }
1553         try {
1554           checkIfBaseNodeAvailable(zkw);
1555           ServerName sn = MasterAddressTracker.getMasterAddress(zkw);
1556           if (sn == null) {
1557             String msg = "ZooKeeper available but no active master location found";
1558             LOG.info(msg);
1559             throw new MasterNotRunningException(msg);
1560           }
1561           if (isDeadServer(sn)) {
1562             throw new MasterNotRunningException(sn + " is dead.");
1563           }
1564           // Use the security info interface name as our stub key
1565           String key = getStubKey(getServiceName(), sn.getHostAndPort());
1566           connectionLock.putIfAbsent(key, key);
1567           Object stub = null;
1568           synchronized (connectionLock.get(key)) {
1569             stub = stubs.get(key);
1570             if (stub == null) {
1571               BlockingRpcChannel channel = rpcClient.createBlockingRpcChannel(sn,
1572                 user, rpcTimeout);
1573               stub = makeStub(channel);
1574               isMasterRunning();
1575               stubs.put(key, stub);
1576             }
1577           }
1578           return stub;
1579         } finally {
1580           zkw.close();
1581         }
1582       }
1583 
1584       /**
1585        * Create a stub against the master.  Retry if necessary.
1586        * @return A stub with which to make calls against the master
1587        * @throws MasterNotRunningException
1588        */
1589       @edu.umd.cs.findbugs.annotations.SuppressWarnings (value="SWL_SLEEP_WITH_LOCK_HELD")
1590       Object makeStub() throws MasterNotRunningException {
1591         // The lock must be taken at the beginning to prevent multiple master creations
1592         //  (and leaks) in a multithreaded context
1593         synchronized (masterAndZKLock) {
1594           Exception exceptionCaught = null;
1595           Object stub = null;
1596           int tries = 0;
1597           while (!closed && stub == null) {
1598             tries++;
1599             try {
1600               stub = makeStubNoRetries();
1601             } catch (IOException e) {
1602               exceptionCaught = e;
1603             } catch (KeeperException e) {
1604               exceptionCaught = e;
1605             } catch (ServiceException e) {
1606               exceptionCaught = e;
1607             }
1608 
1609             if (exceptionCaught != null)
1610               // It failed. If it's not the last try, we're going to wait a little
1611               if (tries < numTries && !ExceptionUtil.isInterrupt(exceptionCaught)) {
1612                 // tries at this point is 1 or more; decrement to start from 0.
1613                 long pauseTime = ConnectionUtils.getPauseTime(pause, tries - 1);
1614                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1615                     " failed; retrying after sleep of " + pauseTime + ", exception=" +
1616                   exceptionCaught);
1617 
1618                 try {
1619                   Thread.sleep(pauseTime);
1620                 } catch (InterruptedException e) {
1621                   throw new MasterNotRunningException(
1622                       "Thread was interrupted while trying to connect to master.", e);
1623                 }
1624               } else {
1625                 // Enough tries, we stop now
1626                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1627                     " failed; no more retrying.", exceptionCaught);
1628                 throw new MasterNotRunningException(exceptionCaught);
1629               }
1630           }
1631 
1632           if (stub == null) {
1633             // implies this.closed true
1634             throw new MasterNotRunningException("Connection was closed while trying to get master");
1635           }
1636           return stub;
1637         }
1638       }
1639     }
1640 
1641     /**
1642      * Class to make a MasterServiceStubMaker stub.
1643      */
1644     class MasterServiceStubMaker extends StubMaker {
1645       private MasterService.BlockingInterface stub;
1646       @Override
1647       protected String getServiceName() {
1648         return MasterService.getDescriptor().getName();
1649       }
1650 
1651       @Override
1652       @edu.umd.cs.findbugs.annotations.SuppressWarnings("SWL_SLEEP_WITH_LOCK_HELD")
1653       MasterService.BlockingInterface makeStub() throws MasterNotRunningException {
1654         return (MasterService.BlockingInterface)super.makeStub();
1655       }
1656 
1657       @Override
1658       protected Object makeStub(BlockingRpcChannel channel) {
1659         this.stub = MasterService.newBlockingStub(channel);
1660         return this.stub;
1661       }
1662 
1663       @Override
1664       protected void isMasterRunning() throws ServiceException {
1665         this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1666       }
1667     }
1668 
1669     @Override
1670     public AdminService.BlockingInterface getAdmin(final ServerName serverName)
1671         throws IOException {
1672       return getAdmin(serverName, false);
1673     }
1674 
1675     @Override
1676     // Nothing is done w/ the 'master' parameter.  It is ignored.
1677     public AdminService.BlockingInterface getAdmin(final ServerName serverName,
1678       final boolean master)
1679     throws IOException {
1680       if (isDeadServer(serverName)) {
1681         throw new RegionServerStoppedException(serverName + " is dead.");
1682       }
1683       String key = getStubKey(AdminService.BlockingInterface.class.getName(),
1684         serverName.getHostAndPort());
1685       this.connectionLock.putIfAbsent(key, key);
1686       AdminService.BlockingInterface stub = null;
1687       synchronized (this.connectionLock.get(key)) {
1688         stub = (AdminService.BlockingInterface)this.stubs.get(key);
1689         if (stub == null) {
1690           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(serverName,
1691             user, this.rpcTimeout);
1692           stub = AdminService.newBlockingStub(channel);
1693           this.stubs.put(key, stub);
1694         }
1695       }
1696       return stub;
1697     }
1698 
1699     @Override
1700     public ClientService.BlockingInterface getClient(final ServerName sn)
1701     throws IOException {
1702       if (isDeadServer(sn)) {
1703         throw new RegionServerStoppedException(sn + " is dead.");
1704       }
1705       String key = getStubKey(ClientService.BlockingInterface.class.getName(), sn.getHostAndPort());
1706       this.connectionLock.putIfAbsent(key, key);
1707       ClientService.BlockingInterface stub = null;
1708       synchronized (this.connectionLock.get(key)) {
1709         stub = (ClientService.BlockingInterface)this.stubs.get(key);
1710         if (stub == null) {
1711           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(sn,
1712             user, this.rpcTimeout);
1713           stub = ClientService.newBlockingStub(channel);
1714           // In the old days, after getting the stub/proxy, we'd make a call.  We are not
1715           // doing that here.  Just fail on the first actual call rather than in here on setup.
1716           this.stubs.put(key, stub);
1717         }
1718       }
1719       return stub;
1720     }
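         // Editorial note on the locking idiom above: connectionLock.putIfAbsent(key, key)
         // guarantees one canonical lock object per stub key, so synchronized
         // (connectionLock.get(key)) serializes stub creation for a given server without
         // blocking stub creation for other servers.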
1721 
1722     static String getStubKey(final String serviceName, final String rsHostnamePort) {
1723       return serviceName + "@" + rsHostnamePort;
1724     }
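         // Illustrative example (editorial; host and port assumed):
         //   getStubKey("MasterService", "rs1.example.com:60020")
         //     -> "MasterService@rs1.example.com:60020"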
1725 
1726     private ZooKeeperKeepAliveConnection keepAliveZookeeper;
1727     private AtomicInteger keepAliveZookeeperUserCount = new AtomicInteger(0);
1728     private boolean canCloseZKW = true;
1729 
1730     // keepAlive time, in ms. No reason to make it configurable.
1731     private static final long keepAlive = 5 * 60 * 1000;
1732 
1733     /**
1734      * Retrieve a shared ZooKeeperWatcher. You must close it once you have finished with it.
1735      * @return The shared instance. Never returns null.
1736      */
1737     ZooKeeperKeepAliveConnection getKeepAliveZooKeeperWatcher()
1738       throws IOException {
1739       synchronized (masterAndZKLock) {
1740         if (keepAliveZookeeper == null) {
1741           if (this.closed) {
1742             throw new IOException(toString() + " closed");
1743           }
1744           // We don't check that our link to ZooKeeper is still valid,
1745           // but there is a retry mechanism in the ZooKeeperWatcher itself.
1746           keepAliveZookeeper = new ZooKeeperKeepAliveConnection(conf, this.toString(), this);
1747         }
1748         keepAliveZookeeperUserCount.incrementAndGet();
1749         keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1750         return keepAliveZookeeper;
1751       }
1752     }
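         // A hedged usage sketch (editorial; mirrors the pattern of makeStubNoRetries above):
         //   ZooKeeperKeepAliveConnection zkw = getKeepAliveZooKeeperWatcher();
         //   try {
         //     // ... use zkw ...
         //   } finally {
         //     zkw.close(); // drops the user count; the DelayedClosing chore reclaims it later
         //   }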
1753 
1754     void releaseZooKeeperWatcher(final ZooKeeperWatcher zkw) {
1755       if (zkw == null){
1756         return;
1757       }
1758       synchronized (masterAndZKLock) {
1759         if (keepAliveZookeeperUserCount.decrementAndGet() <= 0 ){
1760           keepZooKeeperWatcherAliveUntil = System.currentTimeMillis() + keepAlive;
1761         }
1762       }
1763     }
1764 
1765     /**
1766      * Creates a Chore thread to check the connections to master & zookeeper
1767      *  and close them when they reach their closing time (
1768      *  {@link MasterServiceState#keepAliveUntil} and
1769      *  {@link #keepZooKeeperWatcherAliveUntil}). Keep alive time is
1770      *  managed by the release functions and the variable {@link #keepAlive}
1771      */
1772     private static class DelayedClosing extends Chore implements Stoppable {
1773       private HConnectionImplementation hci;
1774       Stoppable stoppable;
1775 
1776       private DelayedClosing(
1777         HConnectionImplementation hci, Stoppable stoppable){
1778         super(
1779           "ZooKeeperWatcher and Master delayed closing for connection "+hci,
1780           60*1000, // We check every minute
1781           stoppable);
1782         this.hci = hci;
1783         this.stoppable = stoppable;
1784       }
1785 
1786       static DelayedClosing createAndStart(HConnectionImplementation hci){
1787         Stoppable stoppable = new Stoppable() {
1788               private volatile boolean isStopped = false;
1789               @Override public void stop(String why) { isStopped = true;}
1790               @Override public boolean isStopped() {return isStopped;}
1791             };
1792 
1793         return new DelayedClosing(hci, stoppable);
1794       }
1795 
1796       protected void closeMasterProtocol(MasterServiceState protocolState) {
1797         if (System.currentTimeMillis() > protocolState.keepAliveUntil) {
1798           hci.closeMasterService(protocolState);
1799           protocolState.keepAliveUntil = Long.MAX_VALUE;
1800         }
1801       }
1802 
1803       @Override
1804       protected void chore() {
1805         synchronized (hci.masterAndZKLock) {
1806           if (hci.canCloseZKW) {
1807             if (System.currentTimeMillis() >
1808               hci.keepZooKeeperWatcherAliveUntil) {
1809 
1810               hci.closeZooKeeperWatcher();
1811               hci.keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1812             }
1813           }
1814           closeMasterProtocol(hci.masterServiceState);
1816         }
1817       }
1818 
1819       @Override
1820       public void stop(String why) {
1821         stoppable.stop(why);
1822       }
1823 
1824       @Override
1825       public boolean isStopped() {
1826         return stoppable.isStopped();
1827       }
1828     }
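         // Editorial timing sketch, from the constants above: with keepAlive = 5 minutes and
         // the chore running every minute, a watcher released at time t gets a deadline of
         // t + 5 min; the first chore tick after that deadline closes it, so the actual close
         // lands between five and six minutes after the last release.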
1829 
1830     private void closeZooKeeperWatcher() {
1831       synchronized (masterAndZKLock) {
1832         if (keepAliveZookeeper != null) {
1833           LOG.info("Closing zookeeper sessionid=0x" +
1834             Long.toHexString(
1835               keepAliveZookeeper.getRecoverableZooKeeper().getSessionId()));
1836           keepAliveZookeeper.internalClose();
1837           keepAliveZookeeper = null;
1838         }
1839         keepAliveZookeeperUserCount.set(0);
1840       }
1841     }
1842 
1843     final MasterServiceState masterServiceState = new MasterServiceState(this);
1844 
1845     @Override
1846     public MasterService.BlockingInterface getMaster() throws MasterNotRunningException {
1847       return getKeepAliveMasterService();
1848     }
1849 
1850     private void resetMasterServiceState(final MasterServiceState mss) {
1851       mss.userCount++;
1852       mss.keepAliveUntil = Long.MAX_VALUE;
1853     }
1854 
1855     @Override
1856     public MasterKeepAliveConnection getKeepAliveMasterService()
1857     throws MasterNotRunningException {
1858       synchronized (masterAndZKLock) {
1859         if (!isKeepAliveMasterConnectedAndRunning(this.masterServiceState)) {
1860           MasterServiceStubMaker stubMaker = new MasterServiceStubMaker();
1861           this.masterServiceState.stub = stubMaker.makeStub();
1862         }
1863         resetMasterServiceState(this.masterServiceState);
1864       }
1865       // Ugly delegation just so we can add in a Close method.
1866       final MasterService.BlockingInterface stub = this.masterServiceState.stub;
1867       return new MasterKeepAliveConnection() {
1868         MasterServiceState mss = masterServiceState;
1869         @Override
1870         public AddColumnResponse addColumn(RpcController controller, AddColumnRequest request)
1871         throws ServiceException {
1872           return stub.addColumn(controller, request);
1873         }
1874 
1875         @Override
1876         public DeleteColumnResponse deleteColumn(RpcController controller,
1877             DeleteColumnRequest request)
1878         throws ServiceException {
1879           return stub.deleteColumn(controller, request);
1880         }
1881 
1882         @Override
1883         public ModifyColumnResponse modifyColumn(RpcController controller,
1884             ModifyColumnRequest request)
1885         throws ServiceException {
1886           return stub.modifyColumn(controller, request);
1887         }
1888 
1889         @Override
1890         public MoveRegionResponse moveRegion(RpcController controller,
1891             MoveRegionRequest request) throws ServiceException {
1892           return stub.moveRegion(controller, request);
1893         }
1894 
1895         @Override
1896         public DispatchMergingRegionsResponse dispatchMergingRegions(
1897             RpcController controller, DispatchMergingRegionsRequest request)
1898             throws ServiceException {
1899           return stub.dispatchMergingRegions(controller, request);
1900         }
1901 
1902         @Override
1903         public AssignRegionResponse assignRegion(RpcController controller,
1904             AssignRegionRequest request) throws ServiceException {
1905           return stub.assignRegion(controller, request);
1906         }
1907 
1908         @Override
1909         public UnassignRegionResponse unassignRegion(RpcController controller,
1910             UnassignRegionRequest request) throws ServiceException {
1911           return stub.unassignRegion(controller, request);
1912         }
1913 
1914         @Override
1915         public OfflineRegionResponse offlineRegion(RpcController controller,
1916             OfflineRegionRequest request) throws ServiceException {
1917           return stub.offlineRegion(controller, request);
1918         }
1919 
1920         @Override
1921         public DeleteTableResponse deleteTable(RpcController controller,
1922             DeleteTableRequest request) throws ServiceException {
1923           return stub.deleteTable(controller, request);
1924         }
1925 
1926         @Override
1927         public EnableTableResponse enableTable(RpcController controller,
1928             EnableTableRequest request) throws ServiceException {
1929           return stub.enableTable(controller, request);
1930         }
1931 
1932         @Override
1933         public DisableTableResponse disableTable(RpcController controller,
1934             DisableTableRequest request) throws ServiceException {
1935           return stub.disableTable(controller, request);
1936         }
1937 
1938         @Override
1939         public ModifyTableResponse modifyTable(RpcController controller,
1940             ModifyTableRequest request) throws ServiceException {
1941           return stub.modifyTable(controller, request);
1942         }
1943 
1944         @Override
1945         public CreateTableResponse createTable(RpcController controller,
1946             CreateTableRequest request) throws ServiceException {
1947           return stub.createTable(controller, request);
1948         }
1949 
1950         @Override
1951         public ShutdownResponse shutdown(RpcController controller,
1952             ShutdownRequest request) throws ServiceException {
1953           return stub.shutdown(controller, request);
1954         }
1955 
1956         @Override
1957         public StopMasterResponse stopMaster(RpcController controller,
1958             StopMasterRequest request) throws ServiceException {
1959           return stub.stopMaster(controller, request);
1960         }
1961 
1962         @Override
1963         public BalanceResponse balance(RpcController controller,
1964             BalanceRequest request) throws ServiceException {
1965           return stub.balance(controller, request);
1966         }
1967 
1968         @Override
1969         public SetBalancerRunningResponse setBalancerRunning(
1970             RpcController controller, SetBalancerRunningRequest request)
1971             throws ServiceException {
1972           return stub.setBalancerRunning(controller, request);
1973         }
1974 
1975         @Override
1976         public RunCatalogScanResponse runCatalogScan(RpcController controller,
1977             RunCatalogScanRequest request) throws ServiceException {
1978           return stub.runCatalogScan(controller, request);
1979         }
1980 
1981         @Override
1982         public EnableCatalogJanitorResponse enableCatalogJanitor(
1983             RpcController controller, EnableCatalogJanitorRequest request)
1984             throws ServiceException {
1985           return stub.enableCatalogJanitor(controller, request);
1986         }
1987 
1988         @Override
1989         public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(
1990             RpcController controller, IsCatalogJanitorEnabledRequest request)
1991             throws ServiceException {
1992           return stub.isCatalogJanitorEnabled(controller, request);
1993         }
1994 
1995         @Override
1996         public CoprocessorServiceResponse execMasterService(
1997             RpcController controller, CoprocessorServiceRequest request)
1998             throws ServiceException {
1999           return stub.execMasterService(controller, request);
2000         }
2001 
2002         @Override
2003         public SnapshotResponse snapshot(RpcController controller,
2004             SnapshotRequest request) throws ServiceException {
2005           return stub.snapshot(controller, request);
2006         }
2007 
2008         @Override
2009         public GetCompletedSnapshotsResponse getCompletedSnapshots(
2010             RpcController controller, GetCompletedSnapshotsRequest request)
2011             throws ServiceException {
2012           return stub.getCompletedSnapshots(controller, request);
2013         }
2014 
2015         @Override
2016         public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2017             DeleteSnapshotRequest request) throws ServiceException {
2018           return stub.deleteSnapshot(controller, request);
2019         }
2020 
2021         @Override
2022         public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2023             IsSnapshotDoneRequest request) throws ServiceException {
2024           return stub.isSnapshotDone(controller, request);
2025         }
2026 
2027         @Override
2028         public RestoreSnapshotResponse restoreSnapshot(
2029             RpcController controller, RestoreSnapshotRequest request)
2030             throws ServiceException {
2031           return stub.restoreSnapshot(controller, request);
2032         }
2033 
2034         @Override
2035         public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(
2036             RpcController controller, IsRestoreSnapshotDoneRequest request)
2037             throws ServiceException {
2038           return stub.isRestoreSnapshotDone(controller, request);
2039         }
2040 
2041         @Override
2042         public ExecProcedureResponse execProcedure(
2043             RpcController controller, ExecProcedureRequest request)
2044             throws ServiceException {
2045           return stub.execProcedure(controller, request);
2046         }
2047 
2048         @Override
2049         public IsProcedureDoneResponse isProcedureDone(RpcController controller,
2050             IsProcedureDoneRequest request) throws ServiceException {
2051           return stub.isProcedureDone(controller, request);
2052         }
2053 
2054         @Override
2055         public IsMasterRunningResponse isMasterRunning(
2056             RpcController controller, IsMasterRunningRequest request)
2057             throws ServiceException {
2058           return stub.isMasterRunning(controller, request);
2059         }
2060 
2061         @Override
2062         public ModifyNamespaceResponse modifyNamespace(RpcController controller,
2063             ModifyNamespaceRequest request)
2064         throws ServiceException {
2065           return stub.modifyNamespace(controller, request);
2066         }
2067 
2068         @Override
2069         public CreateNamespaceResponse createNamespace(RpcController controller, CreateNamespaceRequest request) throws ServiceException {
2070           return stub.createNamespace(controller, request);
2071         }
2072 
2073         @Override
2074         public DeleteNamespaceResponse deleteNamespace(RpcController controller, DeleteNamespaceRequest request) throws ServiceException {
2075           return stub.deleteNamespace(controller, request);
2076         }
2077 
2078         @Override
2079         public GetNamespaceDescriptorResponse getNamespaceDescriptor(RpcController controller, GetNamespaceDescriptorRequest request) throws ServiceException {
2080           return stub.getNamespaceDescriptor(controller, request);
2081         }
2082 
2083         @Override
2084         public ListNamespaceDescriptorsResponse listNamespaceDescriptors(RpcController controller, ListNamespaceDescriptorsRequest request) throws ServiceException {
2085           return stub.listNamespaceDescriptors(controller, request);
2086         }
2087 
2088         @Override
2089         public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(RpcController controller, ListTableDescriptorsByNamespaceRequest request) throws ServiceException {
2090           return stub.listTableDescriptorsByNamespace(controller, request);
2091         }
2092 
2093         @Override
2094         public ListTableNamesByNamespaceResponse listTableNamesByNamespace(RpcController controller,
2095               ListTableNamesByNamespaceRequest request) throws ServiceException {
2096           return stub.listTableNamesByNamespace(controller, request);
2097         }
2098 
2099         @Override
2100         public void close() {
2101           release(this.mss);
2102         }
2103 
2104         @Override
2105         public GetSchemaAlterStatusResponse getSchemaAlterStatus(
2106             RpcController controller, GetSchemaAlterStatusRequest request)
2107             throws ServiceException {
2108           return stub.getSchemaAlterStatus(controller, request);
2109         }
2110 
2111         @Override
2112         public GetTableDescriptorsResponse getTableDescriptors(
2113             RpcController controller, GetTableDescriptorsRequest request)
2114             throws ServiceException {
2115           return stub.getTableDescriptors(controller, request);
2116         }
2117 
2118         @Override
2119         public GetTableNamesResponse getTableNames(
2120             RpcController controller, GetTableNamesRequest request)
2121             throws ServiceException {
2122           return stub.getTableNames(controller, request);
2123         }
2124 
2125         @Override
2126         public GetClusterStatusResponse getClusterStatus(
2127             RpcController controller, GetClusterStatusRequest request)
2128             throws ServiceException {
2129           return stub.getClusterStatus(controller, request);
2130         }
2131       };
2132     }
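         // A hedged usage sketch (editorial; it is the pattern the listTables and
         // getHTableDescriptor methods below follow):
         //   MasterKeepAliveConnection master = getKeepAliveMasterService();
         //   try {
         //     // ... issue master RPCs through the returned stub ...
         //   } finally {
         //     master.close(); // decrements userCount; DelayedClosing reclaims the stub later
         //   }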
2133 
2134 
2135     private static void release(MasterServiceState mss) {
2136       if (mss != null && mss.connection != null) {
2137         ((HConnectionImplementation)mss.connection).releaseMaster(mss);
2138       }
2139     }
2140 
2141     private boolean isKeepAliveMasterConnectedAndRunning(MasterServiceState mss) {
2142       if (mss.getStub() == null){
2143         return false;
2144       }
2145       try {
2146         return mss.isMasterRunning();
2147       } catch (UndeclaredThrowableException e) {
2148         // It's somewhat messy, but we can receive exceptions such as
2149         //  java.net.ConnectException even though they're not declared. So we catch them...
2150         LOG.info("Master connection is not running anymore", e.getUndeclaredThrowable());
2151         return false;
2152       } catch (ServiceException se) {
2153         LOG.warn("Checking master connection", se);
2154         return false;
2155       }
2156     }
2157 
2158     void releaseMaster(MasterServiceState mss) {
2159       if (mss.getStub() == null) return;
2160       synchronized (masterAndZKLock) {
2161         --mss.userCount;
2162         if (mss.userCount <= 0) {
2163           mss.keepAliveUntil = System.currentTimeMillis() + keepAlive;
2164         }
2165       }
2166     }
2167 
2168     private void closeMasterService(MasterServiceState mss) {
2169       if (mss.getStub() != null) {
2170         LOG.info("Closing master protocol: " + mss);
2171         mss.clearStub();
2172       }
2173       mss.userCount = 0;
2174     }
2175 
2176     /**
2177      * Immediately close the shared master connection. Can be called by the delayed close
2178      * or when closing the connection itself.
2179      */
2180     private void closeMaster() {
2181       synchronized (masterAndZKLock) {
2182         closeMasterService(masterServiceState);
2183       }
2184     }
2185 
2186     void updateCachedLocation(HRegionInfo hri, HRegionLocation source,
2187                               ServerName serverName, long seqNum) {
2188       HRegionLocation newHrl = new HRegionLocation(hri, serverName, seqNum);
2189       cacheLocation(hri.getTable(), source, newHrl);
2190     }
2191 
2192    /**
2193     * Deletes the cached location of the region if necessary, based on some error from source.
2194     * @param hri The region in question.
2195     * @param source The source of the error that prompts us to invalidate cache.
2196     */
2197    void deleteCachedLocation(HRegionInfo hri, HRegionLocation source) {
2198      ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(hri.getTable());
2199      tableLocations.remove(hri.getStartKey(), source);
2200    }
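        // Editorial note: the two-argument ConcurrentMap.remove(key, value) above only removes
        // the entry while it is still mapped to 'source', so a location that another thread
        // has already refreshed is left intact.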
2201 
2202     @Override
2203     public void deleteCachedRegionLocation(final HRegionLocation location) {
2204       if (location == null) {
2205         return;
2206       }
2207 
2208       HRegionLocation removedLocation;
2209       TableName tableName = location.getRegionInfo().getTable();
2210       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
2211       removedLocation = tableLocations.remove(location.getRegionInfo().getStartKey());
2212       if (LOG.isDebugEnabled() && removedLocation != null) {
2213         LOG.debug("Removed " +
2214             location.getRegionInfo().getRegionNameAsString() +
2215             " for tableName=" + tableName +
2216             " from cache");
2217       }
2218     }
2219 
2220     /**
2221      * Update the location with the new value (if the exception is a RegionMovedException)
2222      * or delete it from the cache. Does nothing if we can be sure from the exception that
2223      * the location is still accurate, or if the cache has already been updated.
2224      * @param exception an object (to simplify user code) in which we will look for a nested
2225      *                  or wrapped RegionMovedException (or both)
2226      * @param source server that is the source of the location update.
2227      */
2228     @Override
2229     public void updateCachedLocations(final TableName tableName, byte[] rowkey,
2230       final Object exception, final HRegionLocation source) {
2231       if (rowkey == null || tableName == null) {
2232         LOG.warn("Coding error, see method javadoc. row=" + (rowkey == null ? "null" : rowkey) +
2233             ", tableName=" + (tableName == null ? "null" : tableName));
2234         return;
2235       }
2236 
2237       if (source == null || source.getServerName() == null){
2238         // This should not happen, but let's guard against it anyway.
2239         return;
2240       }
2241 
2242       // Is it something we have already updated?
2243       final HRegionLocation oldLocation = getCachedLocation(tableName, rowkey);
2244       if (oldLocation == null || !source.getServerName().equals(oldLocation.getServerName())) {
2245         // There is no such location in the cache (it's been removed already) or
2246         // the cache has already been refreshed with a different location.  => nothing to do
2247         return;
2248       }
2249 
2250       HRegionInfo regionInfo = oldLocation.getRegionInfo();
2251       Throwable cause = findException(exception);
2252       if (cause != null) {
2253         if (cause instanceof RegionTooBusyException || cause instanceof RegionOpeningException) {
2254           // We know that the region is still on this region server
2255           return;
2256         }
2257 
2258         if (cause instanceof RegionMovedException) {
2259           RegionMovedException rme = (RegionMovedException) cause;
2260           if (LOG.isTraceEnabled()) {
2261             LOG.trace("Region " + regionInfo.getRegionNameAsString() + " moved to " +
2262                 rme.getHostname() + ":" + rme.getPort() +
2263                 " according to " + source.getHostnamePort());
2264           }
2265           // We know that the region is not anymore on this region server, but we know
2266           //  the new location.
2267           updateCachedLocation(
2268               regionInfo, source, rme.getServerName(), rme.getLocationSeqNum());
2269           return;
2270         }
2271       }
2272 
2273       // If we're here, it means that we cannot be sure about the location, so we remove it
2274       //  from the cache.
2275       deleteCachedLocation(regionInfo, source);
2276     }
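         // Illustrative walk-through (editorial; servers assumed): an rpc against server A
         // fails with a RemoteException wrapping a RegionMovedException. findException unwraps
         // it; the cached entry still points at A, so rather than being deleted the entry is
         // replaced using the destination server and seqNum carried by the RegionMovedException.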
2277 
2278     @Override
2279     public void updateCachedLocations(final byte[] tableName, byte[] rowkey,
2280       final Object exception, final HRegionLocation source) {
2281       updateCachedLocations(TableName.valueOf(tableName), rowkey, exception, source);
2282     }
2283 
2284     @Override
2285     @Deprecated
2286     public void processBatch(List<? extends Row> list,
2287         final TableName tableName,
2288         ExecutorService pool,
2289         Object[] results) throws IOException, InterruptedException {
2290       // This belongs in HTable!!! Not in here.  St.Ack
2291 
2292       // results must be the same size as list
2293       if (results.length != list.size()) {
2294         throw new IllegalArgumentException(
2295           "argument results must be the same size as argument list");
2296       }
2297       processBatchCallback(list, tableName, pool, results, null);
2298     }
2299 
2300     @Override
2301     @Deprecated
2302     public void processBatch(List<? extends Row> list,
2303         final byte[] tableName,
2304         ExecutorService pool,
2305         Object[] results) throws IOException, InterruptedException {
2306       processBatch(list, TableName.valueOf(tableName), pool, results);
2307     }
2308 
2309     /**
2310      * Send the queries in parallel on the different region servers. Retries on failures.
2311      * If the method returns normally, there was no error and the 'results' array will
2312      * contain no exceptions. On error, an exception is thrown, and the 'results' array will
2313      * contain a mix of results and exceptions.
2314      * @deprecated since 0.96 - Use {@link HTable#processBatchCallback} instead
2315      */
2316     @Override
2317     @Deprecated
2318     public <R> void processBatchCallback(
2319       List<? extends Row> list,
2320       TableName tableName,
2321       ExecutorService pool,
2322       Object[] results,
2323       Batch.Callback<R> callback)
2324       throws IOException, InterruptedException {
2325 
2326       // To fulfill the original contract, we have a special callback. This callback
2327       //  will set the results in the Object array.
2328       ObjectResultFiller<R> cb = new ObjectResultFiller<R>(results, callback);
2329       AsyncProcess<?> asyncProcess = createAsyncProcess(tableName, pool, cb, conf);
2330 
2331       // We're doing a submit all. This way, the originalIndex will match the initial list.
2332       asyncProcess.submitAll(list);
2333       asyncProcess.waitUntilDone();
2334 
2335       if (asyncProcess.hasError()) {
2336         throw asyncProcess.getErrors();
2337       }
2338     }
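         // A hedged usage sketch (editorial; names assumed) for this deprecated entry point:
         //   List<Put> puts = ...;
         //   Object[] results = new Object[puts.size()];
         //   connection.processBatchCallback(puts, tableName, pool, results, null);
         //   // on normal return, results[i] holds the outcome of puts.get(i)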
2339 
2340     @Override
2341     @Deprecated
2342     public <R> void processBatchCallback(
2343       List<? extends Row> list,
2344       byte[] tableName,
2345       ExecutorService pool,
2346       Object[] results,
2347       Batch.Callback<R> callback)
2348       throws IOException, InterruptedException {
2349       processBatchCallback(list, TableName.valueOf(tableName), pool, results, callback);
2350     }
2351 
2352     // For tests.
2353     protected <R> AsyncProcess createAsyncProcess(TableName tableName, ExecutorService pool,
2354            AsyncProcess.AsyncProcessCallback<R> callback, Configuration conf) {
2355       return new AsyncProcess<R>(this, tableName, pool, callback, conf,
2356           RpcRetryingCallerFactory.instantiate(conf), RpcControllerFactory.instantiate(conf));
2357     }
2358 
2359 
2360     /**
2361      * Fill the result array for the interfaces using it.
2362      */
2363     private static class ObjectResultFiller<Res>
2364         implements AsyncProcess.AsyncProcessCallback<Res> {
2365 
2366       private final Object[] results;
2367       private Batch.Callback<Res> callback;
2368 
2369       ObjectResultFiller(Object[] results, Batch.Callback<Res> callback) {
2370         this.results = results;
2371         this.callback = callback;
2372       }
2373 
2374       @Override
2375       public void success(int pos, byte[] region, Row row, Res result) {
2376         assert pos < results.length;
2377         results[pos] = result;
2378         if (callback != null) {
2379           callback.update(region, row.getRow(), result);
2380         }
2381       }
2382 
2383       @Override
2384       public boolean failure(int pos, byte[] region, Row row, Throwable t) {
2385         assert pos < results.length;
2386         results[pos] = t;
2387         //Batch.Callback<Res> was not called on failure in 0.94. We keep this.
2388         return true; // we want to have this failure in the failures list.
2389       }
2390 
2391       @Override
2392       public boolean retriableFailure(int originalIndex, Row row, byte[] region,
2393                                       Throwable exception) {
2394         return true; // we retry
2395       }
2396     }
2397 
2398 
2399     /*
2400      * Return the number of cached regions for a table. It will only be called
2401      * from a unit test.
2402      */
2403     int getNumberOfCachedRegionLocations(final TableName tableName) {
2404       Map<byte[], HRegionLocation> tableLocs = this.cachedRegionLocations.get(tableName);
2405       if (tableLocs == null) {
2406         return 0;
2407       }
2408       return tableLocs.values().size();
2409     }
2410 
2411     /**
2412      * Check the region cache to see whether a region is cached yet or not.
2413      * Called by unit tests.
2414      * @param tableName tableName
2415      * @param row row
2416      * @return Region cached or not.
2417      */
2418     boolean isRegionCached(TableName tableName, final byte[] row) {
2419       HRegionLocation location = getCachedLocation(tableName, row);
2420       return location != null;
2421     }
2422 
2423     @Override
2424     public void setRegionCachePrefetch(final TableName tableName,
2425         final boolean enable) {
2426       if (!enable) {
2427         regionCachePrefetchDisabledTables.add(Bytes.mapKey(tableName.getName()));
2428       } else {
2430         regionCachePrefetchDisabledTables.remove(Bytes.mapKey(tableName.getName()));
2431       }
2432     }
2433 
2434     @Override
2435     public void setRegionCachePrefetch(final byte[] tableName,
2436         final boolean enable) {
2437       setRegionCachePrefetch(TableName.valueOf(tableName), enable);
2438     }
2439 
2440     @Override
2441     public boolean getRegionCachePrefetch(TableName tableName) {
2442       return usePrefetch &&
2443           !regionCachePrefetchDisabledTables.contains(Bytes.mapKey(tableName.getName()));
2444     }
2445 
2446     @Override
2447     public boolean getRegionCachePrefetch(byte[] tableName) {
2448       return getRegionCachePrefetch(TableName.valueOf(tableName));
2449     }
2450 
2451     @Override
2452     public void abort(final String msg, Throwable t) {
2453       if (t instanceof KeeperException.SessionExpiredException
2454         && keepAliveZookeeper != null) {
2455         synchronized (masterAndZKLock) {
2456           if (keepAliveZookeeper != null) {
2457             LOG.warn("This client just lost it's session with ZooKeeper," +
2458               " closing it." +
2459               " It will be recreated next time someone needs it", t);
2460             closeZooKeeperWatcher();
2461           }
2462         }
2463       } else {
2464         if (t != null) {
2465           LOG.fatal(msg, t);
2466         } else {
2467           LOG.fatal(msg);
2468         }
2469         this.aborted = true;
2470         close();
2471         this.closed = true;
2472       }
2473     }
2474 
2475     @Override
2476     public boolean isClosed() {
2477       return this.closed;
2478     }
2479 
2480     @Override
2481     public boolean isAborted(){
2482       return this.aborted;
2483     }
2484 
2485     @Override
2486     public int getCurrentNrHRS() throws IOException {
2487       return this.registry.getCurrentNrHRS();
2488     }
2489 
2490     /**
2491      * Increment this client's reference count.
2492      */
2493     void incCount() {
2494       ++refCount;
2495     }
2496 
2497     /**
2498      * Decrement this client's reference count.
2499      */
2500     void decCount() {
2501       if (refCount > 0) {
2502         --refCount;
2503       }
2504     }
2505 
2506     /**
2507      * Return whether this client has no remaining references.
2508      *
2509      * @return true if this client has no reference; false otherwise
2510      */
2511     boolean isZeroReference() {
2512       return refCount == 0;
2513     }
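         // Editorial sketch of the reference-counting contract, assuming the usual managed
         // connection flow: HConnectionManager calls incCount() each time it hands this
         // connection out, consumers trigger decCount() when they close, and the manager may
         // tear the connection down via internalClose() once isZeroReference() holds.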
2514 
2515     void internalClose() {
2516       if (this.closed) {
2517         return;
2518       }
2519       delayedClosing.stop("Closing connection");
2520       closeMaster();
2521       shutdownBatchPool();
2522       this.closed = true;
2523       closeZooKeeperWatcher();
2524       this.stubs.clear();
2525       if (clusterStatusListener != null) {
2526         clusterStatusListener.close();
2527       }
2528       if (rpcClient != null) {
2529         rpcClient.stop();
2530       }
2531     }
2532 
2533     @Override
2534     public void close() {
2535       if (managed) {
2536         if (aborted) {
2537           HConnectionManager.deleteStaleConnection(this);
2538         } else {
2539           HConnectionManager.deleteConnection(this, false);
2540         }
2541       } else {
2542         internalClose();
2543       }
2544     }
2545 
2546     /**
2547      * Close the connection for good, regardless of what the current value of
2548      * {@link #refCount} is. Ideally, {@link #refCount} should be zero at this
2549      * point, which would be the case if all of its consumers close the
2550      * connection. However, on the off chance that someone is unable to close
2551      * the connection, perhaps because it bailed out prematurely, the method
2552      * below will ensure that this {@link HConnection} instance is cleaned up.
2553      * Caveat: The JVM may take an unknown amount of time to call finalize on an
2554      * unreachable object, so our hope is that every consumer cleans up after
2555      * itself, like any good citizen.
2556      */
2557     @Override
2558     protected void finalize() throws Throwable {
2559       super.finalize();
2560       // Pretend as if we are about to release the last remaining reference
2561       refCount = 1;
2562       close();
2563     }
2564 
2565     @Override
2566     public HTableDescriptor[] listTables() throws IOException {
2567       MasterKeepAliveConnection master = getKeepAliveMasterService();
2568       try {
2569         GetTableDescriptorsRequest req =
2570           RequestConverter.buildGetTableDescriptorsRequest((List<TableName>)null);
2571         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2572       } catch (ServiceException se) {
2573         throw ProtobufUtil.getRemoteException(se);
2574       } finally {
2575         master.close();
2576       }
2577     }
2578 
2579     @Override
2580     public String[] getTableNames() throws IOException {
2581       TableName[] tableNames = listTableNames();
2582       String[] result = new String[tableNames.length];
2583       for (int i = 0; i < tableNames.length; i++) {
2584         result[i] = tableNames[i].getNameAsString();
2585       }
2586       return result;
2587     }
2588 
2589     @Override
2590     public TableName[] listTableNames() throws IOException {
2591       MasterKeepAliveConnection master = getKeepAliveMasterService();
2592       try {
2593         return ProtobufUtil.getTableNameArray(master.getTableNames(null,
2594             GetTableNamesRequest.newBuilder().build())
2595           .getTableNamesList());
2596       } catch (ServiceException se) {
2597         throw ProtobufUtil.getRemoteException(se);
2598       } finally {
2599         master.close();
2600       }
2601     }
2602 
2603     @Override
2604     public HTableDescriptor[] getHTableDescriptorsByTableName(
2605         List<TableName> tableNames) throws IOException {
2606       if (tableNames == null || tableNames.isEmpty()) return new HTableDescriptor[0];
2607       MasterKeepAliveConnection master = getKeepAliveMasterService();
2608       try {
2609         GetTableDescriptorsRequest req =
2610           RequestConverter.buildGetTableDescriptorsRequest(tableNames);
2611         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2612       } catch (ServiceException se) {
2613         throw ProtobufUtil.getRemoteException(se);
2614       } finally {
2615         master.close();
2616       }
2617     }
2618 
2619     @Override
2620     public HTableDescriptor[] getHTableDescriptors(
2621         List<String> names) throws IOException {
2622       List<TableName> tableNames = new ArrayList<TableName>(names.size());
2623       for(String name : names) {
2624         tableNames.add(TableName.valueOf(name));
2625       }
2626 
2627       return getHTableDescriptorsByTableName(tableNames);
2628     }
2629 
2630     @Override
2631     public NonceGenerator getNonceGenerator() {
2632       return this.nonceGenerator;
2633     }
2634 
2635     /**
2636      * Connects to the master to get the table descriptor.
2637      * @param tableName table name
2638      * @return the table descriptor; null only if the passed table name is null
2639      * @throws IOException if the connection to master fails or if the table
2640      *  is not found.
2641      */
2642     @Override
2643     public HTableDescriptor getHTableDescriptor(final TableName tableName)
2644     throws IOException {
2645       if (tableName == null) return null;
2646       if (tableName.equals(TableName.META_TABLE_NAME)) {
2647         return HTableDescriptor.META_TABLEDESC;
2648       }
2649       MasterKeepAliveConnection master = getKeepAliveMasterService();
2650       GetTableDescriptorsResponse htds;
2651       try {
2652         GetTableDescriptorsRequest req =
2653           RequestConverter.buildGetTableDescriptorsRequest(tableName);
2654         htds = master.getTableDescriptors(null, req);
2655       } catch (ServiceException se) {
2656         throw ProtobufUtil.getRemoteException(se);
2657       } finally {
2658         master.close();
2659       }
2660       if (!htds.getTableSchemaList().isEmpty()) {
2661         return HTableDescriptor.convert(htds.getTableSchemaList().get(0));
2662       }
2663       throw new TableNotFoundException(tableName.getNameAsString());
2664     }
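
         // Usage sketch (illustrative; 'connection' is an HConnection held by
         // the caller, and the table name is hypothetical): a missing table
         // surfaces as a TableNotFoundException rather than a null descriptor
         // (null is only returned for a null table name).
         //
         //   try {
         //     HTableDescriptor htd =
         //         connection.getHTableDescriptor(TableName.valueOf("myTable"));
         //   } catch (TableNotFoundException e) {
         //     // the table does not exist on this cluster
         //   }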
2665 
2666     @Override
2667     public HTableDescriptor getHTableDescriptor(final byte[] tableName)
2668     throws IOException {
2669       return getHTableDescriptor(TableName.valueOf(tableName));
2670     }
2671   }
2672 
2673   /**
2674    * The record of errors for servers.
2675    */
2676   static class ServerErrorTracker {
2677     // We need a concurrent map here, as we could have multiple threads updating it in parallel.
2678     private final ConcurrentMap<HRegionLocation, ServerErrors> errorsByServer =
2679         new ConcurrentHashMap<HRegionLocation, ServerErrors>();
2680     private final long canRetryUntil;
2681     private final int maxRetries;
2682     private final String startTrackingTime;
2683 
2684     public ServerErrorTracker(long timeout, int maxRetries) {
2685       this.maxRetries = maxRetries;
2686       this.canRetryUntil = EnvironmentEdgeManager.currentTimeMillis() + timeout;
2687       this.startTrackingTime = new Date().toString();
2688     }
2689 
2690     /**
2691      * We stop retrying only when we have exhausted BOTH the number of retries and the time allocated.
2692      */
2693     boolean canRetryMore(int numRetry) {
2694       // If only a single try is allowed, the elapsed time must not be taken into account.
2695       return numRetry < maxRetries || (maxRetries > 1 &&
2696           EnvironmentEdgeManager.currentTimeMillis() < this.canRetryUntil);
2697     }
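
         // Worked example (illustrative): with maxRetries = 3 and time left on
         // the clock, canRetryMore(3) is still true, because the time clause
         // applies whenever maxRetries > 1. Once the deadline passes,
         // canRetryMore(3) becomes false, while canRetryMore(2) stays true.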
2698 
2699     /**
2700      * Calculates the back-off time for a retrying request to a particular server.
2701      *
2702      * @param server    The server in question.
2703      * @param basePause The base pause time (the default HConnectionImplementation pause).
2704      * @return The time to wait before sending next request.
2705      */
2706     long calculateBackoffTime(HRegionLocation server, long basePause) {
2707       long result;
2708       ServerErrors errorStats = errorsByServer.get(server);
2709       if (errorStats != null) {
2710         result = ConnectionUtils.getPauseTime(basePause, errorStats.retries.get());
2711       } else {
2712         result = 0; // yes, if the server is not in our list we don't wait before retrying.
2713       }
2714       return result;
2715     }
2716 
2717     /**
2718      * Reports an error on the given server so the tracker can do whatever bookkeeping is necessary.
2719      *
2720      * @param server The server in question.
2721      */
2722     void reportServerError(HRegionLocation server) {
2723       ServerErrors errors = errorsByServer.get(server);
2724       if (errors != null) {
2725         errors.addError();
2726       } else {
2727         errors = errorsByServer.putIfAbsent(server, new ServerErrors());
             // putIfAbsent returns the existing instance when another thread
             // won the race; count the error there. Otherwise the freshly
             // inserted counter stays at zero, so the first retry backs off by
             // just the base pause.
2728         if (errors != null) {
2729           errors.addError();
2730         }
2731       }
2732     }
2733 
2734     String getStartTrackingTime() {
2735       return startTrackingTime;
2736     }
2737 
2738     /**
2739      * The record of errors for a server.
2740      */
2741     private static class ServerErrors {
2742       public final AtomicInteger retries = new AtomicInteger(0);
2743 
2744       public void addError() {
2745         retries.incrementAndGet();
2746       }
2747     }
2748   }
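
       // Usage sketch (illustrative; 'location' stands for the
       // HRegionLocation being retried): one tracker is shared across the
       // retries of a single operation. A first error leaves the counter at
       // zero, so with basePause = 100ms the successive back-offs follow the
       // HConstants.RETRY_BACKOFF multipliers: roughly 100, 200, 300, 500,
       // 1000, ... ms (ConnectionUtils.getPauseTime also adds a small random
       // jitter), while a server with no recorded errors is retried without
       // waiting.
       //
       //   ServerErrorTracker tracker = new ServerErrorTracker(60000L, 10);
       //   int numRetries = 0;
       //   while (tracker.canRetryMore(numRetries)) {
       //     Thread.sleep(tracker.calculateBackoffTime(location, 100));
       //     try {
       //       // ... issue the call against 'location' ...
       //       break;
       //     } catch (IOException e) {
       //       tracker.reportServerError(location);
       //       numRetries++;
       //     }
       //   }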
2749 
2750   /**
2751    * Look for an exception we know how to handle inside the remote exception:
2752    * - hadoop.ipc wrapped exceptions
2753    * - nested exceptions
2754    *
2755    * Looks for: RegionMovedException / RegionOpeningException / RegionTooBusyException
        * @param exception the (possibly wrapped) exception to examine; may be null
2756    * @return the matching exception if found, null otherwise.
2757    */
2758   public static Throwable findException(Object exception) {
2759     if (exception == null || !(exception instanceof Throwable)) {
2760       return null;
2761     }
2762     Throwable cur = (Throwable) exception;
2763     while (cur != null) {
2764       if (cur instanceof RegionMovedException || cur instanceof RegionOpeningException
2765           || cur instanceof RegionTooBusyException) {
2766         return cur;
2767       }
2768       if (cur instanceof RemoteException) {
2769         RemoteException re = (RemoteException) cur;
2770         cur = re.unwrapRemoteException(
2771             RegionOpeningException.class, RegionMovedException.class,
2772             RegionTooBusyException.class);
2773         if (cur == null) {
2774           cur = re.unwrapRemoteException();
2775         }
2776         // unwrapRemoteException can return the exception given as a parameter when it
2777         // cannot unwrap it; in this case, there is no need to look further.
2778         // noinspection ObjectEquality
2779         if (cur == re) {
2780           return null;
2781         }
2782       } else {
2783         cur = cur.getCause();
2784       }
2785     }
2786 
2787     return null;
2788   }
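
       // Usage sketch (illustrative; 'serverException' stands for the
       // Throwable caught by the caller): unwrap a server-side failure and
       // react to the specific region-level condition, if any.
       //
       //   Throwable cause = findException(serverException);
       //   if (cause instanceof RegionMovedException) {
       //     // refresh the cached region location before retrying
       //   } else if (cause instanceof RegionTooBusyException) {
       //     // back off before retrying the same server
       //   }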
2789 
2790   /**
2791    * Set the number of retries to use server-side when trying to communicate
2792    * with another server over {@link HConnection}.  Used when updating catalog
2793    * tables, etc.  Call this method before we create any Connections.
2794    * @param c The Configuration instance to set the retries into.
        * @param sn The name of the server making the change; used only in the log message.
2795    * @param log Used to log what we set in here.
2796    */
2797   public static void setServerSideHConnectionRetries(final Configuration c, final String sn,
2798       final Log log) {
2799     int hcRetries = c.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
2800       HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
2801     // Go big.  Multiply by 10.  If we can't get to meta after this many retries
2802     // then something is seriously wrong.
2803     int serversideMultiplier = c.getInt("hbase.client.serverside.retries.multiplier", 10);
2804     int retries = hcRetries * serversideMultiplier;
2805     c.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
2806     log.debug(sn + " HConnection server-to-server retries=" + retries);
2807   }
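
       // Usage sketch (illustrative; 'serverName' and 'LOG' stand for the
       // caller's own server name and logger): invoked from server-side code
       // before any connection is created, so that server-to-server calls
       // retry harder. With the default multiplier of 10, the configured
       // client retry count is multiplied tenfold.
       //
       //   Configuration conf = HBaseConfiguration.create();
       //   setServerSideHConnectionRetries(conf, serverName.toString(), LOG);
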
2808 }