/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import java.io.Closeable;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.UndeclaredThrowableException;
import java.net.SocketException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotEnabledException;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.exceptions.RegionMovedException;
import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
import org.apache.hadoop.hbase.ipc.RpcClient;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
// The remaining MasterProtos request types used later in this class
// (AddColumnRequest, BalanceRequest, ...) are imported on demand.
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.*;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.ExceptionUtil;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.zookeeper.KeeperException;

import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.BlockingRpcChannel;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;

/**
 * A non-instantiable class that manages creation of {@link HConnection}s.
 * <p>The simplest way to use this class is by using {@link #createConnection(Configuration)}.
 * This creates a new {@link HConnection} to the cluster that is managed by the caller.
 * From this {@link HConnection} {@link HTableInterface} implementations are retrieved
 * with {@link HConnection#getTable(byte[])}. Example:
 * <pre>
 * {@code
 * HConnection connection = HConnectionManager.createConnection(config);
 * HTableInterface table = connection.getTable("table1");
 * try {
 *   // Use the table as needed, for a single operation and a single thread
 * } finally {
 *   table.close();
 *   connection.close();
 * }
 * }</pre>
 * <p>This class has a static Map of {@link HConnection} instances keyed by
 * {@link HConnectionKey}; an {@link HConnectionKey} is identified by a set of
 * {@link Configuration} properties. Invocations of {@link #getConnection(Configuration)}
 * that pass the same {@link Configuration} instance will return the same
 * {@link HConnection} instance ONLY WHEN the set of properties is the same
 * (i.e. if you change properties in your {@link Configuration} instance, such as the RPC timeout
 * or the codec used, HBase will create a new {@link HConnection} instance. For more details on
 * how this is done see {@link HConnectionKey}).
 * <p>Sharing {@link HConnection} instances is usually what you want; all clients
 * of the {@link HConnection} instances share the HConnections' cache of Region
 * locations rather than each having to discover for itself the location of meta, etc.
 * But sharing connections makes cleanup of {@link HConnection} instances a little awkward.
 * Currently, clients clean up by calling {@link #deleteConnection(Configuration)}. This will
 * shut down the zookeeper connection the HConnection was using and clean up all
 * HConnection resources as well as stopping proxies to servers out on the
 * cluster. Not running the cleanup will not end the world; it'll
 * just stall the closeup some and spew some zookeeper connection failed
 * messages into the log.  Running the cleanup on a {@link HConnection} that is
 * subsequently used by another will cause breakage so be careful running
 * cleanup.
 * <p>To create a {@link HConnection} that is not shared by others, you can
 * set property "hbase.client.instance.id" to a unique value for your {@link Configuration}
 * instance, like the following:
 * <pre>
 * {@code
 * conf.set("hbase.client.instance.id", "12345");
 * HConnection connection = HConnectionManager.getConnection(conf);
 * // Use the connection to your heart's delight and then when done...
 * HConnectionManager.deleteConnection(conf);
 * }
 * </pre>
 * <p>Cleanup used to be done inside a shutdown hook.  On startup we'd
 * register a shutdown hook that called {@link #deleteAllConnections()}
 * on its way out, but the order in which shutdown hooks run is not defined, which
 * was problematic for clients of HConnection that wanted to register their
 * own shutdown hooks, so we removed ours, though this shifts the onus for
 * cleanup to the client.
 */
@SuppressWarnings("serial")
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class HConnectionManager {
  static final Log LOG = LogFactory.getLog(HConnectionManager.class);

  public static final String RETRIES_BY_SERVER_KEY = "hbase.client.retries.by.server";
  private static final String CLIENT_NONCES_ENABLED_KEY = "hbase.client.nonces.enabled";

  // An LRU Map of HConnectionKey -> HConnection (TableServer).  All
  // access must be synchronized.  This map is not private because tests
  // need to be able to tinker with it.
  static final Map<HConnectionKey, HConnectionImplementation> CONNECTION_INSTANCES;

  public static final int MAX_CACHED_CONNECTION_INSTANCES;

  /**
   * Global nonceGenerator shared per client. Currently there's no reason to limit its scope.
   * Once it's set under nonceGeneratorCreateLock, it is never unset or changed.
   */
  private static volatile NonceGenerator nonceGenerator = null;
  /** The nonce generator lock. Only taken when creating an HConnection, which then keeps a
   * reference to the shared generator. */
  private static Object nonceGeneratorCreateLock = new Object();

  static {
    // We set instances to one more than the value specified for {@link
    // HConstants#ZOOKEEPER_MAX_CLIENT_CNXNS}. By default, the zk default max
    // connections to the ensemble from the one client is 30, so in that case we
    // should run into zk issues before the LRU hits this value of 31.
    MAX_CACHED_CONNECTION_INSTANCES = HBaseConfiguration.create().getInt(
      HConstants.ZOOKEEPER_MAX_CLIENT_CNXNS, HConstants.DEFAULT_ZOOKEPER_MAX_CLIENT_CNXNS) + 1;
    CONNECTION_INSTANCES = new LinkedHashMap<HConnectionKey, HConnectionImplementation>(
        (int) (MAX_CACHED_CONNECTION_INSTANCES / 0.75F) + 1, 0.75F, true) {
      @Override
      protected boolean removeEldestEntry(
          Map.Entry<HConnectionKey, HConnectionImplementation> eldest) {
        return size() > MAX_CACHED_CONNECTION_INSTANCES;
      }
    };
  }

  /*
   * Non-instantiable.
   */
  private HConnectionManager() {
    super();
  }

  /**
   * @param conn The connection for which to replace the generator.
   * @param cnm Replaces the nonce generator used, for testing.
   * @return old nonce generator.
   */
  @VisibleForTesting
  public static NonceGenerator injectNonceGeneratorForTesting(
      HConnection conn, NonceGenerator cnm) {
    NonceGenerator ng = conn.getNonceGenerator();
    LOG.warn("Nonce generator is being replaced by test code for " + cnm.getClass().getName());
    ((HConnectionImplementation)conn).nonceGenerator = cnm;
    return ng;
  }

  /**
   * Get the connection that goes with the passed <code>conf</code> configuration instance.
   * If no current connection exists, method creates a new connection and keys it using
   * connection-specific properties from the passed {@link Configuration}; see
   * {@link HConnectionKey}.
   * @param conf configuration
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  @Deprecated
  public static HConnection getConnection(final Configuration conf)
  throws IOException {
    HConnectionKey connectionKey = new HConnectionKey(conf);
    synchronized (CONNECTION_INSTANCES) {
      HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
      if (connection == null) {
        connection = (HConnectionImplementation)createConnection(conf, true);
        CONNECTION_INSTANCES.put(connectionKey, connection);
      } else if (connection.isClosed()) {
        HConnectionManager.deleteConnection(connectionKey, true);
        connection = (HConnectionImplementation)createConnection(conf, true);
        CONNECTION_INSTANCES.put(connectionKey, connection);
      }
      connection.incCount();
      return connection;
    }
  }
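
  /* A minimal sketch (not from the original source) of the shared-connection
   * life cycle that getConnection(Configuration) implements; the exact counts
   * are illustrative:
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   HConnection c1 = HConnectionManager.getConnection(conf);
   *   HConnection c2 = HConnectionManager.getConnection(conf);
   *   // c1 == c2: the same HConnectionKey maps to the cached instance,
   *   // whose reference count is now 2.
   *   HConnectionManager.deleteConnection(conf); // count drops to 1
   *   HConnectionManager.deleteConnection(conf); // count drops to 0; connection closed
   */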

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   *
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * HConnection connection = HConnectionManager.createConnection(conf);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }
   * </pre>
   *
   * @param conf configuration
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf)
  throws IOException {
    UserProvider provider = UserProvider.instantiate(conf);
    return createConnection(conf, false, null, provider.getCurrent());
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * ExecutorService pool = ...;
   * HConnection connection = HConnectionManager.createConnection(conf, pool);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }
   * </pre>
   * @param conf configuration
   * @param pool the thread pool to use for batch operation in HTables used via this HConnection
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf, ExecutorService pool)
  throws IOException {
    UserProvider provider = UserProvider.instantiate(conf);
    return createConnection(conf, false, pool, provider.getCurrent());
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * HConnection connection = HConnectionManager.createConnection(conf, user);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }
   * </pre>
   * @param conf configuration
   * @param user the user the connection is for
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf, User user)
  throws IOException {
    return createConnection(conf, false, null, user);
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * ExecutorService pool = ...;
   * HConnection connection = HConnectionManager.createConnection(conf, pool, user);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }
   * </pre>
   * @param conf configuration
   * @param pool the thread pool to use for batch operation in HTables used via this HConnection
   * @param user the user the connection is for
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf, ExecutorService pool, User user)
  throws IOException {
    return createConnection(conf, false, pool, user);
  }

386   @Deprecated
387   static HConnection createConnection(final Configuration conf, final boolean managed)
388       throws IOException {
389     UserProvider provider = UserProvider.instantiate(conf);
390     return createConnection(conf, managed, null, provider.getCurrent());
391   }
392 
393   @Deprecated
394   static HConnection createConnection(final Configuration conf, final boolean managed,
395       final ExecutorService pool, final User user)
396   throws IOException {
397     String className = conf.get("hbase.client.connection.impl",
398       HConnectionManager.HConnectionImplementation.class.getName());
399     Class<?> clazz = null;
400     try {
401       clazz = Class.forName(className);
402     } catch (ClassNotFoundException e) {
403       throw new IOException(e);
404     }
405     try {
406       // Default HCM#HCI is not accessible; make it so before invoking.
407       Constructor<?> constructor =
408         clazz.getDeclaredConstructor(Configuration.class,
409           boolean.class, ExecutorService.class, User.class);
410       constructor.setAccessible(true);
411       return (HConnection) constructor.newInstance(conf, managed, pool, user);
412     } catch (Exception e) {
413       throw new IOException(e);
414     }
415   }
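
  /* The connection implementation is pluggable via the
   * "hbase.client.connection.impl" key read above. A hypothetical override
   * (org.example.MyConnection is an illustration, not a class shipped with
   * HBase) might look like:
   *
   *   conf.set("hbase.client.connection.impl", "org.example.MyConnection");
   *   HConnection connection = HConnectionManager.createConnection(conf);
   *
   * The named class must declare a (Configuration, boolean, ExecutorService,
   * User) constructor, since that is the signature reflectively looked up above.
   */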

  /**
   * Delete connection information for the instance specified by the passed configuration.
   * If there are no more references to the designated connection, this method will
   * then close the connection to the zookeeper ensemble and let go of all associated resources.
   *
   * @param conf configuration whose identity is used to find the {@link HConnection} instance.
   * @deprecated
   */
  public static void deleteConnection(Configuration conf) {
    deleteConnection(new HConnectionKey(conf), false);
  }

  /**
   * Cleanup a known stale connection.
   * This will then close the connection to the zookeeper ensemble and let go of all resources.
   *
   * @param connection
   * @deprecated
   */
  public static void deleteStaleConnection(HConnection connection) {
    deleteConnection(connection, true);
  }

  /**
   * Delete information for all connections. Whether a connection is actually closed
   * depends on the staleConnection boolean and its ref count. By default, you should
   * call this with staleConnection set to true.
   * @deprecated
   */
  public static void deleteAllConnections(boolean staleConnection) {
    synchronized (CONNECTION_INSTANCES) {
      Set<HConnectionKey> connectionKeys = new HashSet<HConnectionKey>();
      connectionKeys.addAll(CONNECTION_INSTANCES.keySet());
      for (HConnectionKey connectionKey : connectionKeys) {
        deleteConnection(connectionKey, staleConnection);
      }
      CONNECTION_INSTANCES.clear();
    }
  }

  /**
   * Delete information for all connections.
   * @deprecated kept for backward compatibility, but the behavior is broken. HBASE-8983
   */
  @Deprecated
  public static void deleteAllConnections() {
    deleteAllConnections(false);
  }


  @Deprecated
  private static void deleteConnection(HConnection connection, boolean staleConnection) {
    synchronized (CONNECTION_INSTANCES) {
      for (Entry<HConnectionKey, HConnectionImplementation> e: CONNECTION_INSTANCES.entrySet()) {
        if (e.getValue() == connection) {
          deleteConnection(e.getKey(), staleConnection);
          break;
        }
      }
    }
  }

  @Deprecated
  private static void deleteConnection(HConnectionKey connectionKey, boolean staleConnection) {
    synchronized (CONNECTION_INSTANCES) {
      HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
      if (connection != null) {
        connection.decCount();
        if (connection.isZeroReference() || staleConnection) {
          CONNECTION_INSTANCES.remove(connectionKey);
          connection.internalClose();
        }
      } else {
        LOG.error("Connection not found in the list, can't delete it " +
          "(connection key=" + connectionKey + "). Maybe the key was modified?", new Exception());
      }
    }
  }

  /**
   * Provided for unit test cases which verify the behavior of the region
   * location cache prefetch.
   * @return Number of cached regions for the table.
   * @throws ZooKeeperConnectionException
   */
  static int getCachedRegionCount(Configuration conf, final TableName tableName)
  throws IOException {
    return execute(new HConnectable<Integer>(conf) {
      @Override
      public Integer connect(HConnection connection) {
        return ((HConnectionImplementation)connection).getNumberOfCachedRegionLocations(tableName);
      }
    });
  }

  /**
   * This convenience method invokes the given {@link HConnectable#connect}
   * implementation using a {@link HConnection} instance that lasts just for the
   * duration of the invocation.
   *
   * @param <T> the return type of the connect method
   * @param connectable the {@link HConnectable} instance
   * @return the value returned by the connect method
   * @throws IOException
   */
  @InterfaceAudience.Private
  public static <T> T execute(HConnectable<T> connectable) throws IOException {
    if (connectable == null || connectable.conf == null) {
      return null;
    }
    Configuration conf = connectable.conf;
    HConnection connection = HConnectionManager.getConnection(conf);
    boolean connectSucceeded = false;
    try {
      T returnValue = connectable.connect(connection);
      connectSucceeded = true;
      return returnValue;
    } finally {
      try {
        connection.close();
      } catch (Exception e) {
        ExceptionUtil.rethrowIfInterrupt(e);
        if (connectSucceeded) {
          throw new IOException("The connection to " + connection
              + " could not be closed.", e);
        }
      }
    }
  }
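
  /* A short usage sketch for execute(...); the table name "t1" is an
   * illustrative assumption:
   *
   *   boolean enabled = HConnectionManager.execute(new HConnectable<Boolean>(conf) {
   *     @Override
   *     public Boolean connect(HConnection connection) throws IOException {
   *       return connection.isTableEnabled(TableName.valueOf("t1"));
   *     }
   *   });
   *
   * The connection lasts only for the duration of connect(...); execute(...)
   * closes it in the finally block above.
   */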

  /** Encapsulates connection to zookeeper and regionservers.*/
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
      justification="Access to the concurrent hash map is under a lock so should be fine.")
  public static class HConnectionImplementation implements HConnection, Closeable {
    static final Log LOG = LogFactory.getLog(HConnectionImplementation.class);
    private final long pause;
    private final int numTries;
    final int rpcTimeout;
    private NonceGenerator nonceGenerator = null;
    private final boolean usePrefetch;
    private final int prefetchRegionLimit;

    private volatile boolean closed;
    private volatile boolean aborted;

    // package protected for the tests
    ClusterStatusListener clusterStatusListener;

    private final Object userRegionLock = new Object();

    // We have a single lock for master & zk to prevent deadlocks. Having
    //  one lock for ZK and one lock for master is not possible:
    //  When creating a connection to master, we need a connection to ZK to get
    //  its address. But another thread could have taken the ZK lock, and could
    //  be waiting for the master lock => deadlock.
    private final Object masterAndZKLock = new Object();

    private long keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
    private final DelayedClosing delayedClosing =
      DelayedClosing.createAndStart(this);

    // thread executor shared by all HTableInterface instances created
    // by this connection
    private volatile ExecutorService batchPool = null;
    private volatile boolean cleanupPool = false;

    private final Configuration conf;

    // Client rpc instance.
    private RpcClient rpcClient;

    /**
     * Map of table to table {@link HRegionLocation}s.
     */
    private final ConcurrentMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>
        cachedRegionLocations =
      new ConcurrentHashMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>();

    // The presence of a server in the map implies it's likely that there is an
    // entry in cachedRegionLocations that maps to this server; but the absence
    // of a server in this map guarantees that there is no entry in the cache that
    // maps to the absent server.
    // The access to this attribute must be protected by a lock on cachedRegionLocations
    private final Set<ServerName> cachedServers = new ConcurrentSkipListSet<ServerName>();

    // region cache prefetch is enabled by default. this set contains all
    // tables whose region cache prefetch is disabled.
    private final Set<Integer> regionCachePrefetchDisabledTables =
      new CopyOnWriteArraySet<Integer>();

    private int refCount;

    // indicates whether this connection's life cycle is managed (by us)
    private boolean managed;

    private User user;

    /**
     * Cluster registry of basic info such as clusterid and meta region location.
     */
    Registry registry;

    HConnectionImplementation(Configuration conf, boolean managed) throws IOException {
      this(conf, managed, null, null);
    }

    /**
     * constructor
     * @param conf Configuration object
     * @param managed If true, does not do full shutdown on close; i.e. cleanup of connection
     * to zk and shutdown of all services; we just close down the resources this connection was
     * responsible for and decrement usage counters.  It is up to the caller to do the full
     * cleanup.  It is set when we want to have connection sharing going on -- reuse of zk
     * connection, and cached region locations, established regionserver connections, etc.  When
     * connections are shared, we have reference counting going on and will only do full cleanup
     * when there are no more users of an HConnectionImplementation instance.
     */
    HConnectionImplementation(Configuration conf, boolean managed,
        ExecutorService pool, User user) throws IOException {
      this(conf);
      this.user = user;
      this.batchPool = pool;
      this.managed = managed;
      this.registry = setupRegistry();
      retrieveClusterId();

      this.rpcClient = new RpcClient(this.conf, this.clusterId);

      // Do we publish the status?
      boolean shouldListen = conf.getBoolean(HConstants.STATUS_PUBLISHED,
          HConstants.STATUS_PUBLISHED_DEFAULT);
      Class<? extends ClusterStatusListener.Listener> listenerClass =
          conf.getClass(ClusterStatusListener.STATUS_LISTENER_CLASS,
              ClusterStatusListener.DEFAULT_STATUS_LISTENER_CLASS,
              ClusterStatusListener.Listener.class);
      if (shouldListen) {
        if (listenerClass == null) {
          LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
              ClusterStatusListener.STATUS_LISTENER_CLASS + " is not set - not listening for " +
              "status");
        } else {
          clusterStatusListener = new ClusterStatusListener(
              new ClusterStatusListener.DeadServerHandler() {
                @Override
                public void newDead(ServerName sn) {
                  clearCaches(sn);
                  rpcClient.cancelConnections(sn.getHostname(), sn.getPort(),
                      new SocketException(sn.getServerName() +
                          " is dead: closing its connection."));
                }
              }, conf, listenerClass);
        }
      }
    }

    /** Dummy nonce generator for disabled nonces. */
    private static class NoNonceGenerator implements NonceGenerator {
      @Override
      public long getNonceGroup() {
        return HConstants.NO_NONCE;
      }
      @Override
      public long newNonce() {
        return HConstants.NO_NONCE;
      }
    }

    /**
     * For tests.
     */
    protected HConnectionImplementation(Configuration conf) {
      this.conf = conf;
      this.closed = false;
      this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
          HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
      this.numTries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
          HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
      this.rpcTimeout = conf.getInt(
          HConstants.HBASE_RPC_TIMEOUT_KEY,
          HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
      if (conf.getBoolean(CLIENT_NONCES_ENABLED_KEY, true)) {
        synchronized (HConnectionManager.nonceGeneratorCreateLock) {
          if (HConnectionManager.nonceGenerator == null) {
            HConnectionManager.nonceGenerator = new PerClientRandomNonceGenerator();
          }
          this.nonceGenerator = HConnectionManager.nonceGenerator;
        }
      } else {
        this.nonceGenerator = new NoNonceGenerator();
      }

      this.usePrefetch = conf.getBoolean(HConstants.HBASE_CLIENT_PREFETCH,
          HConstants.DEFAULT_HBASE_CLIENT_PREFETCH);
      this.prefetchRegionLimit = conf.getInt(
          HConstants.HBASE_CLIENT_PREFETCH_LIMIT,
          HConstants.DEFAULT_HBASE_CLIENT_PREFETCH_LIMIT);
    }
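
    /* The constructor above reads its tunables from the passed Configuration.
     * A sketch of overriding them before creating a connection (the values
     * are illustrative, not recommendations):
     *
     *   Configuration conf = HBaseConfiguration.create();
     *   conf.setLong(HConstants.HBASE_CLIENT_PAUSE, 200);        // ms between retries
     *   conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 10); // retry budget
     *   conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 30000);    // per-RPC timeout, ms
     *   conf.setBoolean("hbase.client.nonces.enabled", false);   // CLIENT_NONCES_ENABLED_KEY
     *   HConnection connection = HConnectionManager.createConnection(conf);
     */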

    @Override
    public HTableInterface getTable(String tableName) throws IOException {
      return getTable(TableName.valueOf(tableName));
    }

    @Override
    public HTableInterface getTable(byte[] tableName) throws IOException {
      return getTable(TableName.valueOf(tableName));
    }

    @Override
    public HTableInterface getTable(TableName tableName) throws IOException {
      return getTable(tableName, getBatchPool());
    }

    @Override
    public HTableInterface getTable(String tableName, ExecutorService pool) throws IOException {
      return getTable(TableName.valueOf(tableName), pool);
    }

    @Override
    public HTableInterface getTable(byte[] tableName, ExecutorService pool) throws IOException {
      return getTable(TableName.valueOf(tableName), pool);
    }

    @Override
    public HTableInterface getTable(TableName tableName, ExecutorService pool) throws IOException {
      if (managed) {
        throw new IOException("The connection has to be unmanaged.");
      }
      return new HTable(tableName, this, pool);
    }

    private ExecutorService getBatchPool() {
      if (batchPool == null) {
        // shared HTable thread executor not yet initialized
        synchronized (this) {
          if (batchPool == null) {
            int maxThreads = conf.getInt("hbase.hconnection.threads.max", 256);
            int coreThreads = conf.getInt("hbase.hconnection.threads.core", 256);
            if (maxThreads == 0) {
              maxThreads = Runtime.getRuntime().availableProcessors() * 8;
            }
            if (coreThreads == 0) {
              coreThreads = Runtime.getRuntime().availableProcessors() * 8;
            }
            long keepAliveTime = conf.getLong("hbase.hconnection.threads.keepalivetime", 60);
            LinkedBlockingQueue<Runnable> workQueue =
              new LinkedBlockingQueue<Runnable>(maxThreads *
                conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
                  HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
            ThreadPoolExecutor tpe = new ThreadPoolExecutor(
                coreThreads,
                maxThreads,
                keepAliveTime,
                TimeUnit.SECONDS,
                workQueue,
                Threads.newDaemonThreadFactory(toString() + "-shared-"));
            tpe.allowCoreThreadTimeOut(true);
            this.batchPool = tpe;
          }
          this.cleanupPool = true;
        }
      }
      return this.batchPool;
    }
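
    /* The shared pool built above is sized from configuration; a sketch of
     * shaping it with the keys getBatchPool() reads (values illustrative):
     *
     *   conf.setInt("hbase.hconnection.threads.max", 64);
     *   conf.setInt("hbase.hconnection.threads.core", 8);
     *   conf.setLong("hbase.hconnection.threads.keepalivetime", 60); // seconds
     *
     * Setting either threads value to 0 falls back to
     * 8 * Runtime.getRuntime().availableProcessors(), as coded above.
     */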

    protected ExecutorService getCurrentBatchPool() {
      return batchPool;
    }

    private void shutdownBatchPool() {
      if (this.cleanupPool && this.batchPool != null && !this.batchPool.isShutdown()) {
        this.batchPool.shutdown();
        try {
          if (!this.batchPool.awaitTermination(10, TimeUnit.SECONDS)) {
            this.batchPool.shutdownNow();
          }
        } catch (InterruptedException e) {
          this.batchPool.shutdownNow();
        }
      }
    }

    /**
     * @return The cluster registry implementation to use.
     * @throws IOException
     */
    private Registry setupRegistry() throws IOException {
      String registryClass = this.conf.get("hbase.client.registry.impl",
        ZooKeeperRegistry.class.getName());
      Registry registry = null;
      try {
        registry = (Registry)Class.forName(registryClass).newInstance();
      } catch (Throwable t) {
        throw new IOException(t);
      }
      registry.init(this);
      return registry;
    }
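
    /* The registry is pluggable in the same way as the connection class; a
     * hypothetical override (org.example.MyRegistry is illustrative only and
     * must implement Registry with a no-arg constructor, since it is created
     * via newInstance() above):
     *
     *   conf.set("hbase.client.registry.impl", "org.example.MyRegistry");
     *
     * By default ZooKeeperRegistry is used, which reads the cluster id and
     * meta region location from ZooKeeper.
     */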

    /**
     * For tests only.
     * @param rpcClient Client we should use instead.
     * @return Previous rpcClient
     */
    RpcClient setRpcClient(final RpcClient rpcClient) {
      RpcClient oldRpcClient = this.rpcClient;
      this.rpcClient = rpcClient;
      return oldRpcClient;
    }

    /**
     * @return an identifier that will remain the same for a given connection.
     */
    @Override
    public String toString() {
      return "hconnection-0x" + Integer.toHexString(hashCode());
    }

    protected String clusterId = null;

    void retrieveClusterId() {
      if (clusterId != null) return;
      this.clusterId = this.registry.getClusterId();
      if (clusterId == null) {
        clusterId = HConstants.CLUSTER_ID_DEFAULT;
        LOG.debug("clusterid came back null, using default " + clusterId);
      }
    }

    @Override
    public Configuration getConfiguration() {
      return this.conf;
    }

    private void checkIfBaseNodeAvailable(ZooKeeperWatcher zkw)
      throws MasterNotRunningException {
      String errorMsg;
      try {
        if (ZKUtil.checkExists(zkw, zkw.baseZNode) == -1) {
          errorMsg = "The node " + zkw.baseZNode + " is not in ZooKeeper. "
            + "It should have been written by the master. "
            + "Check the value configured in 'zookeeper.znode.parent'. "
            + "There could be a mismatch with the one configured in the master.";
          LOG.error(errorMsg);
          throw new MasterNotRunningException(errorMsg);
        }
      } catch (KeeperException e) {
        errorMsg = "Can't get connection to ZooKeeper: " + e.getMessage();
        LOG.error(errorMsg);
        throw new MasterNotRunningException(errorMsg, e);
      }
    }

    /**
     * @return true if the master is running; throws an exception otherwise
     * @throws MasterNotRunningException - if the master is not running
     * @throws ZooKeeperConnectionException
     */
    @Override
    public boolean isMasterRunning()
    throws MasterNotRunningException, ZooKeeperConnectionException {
      // When getting the master connection, we check that it's running,
      // so if there is no exception, it means we've been able to get a
      // connection to a running master
      MasterKeepAliveConnection m = getKeepAliveMasterService();
      m.close();
      return true;
    }
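
    /* Note that isMasterRunning() never returns false: it either returns true
     * or throws. A sketch of probing the master accordingly:
     *
     *   try {
     *     connection.isMasterRunning();
     *     // master reachable
     *   } catch (MasterNotRunningException e) {
     *     // master is not running
     *   } catch (ZooKeeperConnectionException e) {
     *     // could not reach ZooKeeper to find the master
     *   }
     */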

    @Override
    public HRegionLocation getRegionLocation(final TableName tableName,
        final byte [] row, boolean reload)
    throws IOException {
      return reload ? relocateRegion(tableName, row) : locateRegion(tableName, row);
    }

    @Override
    public HRegionLocation getRegionLocation(final byte[] tableName,
        final byte [] row, boolean reload)
    throws IOException {
      return getRegionLocation(TableName.valueOf(tableName), row, reload);
    }

    @Override
    public boolean isTableEnabled(TableName tableName) throws IOException {
      return this.registry.isTableOnlineState(tableName, true);
    }

    @Override
    public boolean isTableEnabled(byte[] tableName) throws IOException {
      return isTableEnabled(TableName.valueOf(tableName));
    }

    @Override
    public boolean isTableDisabled(TableName tableName) throws IOException {
      return this.registry.isTableOnlineState(tableName, false);
    }

    @Override
    public boolean isTableDisabled(byte[] tableName) throws IOException {
      return isTableDisabled(TableName.valueOf(tableName));
    }

    @Override
    public boolean isTableAvailable(final TableName tableName) throws IOException {
      final AtomicBoolean available = new AtomicBoolean(true);
      final AtomicInteger regionCount = new AtomicInteger(0);
      MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
        @Override
        public boolean processRow(Result row) throws IOException {
          HRegionInfo info = MetaScanner.getHRegionInfo(row);
          if (info != null && !info.isSplitParent()) {
            if (tableName.equals(info.getTable())) {
              ServerName server = HRegionInfo.getServerName(row);
              if (server == null) {
                available.set(false);
                return false;
              }
              regionCount.incrementAndGet();
            } else if (tableName.compareTo(info.getTable()) < 0) {
              // Return if we are done with the current table
              return false;
            }
          }
          return true;
        }
      };
      MetaScanner.metaScan(conf, this, visitor, tableName);
      return available.get() && (regionCount.get() > 0);
    }

    @Override
    public boolean isTableAvailable(final byte[] tableName) throws IOException {
      return isTableAvailable(TableName.valueOf(tableName));
    }

    @Override
    public boolean isTableAvailable(final TableName tableName, final byte[][] splitKeys)
        throws IOException {
      final AtomicBoolean available = new AtomicBoolean(true);
      final AtomicInteger regionCount = new AtomicInteger(0);
      MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
        @Override
        public boolean processRow(Result row) throws IOException {
          HRegionInfo info = MetaScanner.getHRegionInfo(row);
          if (info != null && !info.isSplitParent()) {
            if (tableName.equals(info.getTable())) {
              ServerName server = HRegionInfo.getServerName(row);
              if (server == null) {
                available.set(false);
                return false;
              }
              if (!Bytes.equals(info.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
                for (byte[] splitKey : splitKeys) {
                  // Just check if the splitkey is available
                  if (Bytes.equals(info.getStartKey(), splitKey)) {
                    regionCount.incrementAndGet();
                    break;
                  }
                }
              } else {
                // The empty start row should always be counted
                regionCount.incrementAndGet();
              }
            } else if (tableName.compareTo(info.getTable()) < 0) {
              // Return if we are done with the current table
              return false;
            }
          }
          return true;
        }
      };
      MetaScanner.metaScan(conf, this, visitor, tableName);
      // +1 needs to be added so that the empty start row is also taken into account
      return available.get() && (regionCount.get() == splitKeys.length + 1);
    }
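
    /* Worked example for the "+1" above, under the assumption splitKeys =
     * {"b", "c"}: a fully available table then has three regions,
     * ["", "b"), ["b", "c") and ["c", ""). The visitor counts the
     * empty-start-key region plus one region per split key, so the expected
     * count is splitKeys.length + 1 == 3.
     */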

    @Override
    public boolean isTableAvailable(final byte[] tableName, final byte[][] splitKeys)
        throws IOException {
      return isTableAvailable(TableName.valueOf(tableName), splitKeys);
    }

    @Override
    public HRegionLocation locateRegion(final byte[] regionName) throws IOException {
      return locateRegion(HRegionInfo.getTable(regionName),
          HRegionInfo.getStartKey(regionName), false, true);
    }

    @Override
    public boolean isDeadServer(ServerName sn) {
      if (clusterStatusListener == null) {
        return false;
      } else {
        return clusterStatusListener.isDeadServer(sn);
      }
    }

    @Override
    public List<HRegionLocation> locateRegions(final TableName tableName)
    throws IOException {
      return locateRegions(tableName, false, true);
    }

    @Override
    public List<HRegionLocation> locateRegions(final byte[] tableName)
    throws IOException {
      return locateRegions(TableName.valueOf(tableName));
    }

    @Override
    public List<HRegionLocation> locateRegions(final TableName tableName,
        final boolean useCache, final boolean offlined) throws IOException {
      NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(conf, this,
          tableName, offlined);
      final List<HRegionLocation> locations = new ArrayList<HRegionLocation>();
      for (HRegionInfo regionInfo : regions.keySet()) {
        locations.add(locateRegion(tableName, regionInfo.getStartKey(), useCache, true));
      }
      return locations;
    }

    @Override
    public List<HRegionLocation> locateRegions(final byte[] tableName,
        final boolean useCache, final boolean offlined) throws IOException {
      return locateRegions(TableName.valueOf(tableName), useCache, offlined);
    }

    @Override
    public HRegionLocation locateRegion(final TableName tableName,
        final byte [] row)
    throws IOException {
      return locateRegion(tableName, row, true, true);
    }

    @Override
    public HRegionLocation locateRegion(final byte[] tableName,
        final byte [] row)
    throws IOException {
      return locateRegion(TableName.valueOf(tableName), row);
    }

    @Override
    public HRegionLocation relocateRegion(final TableName tableName,
        final byte [] row) throws IOException {
      // Since this is an explicit request not to use any caching, finding
      // cached locations for disabled tables is not desirable. This check ensures
      // that an exception is thrown the first time a disabled table is interacted with.
      if (isTableDisabled(tableName)) {
        throw new TableNotEnabledException(tableName.getNameAsString() + " is disabled.");
      }

      return locateRegion(tableName, row, false, true);
    }

    @Override
    public HRegionLocation relocateRegion(final byte[] tableName,
        final byte [] row) throws IOException {
      return relocateRegion(TableName.valueOf(tableName), row);
    }


    private HRegionLocation locateRegion(final TableName tableName,
      final byte [] row, boolean useCache, boolean retry)
    throws IOException {
      if (this.closed) throw new IOException(toString() + " closed");
      if (tableName == null || tableName.getName().length == 0) {
        throw new IllegalArgumentException(
            "table name cannot be null or zero length");
      }

      if (tableName.equals(TableName.META_TABLE_NAME)) {
        return this.registry.getMetaRegionLocation();
      } else {
        // Region not in the cache - have to go to the meta RS
        return locateRegionInMeta(TableName.META_TABLE_NAME, tableName, row,
          useCache, userRegionLock, retry);
      }
    }

    /*
     * Search hbase:meta for the HRegionLocation info that contains the table and
     * row we're seeking. It will prefetch a certain number of region entries and
     * save them to the global region cache.
     */
    private void prefetchRegionCache(final TableName tableName,
        final byte[] row) {
      // Implement a new visitor for MetaScanner, and use it to walk through
      // the hbase:meta
      MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
        @Override
        public boolean processRow(Result result) throws IOException {
          try {
            HRegionInfo regionInfo = MetaScanner.getHRegionInfo(result);
            if (regionInfo == null) {
              return true;
            }

            // possible we got a region of a different table...
            if (!regionInfo.getTable().equals(tableName)) {
              return false; // stop scanning
            }
            if (regionInfo.isOffline()) {
              // don't cache offline regions
              return true;
            }

            ServerName serverName = HRegionInfo.getServerName(result);
            if (serverName == null) {
              return true; // don't cache it
            }
            // instantiate the location
            long seqNum = HRegionInfo.getSeqNumDuringOpen(result);
            HRegionLocation loc = new HRegionLocation(regionInfo, serverName, seqNum);
            // cache this meta entry
            cacheLocation(tableName, null, loc);
            return true;
          } catch (RuntimeException e) {
            throw new IOException(e);
          }
        }
      };
      try {
        // pre-fetch a certain number of region entries into the region cache.
        MetaScanner.metaScan(conf, this, visitor, tableName, row,
            this.prefetchRegionLimit, TableName.META_TABLE_NAME);
      } catch (IOException e) {
        if (ExceptionUtil.isInterrupt(e)) {
          Thread.currentThread().interrupt();
        }
      }
    }

    /*
     * Search the hbase:meta table for the HRegionLocation
     * info that contains the table and row we're seeking.
     */
    private HRegionLocation locateRegionInMeta(final TableName parentTable,
      final TableName tableName, final byte [] row, boolean useCache,
      Object regionLockObject, boolean retry)
    throws IOException {
      HRegionLocation location;
      // If we are supposed to be using the cache, look in the cache to see if
      // we already have the region.
      if (useCache) {
        location = getCachedLocation(tableName, row);
        if (location != null) {
          return location;
        }
      }
      int localNumRetries = retry ? numTries : 1;
      // build the key of the meta region we should be looking for.
      // the extra 9's on the end are necessary to allow "exact" matches
      // without knowing the precise region names.
      byte [] metaKey = HRegionInfo.createRegionName(tableName, row,
        HConstants.NINES, false);
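      // Worked example for the trailing 9's (illustrative): for table "t" and
      // row "r" the key is roughly "t,r,99999999999999". Real region names end
      // in a timestamp-based region id, so this key sorts just after the entry
      // for the region containing the row, and the getRowOrBefore() call below
      // lands on that region's hbase:meta row.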
1172       for (int tries = 0; true; tries++) {
1173         if (tries >= localNumRetries) {
1174           throw new NoServerForRegionException("Unable to find region for "
1175             + Bytes.toStringBinary(row) + " after " + numTries + " tries.");
1176         }
1177 
1178         HRegionLocation metaLocation = null;
1179         try {
1180           // locate the meta region
1181           metaLocation = locateRegion(parentTable, metaKey, true, false);
1182           // If null still, go around again.
1183           if (metaLocation == null) continue;
1184           ClientService.BlockingInterface service = getClient(metaLocation.getServerName());
1185 
1186           Result regionInfoRow;
1187           // This block guards against two threads trying to load the meta
1188           // region at the same time. The first will load the meta region and
1189           // the second will use the value that the first one found.
1190           if (useCache) {
1191             if (TableName.META_TABLE_NAME.equals(parentTable) && usePrefetch &&
1192                 getRegionCachePrefetch(tableName)) {
1193               synchronized (regionLockObject) {
1194                 // Check the cache again for a hit in case some other thread made the
1195                 // same query while we were waiting on the lock.
1196                 location = getCachedLocation(tableName, row);
1197                 if (location != null) {
1198                   return location;
1199                 }
1200                 // If the parent table is META, we may want to pre-fetch some
1201                 // region info into the global region cache for this table.
1202                 prefetchRegionCache(tableName, row);
1203               }
1204             }
1205             location = getCachedLocation(tableName, row);
1206             if (location != null) {
1207               return location;
1208             }
1209           } else {
1210             // If we are not supposed to be using the cache, delete any existing cached location
1211             // so it won't interfere.
1212             forceDeleteCachedLocation(tableName, row);
1213           }
1214 
1215           // Query the meta region for the location of the meta region
1216           regionInfoRow =
1217               ProtobufUtil.getRowOrBefore(service, metaLocation.getRegionInfo().getRegionName(),
1218                 metaKey, HConstants.CATALOG_FAMILY);
1219 
1220           if (regionInfoRow == null) {
1221             throw new TableNotFoundException(tableName);
1222           }
1223 
1224           // convert the row result into the HRegionLocation we need!
1225           HRegionInfo regionInfo = MetaScanner.getHRegionInfo(regionInfoRow);
1226           if (regionInfo == null) {
1227             throw new IOException("HRegionInfo was null or empty in " +
1228               parentTable + ", row=" + regionInfoRow);
1229           }
1230 
1231           // possible we got a region of a different table...
1232           if (!regionInfo.getTable().equals(tableName)) {
1233             throw new TableNotFoundException(
1234                   "Table '" + tableName + "' was not found, got: " +
1235                   regionInfo.getTable() + ".");
1236           }
1237           if (regionInfo.isSplit()) {
1238             throw new RegionOfflineException("the only available region for" +
1239               " the required row is a split parent," +
1240               " the daughters should be online soon: " +
1241               regionInfo.getRegionNameAsString());
1242           }
1243           if (regionInfo.isOffline()) {
1244             throw new RegionOfflineException("the region is offline, could" +
1245               " be caused by a disable table call: " +
1246               regionInfo.getRegionNameAsString());
1247           }
1248 
1249           ServerName serverName = HRegionInfo.getServerName(regionInfoRow);
1250           if (serverName == null) {
1251             throw new NoServerForRegionException("No server address listed " +
1252               "in " + parentTable + " for region " +
1253               regionInfo.getRegionNameAsString() + " containing row " +
1254               Bytes.toStringBinary(row));
1255           }
1256 
1257           if (isDeadServer(serverName)){
1258             throw new RegionServerStoppedException("hbase:meta says the region "+
1259                 regionInfo.getRegionNameAsString()+" is managed by the server " + serverName +
1260                 ", but it is dead.");
1261           }
1262 
1263           // Instantiate the location
1264           location = new HRegionLocation(regionInfo, serverName,
1265             HRegionInfo.getSeqNumDuringOpen(regionInfoRow));
1266           cacheLocation(tableName, null, location);
1267           return location;
1268         } catch (TableNotFoundException e) {
1269           // if we got this error, probably means the table just plain doesn't
1270           // exist. rethrow the error immediately. this should always be coming
1271           // from the HTable constructor.
1272           throw e;
1273         } catch (IOException e) {
1274           ExceptionUtil.rethrowIfInterrupt(e);
1275 
1276           if (e instanceof RemoteException) {
1277             e = ((RemoteException)e).unwrapRemoteException();
1278           }
1279           if (tries < numTries - 1) {
1280             if (LOG.isDebugEnabled()) {
1281               LOG.debug("locateRegionInMeta parentTable=" +
1282                 parentTable + ", metaLocation=" +
1283                 ((metaLocation == null)? "null": "{" + metaLocation + "}") +
1284                 ", attempt=" + tries + " of " +
1285                 this.numTries + " failed; retrying after sleep of " +
1286                 ConnectionUtils.getPauseTime(this.pause, tries) + " because: " + e.getMessage());
1287             }
1288           } else {
1289             throw e;
1290           }
1291           // Only relocate the parent region if necessary
1292           if(!(e instanceof RegionOfflineException ||
1293               e instanceof NoServerForRegionException)) {
1294             relocateRegion(parentTable, metaKey);
1295           }
1296         }
1297         try {
1298           Thread.sleep(ConnectionUtils.getPauseTime(this.pause, tries));
1299         } catch (InterruptedException e) {
1300           throw new InterruptedIOException("Giving up trying to locate region in " +
1301             "meta: thread is interrupted.");
1302         }
1303       }
1304     }
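         // Editor's illustration (not in the original source): the sleep above uses
         // ConnectionUtils.getPauseTime(pause, tries), which multiplies the base
         // pause by a per-attempt factor from a growing backoff table, so retries
         // back off roughly exponentially. A hypothetical probe of the schedule:
         //
         //   for (int t = 0; t < numTries; t++) {
         //     LOG.debug("attempt " + t + " would sleep "
         //         + ConnectionUtils.getPauseTime(this.pause, t) + "ms");
         //   }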
1305 
1306     /*
1307      * Search the cache for a location that fits our table and row key.
1308      * Return null if no suitable region is located.
1309      *
1310      * @param tableName table to look in
1311      * @param row row key to locate
1312      * @return Null or region location found in cache.
1313      */
1314     HRegionLocation getCachedLocation(final TableName tableName,
1315         final byte [] row) {
1316       ConcurrentSkipListMap<byte[], HRegionLocation> tableLocations =
1317         getTableLocations(tableName);
1318 
1319       Entry<byte[], HRegionLocation> e = tableLocations.floorEntry(row);
1320       if (e == null) {
1321         return null;
1322       }
1323       HRegionLocation possibleRegion = e.getValue();
1324 
1325       // make sure that the end key is greater than the row we're looking
1326       // for, otherwise the row actually belongs in the next region, not
1327       // this one. the exception case is when the endkey is
1328       // HConstants.EMPTY_END_ROW, signifying that the region we're
1329       // checking is actually the last region in the table.
1330       byte[] endKey = possibleRegion.getRegionInfo().getEndKey();
1331       if (Bytes.equals(endKey, HConstants.EMPTY_END_ROW) ||
1332           tableName.getRowComparator().compareRows(
1333               endKey, 0, endKey.length, row, 0, row.length) > 0) {
1334         return possibleRegion;
1335       }
1336 
1337       // Passed all the way through, so we got nothing - complete cache miss
1338       return null;
1339     }
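         // Editor's illustration (not in the original source): why floorEntry()
         // plus the end-key check answers "which cached region contains this row".
         // Regions are keyed by start key, so the greatest start key <= row is the
         // only candidate; it is a hit only if its end key sorts after the row, or
         // is empty (the last region in the table). A hypothetical standalone example:
         //
         //   ConcurrentSkipListMap<byte[], String> regions =
         //       new ConcurrentSkipListMap<byte[], String>(Bytes.BYTES_COMPARATOR);
         //   regions.put(Bytes.toBytes("a"), "region [a, m)");
         //   regions.put(Bytes.toBytes("m"), "region [m, +inf)");
         //   regions.floorEntry(Bytes.toBytes("k"));  // candidate "region [a, m)";
         //                                            // a hit because "m" > "k"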
1340 
1341     /**
1342      * Delete a cached location, no matter what it is. Called when we were told to not use cache.
1343      * @param tableName table name
1344      * @param row row key whose cached region location should be removed
1345      */
1346     void forceDeleteCachedLocation(final TableName tableName, final byte [] row) {
1347       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1348       // Start to examine the cache. We can only do cache actions
1349       // if there's something in the cache for this table.
1350       HRegionLocation rl = getCachedLocation(tableName, row);
1351       if (rl != null) {
1352         tableLocations.remove(rl.getRegionInfo().getStartKey());
1353         if (LOG.isDebugEnabled()) {
1354           LOG.debug("Removed " + rl.getHostname() + ":" + rl.getPort()
1355             + " as a location of " + rl.getRegionInfo().getRegionNameAsString() +
1356             " for tableName=" + tableName + " from cache");
1357         }
1358       }
1360     }
1361 
1362     /*
1363      * Delete all cached entries of a table that maps to a specific location.
1364      */
1365     @Override
1366     public void clearCaches(final ServerName serverName) {
1367       if (!this.cachedServers.contains(serverName)) {
1368         return;
1369       }
1370 
1371       boolean deletedSomething = false;
1372       synchronized (this.cachedServers) {
1373         // We block here, because if there is an error on a server, it's likely that multiple
1374         //  threads will get the error simultaneously. If there are hundreds of thousands of
1375         //  region locations to check, it's better to do this only once. A better pattern would
1376         //  be to check if the server is dead when we get the region location.
1377         if (!this.cachedServers.contains(serverName)) {
1378           return;
1379         }
1380         for (Map<byte[], HRegionLocation> tableLocations : cachedRegionLocations.values()) {
1381           for (Entry<byte[], HRegionLocation> e : tableLocations.entrySet()) {
1382             HRegionLocation value = e.getValue();
1383             if (value != null
1384                 && serverName.equals(value.getServerName())) {
1385               tableLocations.remove(e.getKey());
1386               deletedSomething = true;
1387             }
1388           }
1389         }
1390         this.cachedServers.remove(serverName);
1391       }
1392       if (deletedSomething && LOG.isDebugEnabled()) {
1393         LOG.debug("Removed all cached region locations that map to " + serverName);
1394       }
1395     }
1396 
1397     /*
1398      * @param tableName table whose cached locations we want
1399      * @return Map of cached locations for passed <code>tableName</code>
1400      */
1401     private ConcurrentSkipListMap<byte[], HRegionLocation> getTableLocations(
1402         final TableName tableName) {
1403       // find the map of cached locations for this table
1404       ConcurrentSkipListMap<byte[], HRegionLocation> result;
1405       result = this.cachedRegionLocations.get(tableName);
1406       // if tableLocations for this table isn't built yet, make one
1407       if (result == null) {
1408         result = new ConcurrentSkipListMap<byte[], HRegionLocation>(Bytes.BYTES_COMPARATOR);
1409         ConcurrentSkipListMap<byte[], HRegionLocation> old =
1410             this.cachedRegionLocations.putIfAbsent(tableName, result);
1411         if (old != null) {
1412           return old;
1413         }
1414       }
1415       return result;
1416     }
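         // Editor's illustration (not in the original source): the putIfAbsent
         // dance above is the standard lock-free lazy-init idiom for a map of
         // maps - build a fresh value, try to publish it, and defer to whichever
         // thread won the race. In generic form:
         //
         //   ConcurrentMap<String, List<String>> cm =
         //       new ConcurrentHashMap<String, List<String>>();
         //   List<String> fresh = new ArrayList<String>();
         //   List<String> prior = cm.putIfAbsent("key", fresh);
         //   List<String> winner = (prior != null) ? prior : fresh;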
1417 
1418     @Override
1419     public void clearRegionCache() {
1420       this.cachedRegionLocations.clear();
1421       this.cachedServers.clear();
1422     }
1423 
1424     @Override
1425     public void clearRegionCache(final TableName tableName) {
1426       this.cachedRegionLocations.remove(tableName);
1427     }
1428 
1429     @Override
1430     public void clearRegionCache(final byte[] tableName) {
1431       clearRegionCache(TableName.valueOf(tableName));
1432     }
1433 
1434     /**
1435      * Put a newly discovered HRegionLocation into the cache.
1436      * @param tableName The table name.
1437      * @param source the source of the new location, if it's not coming from meta
1438      * @param location the new location
1439      */
1440     private void cacheLocation(final TableName tableName, final HRegionLocation source,
1441         final HRegionLocation location) {
1442       boolean isFromMeta = (source == null);
1443       byte [] startKey = location.getRegionInfo().getStartKey();
1444       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1445       HRegionLocation oldLocation = tableLocations.putIfAbsent(startKey, location);
1446       boolean isNewCacheEntry = (oldLocation == null);
1447       if (isNewCacheEntry) {
1448         cachedServers.add(location.getServerName());
1449         return;
1450       }
1451       boolean updateCache;
1452       // If the server in cache sends us a redirect, assume it's always valid.
1453       if (oldLocation.equals(source)) {
1454         updateCache = true;
1455       } else {
1456         long newLocationSeqNum = location.getSeqNum();
1457         // Meta record is stale - some (probably the same) server has closed the region
1458         // with later seqNum and told us about the new location.
1459         boolean isStaleMetaRecord = isFromMeta && (oldLocation.getSeqNum() > newLocationSeqNum);
1460         // Same as above for redirect. However, in this case, if the number is equal to previous
1461         // record, the most common case is that first the region was closed with seqNum, and then
1462         // opened with the same seqNum; hence we will ignore the redirect.
1463         // There are so many corner cases with various combinations of opens and closes that
1464         // an additional counter on top of seqNum would be necessary to handle them all.
1465         boolean isStaleRedirect = !isFromMeta && (oldLocation.getSeqNum() >= newLocationSeqNum);
1466         boolean isStaleUpdate = (isStaleMetaRecord || isStaleRedirect);
1467         updateCache = (!isStaleUpdate);
1468       }
1469       if (updateCache) {
1470         tableLocations.replace(startKey, oldLocation, location);
1471         cachedServers.add(location.getServerName());
1472       }
1473     }
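         // Editor's summary (not in the original source): the acceptance rules
         // above, written out. "old" is the cached entry, "new" the candidate:
         //
         //   old.equals(source) (redirect from the cached server) -> always accept
         //   from meta, old.seqNum >  new.seqNum                  -> reject (stale meta row)
         //   from meta, old.seqNum <= new.seqNum                  -> accept
         //   redirect,  old.seqNum >= new.seqNum                  -> reject (a close/reopen
         //                                                           may reuse the seqNum)
         //   redirect,  old.seqNum <  new.seqNum                  -> accept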
1474 
1475     // Map keyed by service name + regionserver to service stub implementation
1476     private final ConcurrentHashMap<String, Object> stubs =
1477       new ConcurrentHashMap<String, Object>();
1478     // Map of locks used creating service stubs per regionserver.
1479     private final ConcurrentHashMap<String, String> connectionLock =
1480       new ConcurrentHashMap<String, String>();
1481 
1482     /**
1483      * State of the MasterService connection/setup.
1484      */
1485     static class MasterServiceState {
1486       HConnection connection;
1487       MasterService.BlockingInterface stub;
1488       int userCount;
1489       long keepAliveUntil = Long.MAX_VALUE;
1490 
1491       MasterServiceState (final HConnection connection) {
1492         super();
1493         this.connection = connection;
1494       }
1495 
1496       @Override
1497       public String toString() {
1498         return "MasterService";
1499       }
1500 
1501       Object getStub() {
1502         return this.stub;
1503       }
1504 
1505       void clearStub() {
1506         this.stub = null;
1507       }
1508 
1509       boolean isMasterRunning() throws ServiceException {
1510         IsMasterRunningResponse response =
1511           this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1512         return response != null? response.getIsMasterRunning(): false;
1513       }
1514     }
1515 
1516     /**
1517      * Makes a client-side stub for master services. Sub-class to specialize.
1518      * Depends on hosting class so not static.  Exists so we avoid duplicating a bunch of code
1519      * when setting up the MasterMonitorService and MasterAdminService.
1520      */
1521     abstract class StubMaker {
1522       /**
1523        * Returns the name of the service stub being created.
1524        */
1525       protected abstract String getServiceName();
1526 
1527       /**
1528        * Make stub and cache it internal so can be used later doing the isMasterRunning call.
1529        * @param channel
1530        */
1531       protected abstract Object makeStub(final BlockingRpcChannel channel);
1532 
1533       /**
1534        * Once set up, check that it works by making an isMasterRunning call.
1535        * @throws ServiceException
1536        */
1537       protected abstract void isMasterRunning() throws ServiceException;
1538 
1539       /**
1540        * Create a stub. Try once only.  It is not typed because there is no common type among
1541        * protobuf services or their interfaces.  Let the caller do appropriate casting.
1542        * @return A stub for master services.
1543        * @throws IOException
1544        * @throws KeeperException
1545        * @throws ServiceException
1546        */
1547       private Object makeStubNoRetries() throws IOException, KeeperException, ServiceException {
1548         ZooKeeperKeepAliveConnection zkw;
1549         try {
1550           zkw = getKeepAliveZooKeeperWatcher();
1551         } catch (IOException e) {
1552           ExceptionUtil.rethrowIfInterrupt(e);
1553           throw new ZooKeeperConnectionException("Can't connect to ZooKeeper", e);
1554         }
1555         try {
1556           checkIfBaseNodeAvailable(zkw);
1557           ServerName sn = MasterAddressTracker.getMasterAddress(zkw);
1558           if (sn == null) {
1559             String msg = "ZooKeeper available but no active master location found";
1560             LOG.info(msg);
1561             throw new MasterNotRunningException(msg);
1562           }
1563           if (isDeadServer(sn)) {
1564             throw new MasterNotRunningException(sn + " is dead.");
1565           }
1566           // Use the security info interface name as our stub key
1567           String key = getStubKey(getServiceName(), sn.getHostAndPort());
1568           connectionLock.putIfAbsent(key, key);
1569           Object stub = null;
1570           synchronized (connectionLock.get(key)) {
1571             stub = stubs.get(key);
1572             if (stub == null) {
1573               BlockingRpcChannel channel = rpcClient.createBlockingRpcChannel(sn,
1574                 user, rpcTimeout);
1575               stub = makeStub(channel);
1576               isMasterRunning();
1577               stubs.put(key, stub);
1578             }
1579           }
1580           return stub;
1581         } finally {
1582           zkw.close();
1583         }
1584       }
1585 
1586       /**
1587        * Create a stub against the master.  Retry if necessary.
1588        * @return A stub for making RPC calls against the master
1589        * @throws MasterNotRunningException
1590        */
1591       @edu.umd.cs.findbugs.annotations.SuppressWarnings (value="SWL_SLEEP_WITH_LOCK_HELD")
1592       Object makeStub() throws MasterNotRunningException {
1593         // The lock must be at the beginning to prevent multiple master creations
1594         //  (and leaks) in a multithreaded context
1595         synchronized (masterAndZKLock) {
1596           Exception exceptionCaught = null;
1597           Object stub = null;
1598           int tries = 0;
1599           while (!closed && stub == null) {
1600             tries++;
1601             try {
1602               stub = makeStubNoRetries();
1603             } catch (IOException e) {
1604               exceptionCaught = e;
1605             } catch (KeeperException e) {
1606               exceptionCaught = e;
1607             } catch (ServiceException e) {
1608               exceptionCaught = e;
1609             }
1610 
1611             if (exceptionCaught != null) {
1612               // It failed. If it's not the last try, we're going to wait a little
1613               if (tries < numTries && !ExceptionUtil.isInterrupt(exceptionCaught)) {
1614                 // tries at this point is 1 or more; decrement to start from 0.
1615                 long pauseTime = ConnectionUtils.getPauseTime(pause, tries - 1);
1616                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1617                     " failed; retrying after sleep of " + pauseTime + ", exception=" +
1618                   exceptionCaught);
1619 
1620                 try {
1621                   Thread.sleep(pauseTime);
1622                 } catch (InterruptedException e) {
1623                   throw new MasterNotRunningException(
1624                       "Thread was interrupted while trying to connect to master.", e);
1625                 }
                     // Clear the caught exception so that a success on the next
                     // attempt is not mistaken for another failure.
                     exceptionCaught = null;
1626               } else {
1627                 // Enough tries, we stop now
1628                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1629                     " failed; no more retrying.", exceptionCaught);
1630                 throw new MasterNotRunningException(exceptionCaught);
1631               }
                 }
1632           }
1633 
1634           if (stub == null) {
1635             // implies this.closed true
1636             throw new MasterNotRunningException("Connection was closed while trying to get master");
1637           }
1638           return stub;
1639         }
1640       }
1641     }
1642 
1643     /**
1644      * Class to make a MasterServiceStubMaker stub.
1645      */
1646     class MasterServiceStubMaker extends StubMaker {
1647       private MasterService.BlockingInterface stub;
1648       @Override
1649       protected String getServiceName() {
1650         return MasterService.getDescriptor().getName();
1651       }
1652 
1653       @Override
1654       @edu.umd.cs.findbugs.annotations.SuppressWarnings("SWL_SLEEP_WITH_LOCK_HELD")
1655       MasterService.BlockingInterface makeStub() throws MasterNotRunningException {
1656         return (MasterService.BlockingInterface)super.makeStub();
1657       }
1658 
1659       @Override
1660       protected Object makeStub(BlockingRpcChannel channel) {
1661         this.stub = MasterService.newBlockingStub(channel);
1662         return this.stub;
1663       }
1664 
1665       @Override
1666       protected void isMasterRunning() throws ServiceException {
1667         this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1668       }
1669     }
1670 
1671     @Override
1672     public AdminService.BlockingInterface getAdmin(final ServerName serverName)
1673         throws IOException {
1674       return getAdmin(serverName, false);
1675     }
1676 
1677     @Override
1678     // Nothing is done w/ the 'master' parameter.  It is ignored.
1679     public AdminService.BlockingInterface getAdmin(final ServerName serverName,
1680       final boolean master)
1681     throws IOException {
1682       if (isDeadServer(serverName)) {
1683         throw new RegionServerStoppedException(serverName + " is dead.");
1684       }
1685       String key = getStubKey(AdminService.BlockingInterface.class.getName(),
1686         serverName.getHostAndPort());
1687       this.connectionLock.putIfAbsent(key, key);
1688       AdminService.BlockingInterface stub = null;
1689       synchronized (this.connectionLock.get(key)) {
1690         stub = (AdminService.BlockingInterface)this.stubs.get(key);
1691         if (stub == null) {
1692           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(serverName,
1693             user, this.rpcTimeout);
1694           stub = AdminService.newBlockingStub(channel);
1695           this.stubs.put(key, stub);
1696         }
1697       }
1698       return stub;
1699     }
1700 
1701     @Override
1702     public ClientService.BlockingInterface getClient(final ServerName sn)
1703     throws IOException {
1704       if (isDeadServer(sn)) {
1705         throw new RegionServerStoppedException(sn + " is dead.");
1706       }
1707       String key = getStubKey(ClientService.BlockingInterface.class.getName(), sn.getHostAndPort());
1708       this.connectionLock.putIfAbsent(key, key);
1709       ClientService.BlockingInterface stub = null;
1710       synchronized (this.connectionLock.get(key)) {
1711         stub = (ClientService.BlockingInterface)this.stubs.get(key);
1712         if (stub == null) {
1713           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(sn,
1714             user, this.rpcTimeout);
1715           stub = ClientService.newBlockingStub(channel);
1716           // In old days, after getting stub/proxy, we'd make a call.  We are not doing that here.
1717           // Just fail on first actual call rather than in here on setup.
1718           this.stubs.put(key, stub);
1719         }
1720       }
1721       return stub;
1722     }
1723 
1724     static String getStubKey(final String serviceName, final String rsHostnamePort) {
1725       return serviceName + "@" + rsHostnamePort;
1726     }
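         // Editor's note (not in the original source): getAdmin() and getClient()
         // above share one caching idiom - a key of the form "serviceName@host:port",
         // a lock object interned via connectionLock.putIfAbsent(key, key), and a
         // check-then-create of the stub under that lock, so each (service, server)
         // pair builds at most one stub. A hypothetical key:
         //
         //   getStubKey(ClientService.BlockingInterface.class.getName(),
         //       "rs1.example.com:60020")
         //   // -> something like "...ClientProtos$ClientService$BlockingInterface
         //   //    @rs1.example.com:60020"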
1727 
1728     private ZooKeeperKeepAliveConnection keepAliveZookeeper;
1729     private AtomicInteger keepAliveZookeeperUserCount = new AtomicInteger(0);
1730     private boolean canCloseZKW = true;
1731 
1732     // keepAlive time, in ms. No reason to make it configurable.
1733     private static final long keepAlive = 5 * 60 * 1000;
1734 
1735     /**
1736      * Retrieve a shared ZooKeeperWatcher. You must close it once you have finished with it.
1737      * @return The shared instance. Never returns null.
1738      */
1739     ZooKeeperKeepAliveConnection getKeepAliveZooKeeperWatcher()
1740       throws IOException {
1741       synchronized (masterAndZKLock) {
1742         if (keepAliveZookeeper == null) {
1743           if (this.closed) {
1744             throw new IOException(toString() + " closed");
1745           }
1746           // We don't check that our link to ZooKeeper is still valid
1747           // But there is a retry mechanism in the ZooKeeperWatcher itself
1748           keepAliveZookeeper = new ZooKeeperKeepAliveConnection(conf, this.toString(), this);
1749         }
1750         keepAliveZookeeperUserCount.incrementAndGet();
1751         keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1752         return keepAliveZookeeper;
1753       }
1754     }
1755 
1756     void releaseZooKeeperWatcher(final ZooKeeperWatcher zkw) {
1757       if (zkw == null){
1758         return;
1759       }
1760       synchronized (masterAndZKLock) {
1761         if (keepAliveZookeeperUserCount.decrementAndGet() <= 0 ){
1762           keepZooKeeperWatcherAliveUntil = System.currentTimeMillis() + keepAlive;
1763         }
1764       }
1765     }
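         // Editor's illustration (not in the original source): the intended
         // acquire/release pairing for the shared watcher. Releasing only arms the
         // delayed-close timer; the DelayedClosing chore below performs the actual
         // close once keepAlive ms elapse with no users:
         //
         //   ZooKeeperKeepAliveConnection zkw = getKeepAliveZooKeeperWatcher();
         //   try {
         //     // ... read something from ZooKeeper ...
         //   } finally {
         //     zkw.close();  // routes to releaseZooKeeperWatcher(), not a real close
         //   }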
1766 
1767     /**
1768      * Creates a Chore thread to check the connections to master & zookeeper
1769      *  and close them when they reach their closing time (
1770      *  {@link MasterServiceState#keepAliveUntil} and
1771      *  {@link #keepZooKeeperWatcherAliveUntil}). Keep alive time is
1772      *  managed by the release functions and the variable {@link #keepAlive}
1773      */
1774     private static class DelayedClosing extends Chore implements Stoppable {
1775       private HConnectionImplementation hci;
1776       Stoppable stoppable;
1777 
1778       private DelayedClosing(
1779         HConnectionImplementation hci, Stoppable stoppable){
1780         super(
1781           "ZooKeeperWatcher and Master delayed closing for connection "+hci,
1782           60*1000, // We check every minute
1783           stoppable);
1784         this.hci = hci;
1785         this.stoppable = stoppable;
1786       }
1787 
1788       static DelayedClosing createAndStart(HConnectionImplementation hci){
1789         Stoppable stoppable = new Stoppable() {
1790               private volatile boolean isStopped = false;
1791               @Override public void stop(String why) { isStopped = true;}
1792               @Override public boolean isStopped() {return isStopped;}
1793             };
1794 
1795         return new DelayedClosing(hci, stoppable);
1796       }
1797 
1798       protected void closeMasterProtocol(MasterServiceState protocolState) {
1799         if (System.currentTimeMillis() > protocolState.keepAliveUntil) {
1800           hci.closeMasterService(protocolState);
1801           protocolState.keepAliveUntil = Long.MAX_VALUE;
1802         }
1803       }
1804 
1805       @Override
1806       protected void chore() {
1807         synchronized (hci.masterAndZKLock) {
1808           if (hci.canCloseZKW) {
1809             if (System.currentTimeMillis() >
1810               hci.keepZooKeeperWatcherAliveUntil) {
1811 
1812               hci.closeZooKeeperWatcher();
1813               hci.keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1814             }
1815           }
1816           closeMasterProtocol(hci.masterServiceState);
1818         }
1819       }
1820 
1821       @Override
1822       public void stop(String why) {
1823         stoppable.stop(why);
1824       }
1825 
1826       @Override
1827       public boolean isStopped() {
1828         return stoppable.isStopped();
1829       }
1830     }
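         // Editor's illustration (hypothetical timeline, not in the original
         // source), with keepAlive = 5 minutes and a 1-minute chore period:
         //
         //   t=0     releaseMaster(mss) drops userCount to 0
         //             -> mss.keepAliveUntil = now + keepAlive
         //   t=1..4  chore() wakes each minute; deadline not reached, nothing done
         //   t>=5    first chore() past the deadline calls closeMasterService(mss)
         //             and resets mss.keepAliveUntil to Long.MAX_VALUE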
1831 
1832     private void closeZooKeeperWatcher() {
1833       synchronized (masterAndZKLock) {
1834         if (keepAliveZookeeper != null) {
1835           LOG.info("Closing zookeeper sessionid=0x" +
1836             Long.toHexString(
1837               keepAliveZookeeper.getRecoverableZooKeeper().getSessionId()));
1838           keepAliveZookeeper.internalClose();
1839           keepAliveZookeeper = null;
1840         }
1841         keepAliveZookeeperUserCount.set(0);
1842       }
1843     }
1844 
1845     final MasterServiceState masterServiceState = new MasterServiceState(this);
1846 
1847     @Override
1848     public MasterService.BlockingInterface getMaster() throws MasterNotRunningException {
1849       return getKeepAliveMasterService();
1850     }
1851 
1852     private void resetMasterServiceState(final MasterServiceState mss) {
1853       mss.userCount++;
1854       mss.keepAliveUntil = Long.MAX_VALUE;
1855     }
1856 
1857     @Override
1858     public MasterKeepAliveConnection getKeepAliveMasterService()
1859     throws MasterNotRunningException {
1860       synchronized (masterAndZKLock) {
1861         if (!isKeepAliveMasterConnectedAndRunning(this.masterServiceState)) {
1862           MasterServiceStubMaker stubMaker = new MasterServiceStubMaker();
1863           this.masterServiceState.stub = stubMaker.makeStub();
1864         }
1865         resetMasterServiceState(this.masterServiceState);
1866       }
1867       // Ugly delegation just so we can add in a Close method.
1868       final MasterService.BlockingInterface stub = this.masterServiceState.stub;
1869       return new MasterKeepAliveConnection() {
1870         MasterServiceState mss = masterServiceState;
1871         @Override
1872         public AddColumnResponse addColumn(RpcController controller, AddColumnRequest request)
1873         throws ServiceException {
1874           return stub.addColumn(controller, request);
1875         }
1876 
1877         @Override
1878         public DeleteColumnResponse deleteColumn(RpcController controller,
1879             DeleteColumnRequest request)
1880         throws ServiceException {
1881           return stub.deleteColumn(controller, request);
1882         }
1883 
1884         @Override
1885         public ModifyColumnResponse modifyColumn(RpcController controller,
1886             ModifyColumnRequest request)
1887         throws ServiceException {
1888           return stub.modifyColumn(controller, request);
1889         }
1890 
1891         @Override
1892         public MoveRegionResponse moveRegion(RpcController controller,
1893             MoveRegionRequest request) throws ServiceException {
1894           return stub.moveRegion(controller, request);
1895         }
1896 
1897         @Override
1898         public DispatchMergingRegionsResponse dispatchMergingRegions(
1899             RpcController controller, DispatchMergingRegionsRequest request)
1900             throws ServiceException {
1901           return stub.dispatchMergingRegions(controller, request);
1902         }
1903 
1904         @Override
1905         public AssignRegionResponse assignRegion(RpcController controller,
1906             AssignRegionRequest request) throws ServiceException {
1907           return stub.assignRegion(controller, request);
1908         }
1909 
1910         @Override
1911         public UnassignRegionResponse unassignRegion(RpcController controller,
1912             UnassignRegionRequest request) throws ServiceException {
1913           return stub.unassignRegion(controller, request);
1914         }
1915 
1916         @Override
1917         public OfflineRegionResponse offlineRegion(RpcController controller,
1918             OfflineRegionRequest request) throws ServiceException {
1919           return stub.offlineRegion(controller, request);
1920         }
1921 
1922         @Override
1923         public DeleteTableResponse deleteTable(RpcController controller,
1924             DeleteTableRequest request) throws ServiceException {
1925           return stub.deleteTable(controller, request);
1926         }
1927 
1928         @Override
1929         public EnableTableResponse enableTable(RpcController controller,
1930             EnableTableRequest request) throws ServiceException {
1931           return stub.enableTable(controller, request);
1932         }
1933 
1934         @Override
1935         public DisableTableResponse disableTable(RpcController controller,
1936             DisableTableRequest request) throws ServiceException {
1937           return stub.disableTable(controller, request);
1938         }
1939 
1940         @Override
1941         public ModifyTableResponse modifyTable(RpcController controller,
1942             ModifyTableRequest request) throws ServiceException {
1943           return stub.modifyTable(controller, request);
1944         }
1945 
1946         @Override
1947         public CreateTableResponse createTable(RpcController controller,
1948             CreateTableRequest request) throws ServiceException {
1949           return stub.createTable(controller, request);
1950         }
1951 
1952         @Override
1953         public ShutdownResponse shutdown(RpcController controller,
1954             ShutdownRequest request) throws ServiceException {
1955           return stub.shutdown(controller, request);
1956         }
1957 
1958         @Override
1959         public StopMasterResponse stopMaster(RpcController controller,
1960             StopMasterRequest request) throws ServiceException {
1961           return stub.stopMaster(controller, request);
1962         }
1963 
1964         @Override
1965         public BalanceResponse balance(RpcController controller,
1966             BalanceRequest request) throws ServiceException {
1967           return stub.balance(controller, request);
1968         }
1969 
1970         @Override
1971         public SetBalancerRunningResponse setBalancerRunning(
1972             RpcController controller, SetBalancerRunningRequest request)
1973             throws ServiceException {
1974           return stub.setBalancerRunning(controller, request);
1975         }
1976 
1977         @Override
1978         public RunCatalogScanResponse runCatalogScan(RpcController controller,
1979             RunCatalogScanRequest request) throws ServiceException {
1980           return stub.runCatalogScan(controller, request);
1981         }
1982 
1983         @Override
1984         public EnableCatalogJanitorResponse enableCatalogJanitor(
1985             RpcController controller, EnableCatalogJanitorRequest request)
1986             throws ServiceException {
1987           return stub.enableCatalogJanitor(controller, request);
1988         }
1989 
1990         @Override
1991         public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(
1992             RpcController controller, IsCatalogJanitorEnabledRequest request)
1993             throws ServiceException {
1994           return stub.isCatalogJanitorEnabled(controller, request);
1995         }
1996 
1997         @Override
1998         public CoprocessorServiceResponse execMasterService(
1999             RpcController controller, CoprocessorServiceRequest request)
2000             throws ServiceException {
2001           return stub.execMasterService(controller, request);
2002         }
2003 
2004         @Override
2005         public SnapshotResponse snapshot(RpcController controller,
2006             SnapshotRequest request) throws ServiceException {
2007           return stub.snapshot(controller, request);
2008         }
2009 
2010         @Override
2011         public GetCompletedSnapshotsResponse getCompletedSnapshots(
2012             RpcController controller, GetCompletedSnapshotsRequest request)
2013             throws ServiceException {
2014           return stub.getCompletedSnapshots(controller, request);
2015         }
2016 
2017         @Override
2018         public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2019             DeleteSnapshotRequest request) throws ServiceException {
2020           return stub.deleteSnapshot(controller, request);
2021         }
2022 
2023         @Override
2024         public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2025             IsSnapshotDoneRequest request) throws ServiceException {
2026           return stub.isSnapshotDone(controller, request);
2027         }
2028 
2029         @Override
2030         public RestoreSnapshotResponse restoreSnapshot(
2031             RpcController controller, RestoreSnapshotRequest request)
2032             throws ServiceException {
2033           return stub.restoreSnapshot(controller, request);
2034         }
2035 
2036         @Override
2037         public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(
2038             RpcController controller, IsRestoreSnapshotDoneRequest request)
2039             throws ServiceException {
2040           return stub.isRestoreSnapshotDone(controller, request);
2041         }
2042 
2043         @Override
2044         public ExecProcedureResponse execProcedure(
2045             RpcController controller, ExecProcedureRequest request)
2046             throws ServiceException {
2047           return stub.execProcedure(controller, request);
2048         }
2049 
2050         @Override
2051         public IsProcedureDoneResponse isProcedureDone(RpcController controller,
2052             IsProcedureDoneRequest request) throws ServiceException {
2053           return stub.isProcedureDone(controller, request);
2054         }
2055 
2056         @Override
2057         public IsMasterRunningResponse isMasterRunning(
2058             RpcController controller, IsMasterRunningRequest request)
2059             throws ServiceException {
2060           return stub.isMasterRunning(controller, request);
2061         }
2062 
2063         @Override
2064         public ModifyNamespaceResponse modifyNamespace(RpcController controller,
2065             ModifyNamespaceRequest request)
2066         throws ServiceException {
2067           return stub.modifyNamespace(controller, request);
2068         }
2069 
2070         @Override
2071         public CreateNamespaceResponse createNamespace(RpcController controller,
                 CreateNamespaceRequest request) throws ServiceException {
2072           return stub.createNamespace(controller, request);
2073         }
2074 
2075         @Override
2076         public DeleteNamespaceResponse deleteNamespace(RpcController controller,
                 DeleteNamespaceRequest request) throws ServiceException {
2077           return stub.deleteNamespace(controller, request);
2078         }
2079 
2080         @Override
2081         public GetNamespaceDescriptorResponse getNamespaceDescriptor(RpcController controller,
                 GetNamespaceDescriptorRequest request) throws ServiceException {
2082           return stub.getNamespaceDescriptor(controller, request);
2083         }
2084 
2085         @Override
2086         public ListNamespaceDescriptorsResponse listNamespaceDescriptors(RpcController controller,
                 ListNamespaceDescriptorsRequest request) throws ServiceException {
2087           return stub.listNamespaceDescriptors(controller, request);
2088         }
2089 
2090         @Override
2091         public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(
                 RpcController controller, ListTableDescriptorsByNamespaceRequest request)
                 throws ServiceException {
2092           return stub.listTableDescriptorsByNamespace(controller, request);
2093         }
2094 
2095         @Override
2096         public ListTableNamesByNamespaceResponse listTableNamesByNamespace(RpcController controller,
2097               ListTableNamesByNamespaceRequest request) throws ServiceException {
2098           return stub.listTableNamesByNamespace(controller, request);
2099         }
2100 
2101         @Override
2102         public void close() {
2103           release(this.mss);
2104         }
2105 
2106         @Override
2107         public GetSchemaAlterStatusResponse getSchemaAlterStatus(
2108             RpcController controller, GetSchemaAlterStatusRequest request)
2109             throws ServiceException {
2110           return stub.getSchemaAlterStatus(controller, request);
2111         }
2112 
2113         @Override
2114         public GetTableDescriptorsResponse getTableDescriptors(
2115             RpcController controller, GetTableDescriptorsRequest request)
2116             throws ServiceException {
2117           return stub.getTableDescriptors(controller, request);
2118         }
2119 
2120         @Override
2121         public GetTableNamesResponse getTableNames(
2122             RpcController controller, GetTableNamesRequest request)
2123             throws ServiceException {
2124           return stub.getTableNames(controller, request);
2125         }
2126 
2127         @Override
2128         public GetClusterStatusResponse getClusterStatus(
2129             RpcController controller, GetClusterStatusRequest request)
2130             throws ServiceException {
2131           return stub.getClusterStatus(controller, request);
2132         }
2133 
2134         @Override
2135         public TruncateTableResponse truncateTable(RpcController controller,
2136             TruncateTableRequest request) throws ServiceException {
2137           return stub.truncateTable(controller, request);
2138         }
2139       };
2140     }
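         // Editor's illustration (not in the original source; listTables() below
         // follows this exact shape): each getKeepAliveMasterService() call must be
         // paired with close(), which decrements the user count and arms the
         // delayed close instead of tearing the connection down:
         //
         //   MasterKeepAliveConnection master = getKeepAliveMasterService();
         //   try {
         //     // ... issue master RPCs through the returned stub ...
         //   } finally {
         //     master.close();  // release(mss) -> releaseMaster(mss)
         //   }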
2141 
2142 
2143     private static void release(MasterServiceState mss) {
2144       if (mss != null && mss.connection != null) {
2145         ((HConnectionImplementation)mss.connection).releaseMaster(mss);
2146       }
2147     }
2148 
2149     private boolean isKeepAliveMasterConnectedAndRunning(MasterServiceState mss) {
2150       if (mss.getStub() == null){
2151         return false;
2152       }
2153       try {
2154         return mss.isMasterRunning();
2155       } catch (UndeclaredThrowableException e) {
2156         // It's somewhat messy, but we can receive exceptions such as
2157         //  java.net.ConnectException that are not declared. So we catch them here.
2158         LOG.info("Master connection is not running anymore", e.getUndeclaredThrowable());
2159         return false;
2160       } catch (ServiceException se) {
2161         LOG.warn("Checking master connection", se);
2162         return false;
2163       }
2164     }
2165 
2166     void releaseMaster(MasterServiceState mss) {
2167       if (mss.getStub() == null) return;
2168       synchronized (masterAndZKLock) {
2169         --mss.userCount;
2170         if (mss.userCount <= 0) {
2171           mss.keepAliveUntil = System.currentTimeMillis() + keepAlive;
2172         }
2173       }
2174     }
2175 
2176     private void closeMasterService(MasterServiceState mss) {
2177       if (mss.getStub() != null) {
2178         LOG.info("Closing master protocol: " + mss);
2179         mss.clearStub();
2180       }
2181       mss.userCount = 0;
2182     }
2183 
2184     /**
2185      * Immediate close of the shared master. Can be by the delayed close or when closing the
2186      * connection itself.
2187      */
2188     private void closeMaster() {
2189       synchronized (masterAndZKLock) {
2190         closeMasterService(masterServiceState);
2191       }
2192     }
2193 
2194     void updateCachedLocation(HRegionInfo hri, HRegionLocation source,
2195                               ServerName serverName, long seqNum) {
2196       HRegionLocation newHrl = new HRegionLocation(hri, serverName, seqNum);
2197       cacheLocation(hri.getTable(), source, newHrl);
2198     }
2199 
2200     /**
2201      * Deletes the cached location of the region if necessary, based on some error from source.
2202      * @param hri The region in question.
2203      * @param source The source of the error that prompts us to invalidate cache.
2204      */
2205     void deleteCachedLocation(HRegionInfo hri, HRegionLocation source) {
2206       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(hri.getTable());
2207       tableLocations.remove(hri.getStartKey(), source);
2208     }
2209 
2210     @Override
2211     public void deleteCachedRegionLocation(final HRegionLocation location) {
2212       if (location == null) {
2213         return;
2214       }
2215 
2216       HRegionLocation removedLocation;
2217       TableName tableName = location.getRegionInfo().getTable();
2218       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
2219       removedLocation = tableLocations.remove(location.getRegionInfo().getStartKey());
2220       if (LOG.isDebugEnabled() && removedLocation != null) {
2221         LOG.debug("Removed " +
2222             location.getRegionInfo().getRegionNameAsString() +
2223             " for tableName=" + tableName +
2224             " from cache");
2225       }
2226     }
2227 
2228     /**
2229      * Update the location with the new value (if the exception is a RegionMovedException)
2230      * or delete it from the cache. Does nothing if we can be sure from the exception that
2231      * the location is still accurate, or if the cache has already been updated.
2232      * @param exception an object (to simplify user code) in which we will look for a nested
2233      *                  or wrapped (or both) RegionMovedException
2234      * @param source server that is the source of the location update.
2235      */
2236     @Override
2237     public void updateCachedLocations(final TableName tableName, byte[] rowkey,
2238       final Object exception, final HRegionLocation source) {
2239       if (rowkey == null || tableName == null) {
2240         LOG.warn("Coding error, see method javadoc. row=" + (rowkey == null ? "null" : rowkey) +
2241             ", tableName=" + (tableName == null ? "null" : tableName));
2242         return;
2243       }
2244 
2245       if (source == null || source.getServerName() == null){
2246         // This should not happen, but let's secure ourselves.
2247         return;
2248       }
2249 
2250       // Is it something we have already updated?
2251       final HRegionLocation oldLocation = getCachedLocation(tableName, rowkey);
2252       if (oldLocation == null || !source.getServerName().equals(oldLocation.getServerName())) {
2253         // There is no such location in the cache (it's been removed already) or
2254         // the cache has already been refreshed with a different location.  => nothing to do
2255         return;
2256       }
2257 
2258       HRegionInfo regionInfo = oldLocation.getRegionInfo();
2259       Throwable cause = findException(exception);
2260       if (cause != null) {
2261         if (cause instanceof RegionTooBusyException || cause instanceof RegionOpeningException) {
2262           // We know that the region is still on this region server
2263           return;
2264         }
2265 
2266         if (cause instanceof RegionMovedException) {
2267           RegionMovedException rme = (RegionMovedException) cause;
2268           if (LOG.isTraceEnabled()) {
2269             LOG.trace("Region " + regionInfo.getRegionNameAsString() + " moved to " +
2270                 rme.getHostname() + ":" + rme.getPort() +
2271                 " according to " + source.getHostnamePort());
2272           }
2273           // We know that the region is not anymore on this region server, but we know
2274           //  the new location.
2275           updateCachedLocation(
2276               regionInfo, source, rme.getServerName(), rme.getLocationSeqNum());
2277           return;
2278         }
2279       }
2280 
2281       // If we're here, it means that we cannot be sure about the location, so we remove it
2282       //  from the cache.
2283       deleteCachedLocation(regionInfo, source);
2284     }
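         // Editor's illustration (not in the original source): how a caller feeds
         // an error back into the cache. 'row' and 'oldLocation' are hypothetical
         // values captured before the failed call:
         //
         //   try {
         //     // ... RPC against oldLocation ...
         //   } catch (IOException ioe) {
         //     // RegionMovedException -> repoint the cache at the new server;
         //     // RegionTooBusy/RegionOpening -> keep the entry; otherwise drop it.
         //     updateCachedLocations(tableName, row, ioe, oldLocation);
         //   }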
2285 
2286     @Override
2287     public void updateCachedLocations(final byte[] tableName, byte[] rowkey,
2288       final Object exception, final HRegionLocation source) {
2289       updateCachedLocations(TableName.valueOf(tableName), rowkey, exception, source);
2290     }
2291 
2292     @Override
2293     @Deprecated
2294     public void processBatch(List<? extends Row> list,
2295         final TableName tableName,
2296         ExecutorService pool,
2297         Object[] results) throws IOException, InterruptedException {
2298       // This belongs in HTable!!! Not in here.  St.Ack
2299 
2300       // results must be the same size as list
2301       if (results.length != list.size()) {
2302         throw new IllegalArgumentException(
2303           "argument results must be the same size as argument list");
2304       }
2305       processBatchCallback(list, tableName, pool, results, null);
2306     }
2307 
2308     @Override
2309     @Deprecated
2310     public void processBatch(List<? extends Row> list,
2311         final byte[] tableName,
2312         ExecutorService pool,
2313         Object[] results) throws IOException, InterruptedException {
2314       processBatch(list, TableName.valueOf(tableName), pool, results);
2315     }
2316 
2317     /**
2318      * Send the queries in parallel on the different region servers. Retries on failures.
2319      * If the method returns it means that there is no error, and the 'results' array will
2320      * contain no exception. On error, an exception is thrown, and the 'results' array will
2321      * contain results and exceptions.
2322      * @deprecated since 0.96 - Use {@link HTable#processBatchCallback} instead
2323      */
2324     @Override
2325     @Deprecated
2326     public <R> void processBatchCallback(
2327       List<? extends Row> list,
2328       TableName tableName,
2329       ExecutorService pool,
2330       Object[] results,
2331       Batch.Callback<R> callback)
2332       throws IOException, InterruptedException {
2333 
2334       // To fulfill the original contract, we have a special callback. This callback
2335       //  will set the results in the Object array.
2336       ObjectResultFiller<R> cb = new ObjectResultFiller<R>(results, callback);
2337       AsyncProcess<?> asyncProcess = createAsyncProcess(tableName, pool, cb, conf);
2338 
2339       // We're doing a submit all. This way, the originalIndex will match the initial list.
2340       asyncProcess.submitAll(list);
2341       asyncProcess.waitUntilDone();
2342 
2343       if (asyncProcess.hasError()) {
2344         throw asyncProcess.getErrors();
2345       }
2346     }
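         // Editor's illustration (not in the original source; the API is deprecated
         // in favor of HTable#processBatchCallback). 'puts' and 'pool' are
         // hypothetical:
         //
         //   List<Put> puts = ...;  // Put implements Row
         //   Object[] results = new Object[puts.size()];
         //   connection.processBatch(puts, tableName, pool, results);
         //   // Returning normally means no errors; on exception, results holds a
         //   // mix of per-operation results and Throwables, index-aligned with puts.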
2347 
2348     @Override
2349     @Deprecated
2350     public <R> void processBatchCallback(
2351       List<? extends Row> list,
2352       byte[] tableName,
2353       ExecutorService pool,
2354       Object[] results,
2355       Batch.Callback<R> callback)
2356       throws IOException, InterruptedException {
2357       processBatchCallback(list, TableName.valueOf(tableName), pool, results, callback);
2358     }
2359 
2360     // For tests.
2361     protected <R> AsyncProcess createAsyncProcess(TableName tableName, ExecutorService pool,
2362            AsyncProcess.AsyncProcessCallback<R> callback, Configuration conf) {
2363       return new AsyncProcess<R>(this, tableName, pool, callback, conf,
2364           RpcRetryingCallerFactory.instantiate(conf), RpcControllerFactory.instantiate(conf));
2365     }
2366 
2367 
2368     /**
2369      * Fill the result array for the interfaces using it.
2370      */
2371     private static class ObjectResultFiller<Res>
2372         implements AsyncProcess.AsyncProcessCallback<Res> {
2373 
2374       private final Object[] results;
2375       private Batch.Callback<Res> callback;
2376 
2377       ObjectResultFiller(Object[] results, Batch.Callback<Res> callback) {
2378         this.results = results;
2379         this.callback = callback;
2380       }
2381 
2382       @Override
2383       public void success(int pos, byte[] region, Row row, Res result) {
2384         assert pos < results.length;
2385         results[pos] = result;
2386         if (callback != null) {
2387           callback.update(region, row.getRow(), result);
2388         }
2389       }
2390 
2391       @Override
2392       public boolean failure(int pos, byte[] region, Row row, Throwable t) {
2393         assert pos < results.length;
2394         results[pos] = t;
2395         //Batch.Callback<Res> was not called on failure in 0.94. We keep this.
2396         return true; // we want to have this failure in the failures list.
2397       }
2398 
2399       @Override
2400       public boolean retriableFailure(int originalIndex, Row row, byte[] region,
2401                                       Throwable exception) {
2402         return true; // we retry
2403       }
2404     }
2405 
2406 
2407     /*
2408      * Return the number of cached regions for a table. It will only be called
2409      * from a unit test.
2410      */
2411     int getNumberOfCachedRegionLocations(final TableName tableName) {
2412       Map<byte[], HRegionLocation> tableLocs = this.cachedRegionLocations.get(tableName);
2413       if (tableLocs == null) {
2414         return 0;
2415       }
2416       return tableLocs.values().size();
2417     }
2418 
2419     /**
2420      * Check the region cache to see whether a region is cached yet or not.
2421      * Called by unit tests.
2422      * @param tableName tableName
2423      * @param row row
2424      * @return Region cached or not.
2425      */
2426     boolean isRegionCached(TableName tableName, final byte[] row) {
2427       HRegionLocation location = getCachedLocation(tableName, row);
2428       return location != null;
2429     }
2430 
2431     @Override
2432     public void setRegionCachePrefetch(final TableName tableName,
2433         final boolean enable) {
2434       if (!enable) {
2435         regionCachePrefetchDisabledTables.add(Bytes.mapKey(tableName.getName()));
2436       } else {
2438         regionCachePrefetchDisabledTables.remove(Bytes.mapKey(tableName.getName()));
2439       }
2440     }
2441 
2442     @Override
2443     public void setRegionCachePrefetch(final byte[] tableName,
2444         final boolean enable) {
2445       setRegionCachePrefetch(TableName.valueOf(tableName), enable);
2446     }
2447 
2448     @Override
2449     public boolean getRegionCachePrefetch(TableName tableName) {
2450       return usePrefetch &&
2451           !regionCachePrefetchDisabledTables.contains(Bytes.mapKey(tableName.getName()));
2452     }
2453 
2454     @Override
2455     public boolean getRegionCachePrefetch(byte[] tableName) {
2456       return getRegionCachePrefetch(TableName.valueOf(tableName));
2457     }
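         // Editor's illustration (not in the original source): prefetch is on by
         // default and tracked as a per-table opt-out set (assuming the global
         // usePrefetch flag is set), so toggling looks like:
         //
         //   connection.setRegionCachePrefetch(TableName.valueOf("myTable"), false);
         //   assert !connection.getRegionCachePrefetch(TableName.valueOf("myTable"));
         //   connection.setRegionCachePrefetch(TableName.valueOf("myTable"), true);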
2458 
2459     @Override
2460     public void abort(final String msg, Throwable t) {
2461       if (t instanceof KeeperException.SessionExpiredException
2462         && keepAliveZookeeper != null) {
2463         synchronized (masterAndZKLock) {
2464           if (keepAliveZookeeper != null) {
2465             LOG.warn("This client just lost it's session with ZooKeeper," +
2466               " closing it." +
2467               " It will be recreated next time someone needs it", t);
2468             closeZooKeeperWatcher();
2469           }
2470         }
2471       } else {
2472         if (t != null) {
2473           LOG.fatal(msg, t);
2474         } else {
2475           LOG.fatal(msg);
2476         }
2477         this.aborted = true;
2478         close();
2479         this.closed = true;
2480       }
2481     }
2482 
2483     @Override
2484     public boolean isClosed() {
2485       return this.closed;
2486     }
2487 
2488     @Override
2489     public boolean isAborted(){
2490       return this.aborted;
2491     }
2492 
2493     @Override
2494     public int getCurrentNrHRS() throws IOException {
2495       return this.registry.getCurrentNrHRS();
2496     }
2497 
2498     /**
2499      * Increment this client's reference count.
2500      */
2501     void incCount() {
2502       ++refCount;
2503     }
2504 
2505     /**
2506      * Decrement this client's reference count.
2507      */
2508     void decCount() {
2509       if (refCount > 0) {
2510         --refCount;
2511       }
2512     }
2513 
2514     /**
2515      * Return whether this client has no references left
2516      *
2517      * @return true if this client has no reference; false otherwise
2518      */
2519     boolean isZeroReference() {
2520       return refCount == 0;
2521     }
2522 
2523     void internalClose() {
2524       if (this.closed) {
2525         return;
2526       }
2527       delayedClosing.stop("Closing connection");
2528       closeMaster();
2529       shutdownBatchPool();
2530       this.closed = true;
2531       closeZooKeeperWatcher();
2532       this.stubs.clear();
2533       if (clusterStatusListener != null) {
2534         clusterStatusListener.close();
2535       }
2536       if (rpcClient != null) {
2537         rpcClient.stop();
2538       }
2539     }
2540 
2541     @Override
2542     public void close() {
2543       if (managed) {
2544         if (aborted) {
2545           HConnectionManager.deleteStaleConnection(this);
2546         } else {
2547           HConnectionManager.deleteConnection(this, false);
2548         }
2549       } else {
2550         internalClose();
2551       }
2552     }
2553 
2554     /**
2555      * Close the connection for good, regardless of what the current value of
2556      * {@link #refCount} is. Ideally, {@link #refCount} should be zero at this
2557      * point, which would be the case if all of its consumers close the
2558      * connection. However, on the off chance that someone is unable to close
2559      * the connection, perhaps because it bailed out prematurely, the method
2560      * below will ensure that this {@link HConnection} instance is cleaned up.
2561      * Caveat: The JVM may take an unknown amount of time to call finalize on an
2562      * unreachable object, so our hope is that every consumer cleans up after
2563      * itself, like any good citizen.
2564      */
2565     @Override
2566     protected void finalize() throws Throwable {
2567       super.finalize();
2568       // Pretend as if we are about to release the last remaining reference
2569       refCount = 1;
2570       close();
2571     }
2572 
2573     @Override
2574     public HTableDescriptor[] listTables() throws IOException {
2575       MasterKeepAliveConnection master = getKeepAliveMasterService();
2576       try {
2577         GetTableDescriptorsRequest req =
2578           RequestConverter.buildGetTableDescriptorsRequest((List<TableName>)null);
2579         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2580       } catch (ServiceException se) {
2581         throw ProtobufUtil.getRemoteException(se);
2582       } finally {
2583         master.close();
2584       }
2585     }
2586 
2587     @Override
2588     public String[] getTableNames() throws IOException {
2589       TableName[] tableNames = listTableNames();
2590       String result[] = new String[tableNames.length];
2591       for (int i = 0; i < tableNames.length; i++) {
2592         result[i] = tableNames[i].getNameAsString();
2593       }
2594       return result;
2595     }
2596 
2597     @Override
2598     public TableName[] listTableNames() throws IOException {
2599       MasterKeepAliveConnection master = getKeepAliveMasterService();
2600       try {
2601         return ProtobufUtil.getTableNameArray(master.getTableNames(null,
2602             GetTableNamesRequest.newBuilder().build())
2603           .getTableNamesList());
2604       } catch (ServiceException se) {
2605         throw ProtobufUtil.getRemoteException(se);
2606       } finally {
2607         master.close();
2608       }
2609     }
2610 
2611     @Override
2612     public HTableDescriptor[] getHTableDescriptorsByTableName(
2613         List<TableName> tableNames) throws IOException {
2614       if (tableNames == null || tableNames.isEmpty()) return new HTableDescriptor[0];
2615       MasterKeepAliveConnection master = getKeepAliveMasterService();
2616       try {
2617         GetTableDescriptorsRequest req =
2618           RequestConverter.buildGetTableDescriptorsRequest(tableNames);
2619         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2620       } catch (ServiceException se) {
2621         throw ProtobufUtil.getRemoteException(se);
2622       } finally {
2623         master.close();
2624       }
2625     }
2626 
2627     @Override
2628     public HTableDescriptor[] getHTableDescriptors(
2629         List<String> names) throws IOException {
2630       List<TableName> tableNames = new ArrayList<TableName>(names.size());
2631       for (String name : names) {
2632         tableNames.add(TableName.valueOf(name));
2633       }
2634 
2635       return getHTableDescriptorsByTableName(tableNames);
2636     }
2637 
2638     @Override
2639     public NonceGenerator getNonceGenerator() {
2640       return this.nonceGenerator;
2641     }
2642 
2643     /**
2644      * Connects to the master to get the table descriptor.
2645      * @param tableName table name
2646      * @return the table descriptor for the given table name
2647      * @throws IOException if the connection to master fails or if the table
2648      *  is not found.
2649      */
2650     @Override
2651     public HTableDescriptor getHTableDescriptor(final TableName tableName)
2652     throws IOException {
2653       if (tableName == null) return null;
2654       MasterKeepAliveConnection master = getKeepAliveMasterService();
2655       GetTableDescriptorsResponse htds;
2656       try {
2657         GetTableDescriptorsRequest req =
2658           RequestConverter.buildGetTableDescriptorsRequest(tableName);
2659         htds = master.getTableDescriptors(null, req);
2660       } catch (ServiceException se) {
2661         throw ProtobufUtil.getRemoteException(se);
2662       } finally {
2663         master.close();
2664       }
2665       if (!htds.getTableSchemaList().isEmpty()) {
2666         return HTableDescriptor.convert(htds.getTableSchemaList().get(0));
2667       }
2668       throw new TableNotFoundException(tableName.getNameAsString());
2669     }
2670 
2671     @Override
2672     public HTableDescriptor getHTableDescriptor(final byte[] tableName)
2673     throws IOException {
2674       return getHTableDescriptor(TableName.valueOf(tableName));
2675     }
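    /*
     * Hedged usage sketch: a missing table surfaces as TableNotFoundException
     * rather than a null return (null is only returned for a null name):
     *
     *   try {
     *     HTableDescriptor htd = conn.getHTableDescriptor(TableName.valueOf("myTable"));
     *     // ... inspect htd ...
     *   } catch (TableNotFoundException tnfe) {
     *     // the table does not exist on this cluster
     *   }
     */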
2676   }
2677 
2678   /**
2679    * The record of errors for servers.
2680    */
2681   static class ServerErrorTracker {
2682     // We need a concurrent map here, as we could have multiple threads updating it in parallel.
2683     private final ConcurrentMap<HRegionLocation, ServerErrors> errorsByServer =
2684         new ConcurrentHashMap<HRegionLocation, ServerErrors>();
2685     private final long canRetryUntil;
2686     private final int maxRetries;
2687     private final String startTrackingTime;
2688 
2689     public ServerErrorTracker(long timeout, int maxRetries) {
2690       this.maxRetries = maxRetries;
2691       this.canRetryUntil = EnvironmentEdgeManager.currentTimeMillis() + timeout;
2692       this.startTrackingTime = new Date().toString();
2693     }
2694 
2695     /**
2696      * We stop retrying only once we have exhausted BOTH the number of retries and the time allocated.
2697      */
2698     boolean canRetryMore(int numRetry) {
2699       // If only a single try is configured, elapsed time is not taken into account.
2700       return numRetry < maxRetries || (maxRetries > 1 &&
2701           EnvironmentEdgeManager.currentTimeMillis() < this.canRetryUntil);
2702     }
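    /*
     * Worked example with illustrative numbers: given maxRetries = 10 and a 60s
     * window, canRetryMore(10) is still true while the window is open, because
     * BOTH budgets must be spent before we give up. With maxRetries = 1 the time
     * clause is disabled, so canRetryMore(1) is false: one try means one attempt.
     */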
2703 
2704     /**
2705      * Calculates the back-off time for a retrying request to a particular server.
2706      *
2707      * @param server    The server in question.
2708      * @param basePause The configured client pause, used as the base back-off unit.
2709      * @return The time to wait before sending the next request.
2710      */
2711     long calculateBackoffTime(HRegionLocation server, long basePause) {
2712       long result;
2713       ServerErrors errorStats = errorsByServer.get(server);
2714       if (errorStats != null) {
2715         result = ConnectionUtils.getPauseTime(basePause, errorStats.retries.get());
2716       } else {
2717         result = 0; // yes, if the server is not in our list we don't wait before retrying.
2718       }
2719       return result;
2720     }
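    /*
     * Worked example, assuming ConnectionUtils.getPauseTime() scales the base
     * pause by HConstants.RETRY_BACKOFF (roughly {1, 2, 3, 5, 10, 20, 40, ...}):
     * with basePause = 100ms, a server carrying 3 recorded errors waits on the
     * order of 100 * 5 = 500ms, while an unknown server waits 0ms. Exact values
     * can include jitter depending on the HBase version.
     */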
2721 
2722     /**
2723      * Reports an error on the given server so the necessary bookkeeping can be done.
2724      *
2725      * @param server The server in question.
2726      */
2727     void reportServerError(HRegionLocation server) {
2728       ServerErrors errors = errorsByServer.get(server);
2729       if (errors != null) {
2730         errors.addError();
2731       } else {
2732         errors = errorsByServer.putIfAbsent(server, new ServerErrors());
2733         if (errors != null) {
2734           errors.addError();
2735         }
2736       }
2737     }
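    /*
     * Note on the putIfAbsent dance above: two threads can race to create the
     * first ServerErrors entry for a server. putIfAbsent() returns null to the
     * winner, whose fresh entry simply starts at zero recorded retries, and
     * returns the existing entry to the loser, which then records its error on
     * that shared instance.
     */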
2738 
2739     String getStartTrackingTime() {
2740       return startTrackingTime;
2741     }
2742 
2743     /**
2744      * The record of errors for a server.
2745      */
2746     private static class ServerErrors {
2747       public final AtomicInteger retries = new AtomicInteger(0);
2748 
2749       public void addError() {
2750         retries.incrementAndGet();
2751       }
2752     }
2753   }
2754 
2755   /**
2756    * Looks through a remote exception for an exception we know how to handle:
2757    * - hadoop.ipc wrapped exceptions are unwrapped
2758    * - nested causes are walked
2759    *
2760    * Looks for: RegionMovedException / RegionOpeningException / RegionTooBusyException
2761    * @return the matching exception if we found one, null otherwise.
2762    */
2763   public static Throwable findException(Object exception) {
2764     if (!(exception instanceof Throwable)) { // instanceof is also false for null
2765       return null;
2766     }
2767     Throwable cur = (Throwable) exception;
2768     while (cur != null) {
2769       if (cur instanceof RegionMovedException || cur instanceof RegionOpeningException
2770           || cur instanceof RegionTooBusyException) {
2771         return cur;
2772       }
2773       if (cur instanceof RemoteException) {
2774         RemoteException re = (RemoteException) cur;
2775         cur = re.unwrapRemoteException(
2776             RegionOpeningException.class, RegionMovedException.class,
2777             RegionTooBusyException.class);
2778         if (cur == null) {
2779           cur = re.unwrapRemoteException();
2780         }
2781         // unwrapRemoteException can return the exception given as a parameter when it cannot
2782         //  unwrap it. In this case, there is no need to look further
2783         // noinspection ObjectEquality
2784         if (cur == re) {
2785           return null;
2786         }
2787       } else {
2788         cur = cur.getCause();
2789       }
2790     }
2791 
2792     return null;
2793   }
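  /*
   * Hedged usage sketch: callers typically use this to decide whether a failure
   * is a transient, region-level condition worth a cache refresh and retry. The
   * variable name below is illustrative:
   *
   *   Throwable t = findException(caught);
   *   if (t instanceof RegionMovedException) {
   *     // update the cached location from the exception, then retry
   *   } else if (t instanceof RegionOpeningException
   *       || t instanceof RegionTooBusyException) {
   *     // region temporarily unavailable; back off and retry
   *   }
   */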
2794 
2795   /**
2796    * Set the number of retries to use server-side when talking to another server over
2797    * {@link HConnection}, e.g. when updating catalog tables. Call before creating any Connections.
2798    * @param c The Configuration instance to set the retries into.
2799    * @param sn Server name; used only as a prefix for the log message.
2800    * @param log Used to log what we set in here.
2801    */
2802   public static void setServerSideHConnectionRetries(final Configuration c, final String sn,
2803       final Log log) {
2804     int hcRetries = c.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
2805       HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
2806     // Go big.  Multiply by 10.  If we can't get to meta after this many retries
2807     // then something is seriously wrong.
2808     int serversideMultiplier = c.getInt("hbase.client.serverside.retries.multiplier", 10);
2809     int retries = hcRetries * serversideMultiplier;
2810     c.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
2811     log.debug(sn + " HConnection server-to-server retries=" + retries);
2812   }
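  /*
   * Worked example with illustrative numbers: if hbase.client.retries.number
   * resolves to 10 and the multiplier keeps its default of 10, the Configuration
   * handed in leaves this method with 10 * 10 = 100 retries for
   * server-to-server HConnection traffic.
   */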
2813 }