
1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.client;
20  
21  import static org.apache.hadoop.hbase.client.MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY;
22  
23  import java.io.Closeable;
24  import java.io.IOException;
25  import java.io.InterruptedIOException;
26  import java.lang.reflect.Constructor;
27  import java.lang.reflect.UndeclaredThrowableException;
28  import java.net.SocketException;
29  import java.util.ArrayList;
30  import java.util.Date;
31  import java.util.HashSet;
32  import java.util.LinkedHashMap;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.Map.Entry;
36  import java.util.NavigableMap;
37  import java.util.Set;
38  import java.util.concurrent.ConcurrentHashMap;
39  import java.util.concurrent.ConcurrentMap;
40  import java.util.concurrent.ConcurrentSkipListMap;
41  import java.util.concurrent.ConcurrentSkipListSet;
42  import java.util.concurrent.CopyOnWriteArraySet;
43  import java.util.concurrent.ExecutorService;
44  import java.util.concurrent.LinkedBlockingQueue;
45  import java.util.concurrent.ThreadPoolExecutor;
46  import java.util.concurrent.TimeUnit;
47  import java.util.concurrent.atomic.AtomicBoolean;
48  import java.util.concurrent.atomic.AtomicInteger;
49  
50  import org.apache.commons.logging.Log;
51  import org.apache.commons.logging.LogFactory;
52  import org.apache.hadoop.hbase.classification.InterfaceAudience;
53  import org.apache.hadoop.hbase.classification.InterfaceStability;
54  import org.apache.hadoop.conf.Configuration;
55  import org.apache.hadoop.hbase.Chore;
56  import org.apache.hadoop.hbase.HBaseConfiguration;
57  import org.apache.hadoop.hbase.HConstants;
58  import org.apache.hadoop.hbase.HRegionInfo;
59  import org.apache.hadoop.hbase.HRegionLocation;
60  import org.apache.hadoop.hbase.HTableDescriptor;
61  import org.apache.hadoop.hbase.MasterNotRunningException;
62  import org.apache.hadoop.hbase.RegionTooBusyException;
63  import org.apache.hadoop.hbase.ServerName;
64  import org.apache.hadoop.hbase.Stoppable;
65  import org.apache.hadoop.hbase.TableName;
66  import org.apache.hadoop.hbase.TableNotEnabledException;
67  import org.apache.hadoop.hbase.TableNotFoundException;
68  import org.apache.hadoop.hbase.ZooKeeperConnectionException;
69  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
70  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
71  import org.apache.hadoop.hbase.client.backoff.ClientBackoffPolicy;
72  import org.apache.hadoop.hbase.client.backoff.ClientBackoffPolicyFactory;
73  import org.apache.hadoop.hbase.client.coprocessor.Batch;
74  import org.apache.hadoop.hbase.exceptions.RegionMovedException;
75  import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
76  import org.apache.hadoop.hbase.ipc.RpcClient;
77  import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
78  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
79  import org.apache.hadoop.hbase.protobuf.RequestConverter;
80  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
81  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService;
82  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
83  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
84  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnRequest;
85  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
86  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionRequest;
87  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
88  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceRequest;
89  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
90  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceRequest;
91  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceResponse;
92  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableRequest;
93  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
94  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnRequest;
95  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
96  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceRequest;
97  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceResponse;
98  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotRequest;
99  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
100 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableRequest;
101 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
102 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableRequest;
103 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
104 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsRequest;
105 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
106 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorRequest;
107 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
108 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableRequest;
109 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
110 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest;
111 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse;
112 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusRequest;
113 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusResponse;
114 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsRequest;
115 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
116 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorRequest;
117 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorResponse;
118 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusRequest;
119 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusResponse;
120 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
121 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
122 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
123 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesResponse;
124 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsBalancerEnabledRequest;
125 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsBalancerEnabledResponse;
126 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledRequest;
127 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
128 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
129 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
130 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest;
131 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse;
132 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest;
133 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
134 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneRequest;
135 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
136 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsRequest;
137 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
138 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
139 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
140 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
141 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
142 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
143 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnRequest;
144 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
145 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceRequest;
146 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
147 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableRequest;
148 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
149 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionRequest;
150 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
151 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionRequest;
152 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
153 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotRequest;
154 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
155 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanRequest;
156 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
157 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SecurityCapabilitiesRequest;
158 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SecurityCapabilitiesResponse;
159 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
160 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
161 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotRequest;
162 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
163 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownRequest;
164 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
165 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterRequest;
166 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
167 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableRequest;
168 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableResponse;
169 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionRequest;
170 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
171 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
172 import org.apache.hadoop.hbase.security.User;
173 import org.apache.hadoop.hbase.security.UserProvider;
174 import org.apache.hadoop.hbase.util.Bytes;
175 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
176 import org.apache.hadoop.hbase.util.ExceptionUtil;
177 import org.apache.hadoop.hbase.util.Threads;
178 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
179 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
180 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
181 import org.apache.hadoop.ipc.RemoteException;
182 import org.apache.zookeeper.KeeperException;
183 
184 import com.google.common.annotations.VisibleForTesting;
185 import com.google.protobuf.BlockingRpcChannel;
186 import com.google.protobuf.RpcController;
187 import com.google.protobuf.ServiceException;
188 
189 /**
190  * A non-instantiable class that manages creation of {@link HConnection}s.
191  * <p>The simplest way to use this class is by using {@link #createConnection(Configuration)}.
192  * This creates a new {@link HConnection} to the cluster that is managed by the caller.
193  * From this {@link HConnection} {@link HTableInterface} implementations are retrieved
194  * with {@link HConnection#getTable(byte[])}. Example:
195  * <pre>
196  * {@code
197  * HConnection connection = HConnectionManager.createConnection(config);
198  * HTableInterface table = connection.getTable("table1");
199  * try {
200  *   // Use the table as needed, for a single operation and a single thread
201  * } finally {
202  *   table.close();
203  *   connection.close();
204  * }
205  * }</pre>
206  * <p>This class has a static Map of {@link HConnection} instances keyed by
207  * {@link HConnectionKey}; a {@link HConnectionKey} is identified by a set of
208  * {@link Configuration} properties. Invocations of {@link #getConnection(Configuration)}
209  * that pass the same {@link Configuration} instance will return the same
210  * {@link HConnection} instance ONLY WHEN the set of properties is the same
211  * (i.e. if you change properties in your {@link Configuration} instance, such as the RPC timeout
212  * or the codec used, HBase will create a new {@link HConnection} instance. For more details on
213  * how this is done see {@link HConnectionKey}).
214  * <p>Sharing {@link HConnection} instances is usually what you want; all clients
215  * of the {@link HConnection} instances share the HConnections' cache of Region
216  * locations rather than each having to discover for itself the location of meta, etc.
217  * But sharing connections makes clean up of {@link HConnection} instances a little awkward.
218  * Currently, clients clean up by calling {@link #deleteConnection(Configuration)}. This will
219  * shut down the zookeeper connection the HConnection was using and clean up all
220  * HConnection resources as well as stopping proxies to servers out on the
221  * cluster. Not running the cleanup will not end the world; it'll
222  * just stall the closeup some and spew some zookeeper connection failed
223  * messages into the log.  Running the cleanup on a {@link HConnection} that is
224  * subsequently used by another will cause breakage so be careful running
225  * cleanup.
226  * <p>To create a {@link HConnection} that is not shared by others, you can
227  * set property "hbase.client.instance.id" to a unique value for your {@link Configuration}
228  * instance, like the following:
229  * <pre>
230  * {@code
231  * conf.set("hbase.client.instance.id", "12345");
232  * HConnection connection = HConnectionManager.getConnection(conf);
233  * // Use the connection to your heart's delight and then when done...
235  * HConnectionManager.deleteConnection(conf);
236  * }
237  * </pre>
238  * <p>Cleanup used to be done inside a shutdown hook.  On startup we'd
239  * register a shutdown hook that called {@link #deleteAllConnections()}
240  * on its way out, but the order in which shutdown hooks run is not defined,
241  * which was problematic for clients of HConnection that wanted to register
242  * their own shutdown hooks, so we removed ours; this shifts the onus for
243  * cleanup to the client.
244  */
245 @SuppressWarnings("serial")
246 @InterfaceAudience.Public
247 @InterfaceStability.Evolving
248 public class HConnectionManager {
249   static final Log LOG = LogFactory.getLog(HConnectionManager.class);
250 
251   public static final String RETRIES_BY_SERVER_KEY = "hbase.client.retries.by.server";
252   private static final String CLIENT_NONCES_ENABLED_KEY = "hbase.client.nonces.enabled";
253 
254   // An LRU Map of HConnectionKey -> HConnection (TableServer).  All
255   // access must be synchronized.  This map is not private because tests
256   // need to be able to tinker with it.
257   static final Map<HConnectionKey, HConnectionImplementation> CONNECTION_INSTANCES;
258 
259   public static final int MAX_CACHED_CONNECTION_INSTANCES;
260 
261   /**
262    * Global nonceGenerator shared per client. Currently there's no reason to limit its scope.
263    * Once it's set under nonceGeneratorCreateLock, it is never unset or changed.
264    */
265   private static volatile NonceGenerator nonceGenerator = null;
266   /** The nonce generator lock. Only taken when creating HConnection, which gets a private copy. */
267   private static Object nonceGeneratorCreateLock = new Object();
268 
269   static {
270     // We set instances to one more than the value specified for {@link
271     // HConstants#ZOOKEEPER_MAX_CLIENT_CNXNS}. By default, zk limits the number of
272     // connections to the ensemble from one client to 30, so in that case we
273     // should run into zk issues before the LRU hits this value of 31.
274     MAX_CACHED_CONNECTION_INSTANCES = HBaseConfiguration.create().getInt(
275       HConstants.ZOOKEEPER_MAX_CLIENT_CNXNS, HConstants.DEFAULT_ZOOKEPER_MAX_CLIENT_CNXNS) + 1;
276     CONNECTION_INSTANCES = new LinkedHashMap<HConnectionKey, HConnectionImplementation>(
277         (int) (MAX_CACHED_CONNECTION_INSTANCES / 0.75F) + 1, 0.75F, true) {
278       @Override
279       protected boolean removeEldestEntry(
280           Map.Entry<HConnectionKey, HConnectionImplementation> eldest) {
281         return size() > MAX_CACHED_CONNECTION_INSTANCES;
282       }
283     };
284   }
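
      // The map above uses the standard access-order LinkedHashMap LRU idiom:
      // pass accessOrder=true to the constructor and evict in removeEldestEntry.
      // A minimal standalone sketch of the same idiom (MAX_ENTRIES is illustrative,
      // not an HBase constant):
      //
      //   Map<String, String> lru = new LinkedHashMap<String, String>(16, 0.75F, true) {
      //     @Override
      //     protected boolean removeEldestEntry(Map.Entry<String, String> eldest) {
      //       return size() > MAX_ENTRIES; // drop the least-recently-accessed entry
      //     }
      //   };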
285 
286   /*
287    * Non-instantiable.
288    */
289   private HConnectionManager() {
290     super();
291   }
292 
293   /**
294    * @param conn The connection for which to replace the generator.
295    * @param cnm Replaces the nonce generator used, for testing.
296    * @return old nonce generator.
297    */
298   @VisibleForTesting
299   public static NonceGenerator injectNonceGeneratorForTesting(
300       HConnection conn, NonceGenerator cnm) {
301     NonceGenerator ng = conn.getNonceGenerator();
302     LOG.warn("Nonce generator is being replaced by test code for " + cnm.getClass().getName());
303     ((HConnectionImplementation)conn).nonceGenerator = cnm;
304     return ng;
305   }
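
      // A sketch of how a test might use the hook above; MyTestNonceGenerator is a
      // hypothetical test double implementing NonceGenerator:
      //
      //   HConnection conn = HConnectionManager.createConnection(conf);
      //   NonceGenerator old =
      //       HConnectionManager.injectNonceGeneratorForTesting(conn, new MyTestNonceGenerator());
      //   try {
      //     // exercise client code that issues nonce-carrying operations
      //   } finally {
      //     HConnectionManager.injectNonceGeneratorForTesting(conn, old); // restore
      //   }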
306 
307   /**
308    * Get the connection that goes with the passed <code>conf</code> configuration instance.
309    * If no current connection exists, method creates a new connection and keys it using
310    * connection-specific properties from the passed {@link Configuration}; see
311    * {@link HConnectionKey}.
312    * @param conf configuration
313    * @return HConnection object for <code>conf</code>
314    * @throws ZooKeeperConnectionException
315    */
316   @Deprecated
317   public static HConnection getConnection(final Configuration conf)
318   throws IOException {
319     HConnectionKey connectionKey = new HConnectionKey(conf);
320     synchronized (CONNECTION_INSTANCES) {
321       HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
322       if (connection == null) {
323         connection = (HConnectionImplementation)createConnection(conf, true);
324         CONNECTION_INSTANCES.put(connectionKey, connection);
325       } else if (connection.isClosed()) {
326         HConnectionManager.deleteConnection(connectionKey, true);
327         connection = (HConnectionImplementation)createConnection(conf, true);
328         CONNECTION_INSTANCES.put(connectionKey, connection);
329       }
330       connection.incCount();
331       return connection;
332     }
333   }
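
      // Connections handed out here are reference-counted. A sketch of the matching
      // acquire/release pattern; deleteConnection decrements the count and only tears
      // the connection down once no users remain:
      //
      //   HConnection shared = HConnectionManager.getConnection(conf); // count + 1
      //   try {
      //     // use the shared connection
      //   } finally {
      //     HConnectionManager.deleteConnection(conf);                 // count - 1
      //   }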
334 
335   /**
336    * Create a new HConnection instance using the passed <code>conf</code> instance.
337    * <p>Note: This bypasses the usual HConnection life cycle management done by
338    * {@link #getConnection(Configuration)}. The caller is responsible for
339    * calling {@link HConnection#close()} on the returned connection instance.
340    *
341    * This is the recommended way to create HConnections.
342    * <pre>{@code
343    * HConnection connection = HConnectionManager.createConnection(conf);
344    * HTableInterface table = connection.getTable("mytable");
345    * table.get(...);
346    * ...
347    * table.close();
348    * connection.close();
349    * }</pre>
350    *
351    * @param conf configuration
352    * @return HConnection object for <code>conf</code>
353    * @throws ZooKeeperConnectionException
354    */
355   public static HConnection createConnection(Configuration conf)
356   throws IOException {
357     UserProvider provider = UserProvider.instantiate(conf);
358     return createConnection(conf, false, null, provider.getCurrent());
359   }
360 
361   /**
362    * Create a new HConnection instance using the passed <code>conf</code> instance.
363    * <p>Note: This bypasses the usual HConnection life cycle management done by
364    * {@link #getConnection(Configuration)}. The caller is responsible for
365    * calling {@link HConnection#close()} on the returned connection instance.
366    * This is the recommended way to create HConnections.
367    * <pre>{@code
368    * ExecutorService pool = ...;
369    * HConnection connection = HConnectionManager.createConnection(conf, pool);
370    * HTableInterface table = connection.getTable("mytable");
371    * table.get(...);
372    * ...
373    * table.close();
374    * connection.close();
375    * }</pre>
376    * @param conf configuration
377    * @param pool the thread pool to use for batch operation in HTables used via this HConnection
378    * @return HConnection object for <code>conf</code>
379    * @throws ZooKeeperConnectionException
380    */
381   public static HConnection createConnection(Configuration conf, ExecutorService pool)
382   throws IOException {
383     UserProvider provider = UserProvider.instantiate(conf);
384     return createConnection(conf, false, pool, provider.getCurrent());
385   }
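
      // When you pass your own pool you also own its lifecycle; the connection only
      // shuts down pools it created itself. A minimal sketch (the pool size of 8 is
      // illustrative only):
      //
      //   ExecutorService pool = java.util.concurrent.Executors.newFixedThreadPool(8);
      //   HConnection connection = HConnectionManager.createConnection(conf, pool);
      //   try {
      //     // use tables obtained via connection.getTable(...)
      //   } finally {
      //     connection.close();
      //     pool.shutdown(); // caller-supplied pools are not shut down by close()
      //   }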
386 
387   /**
388    * Create a new HConnection instance using the passed <code>conf</code> instance.
389    * <p>Note: This bypasses the usual HConnection life cycle management done by
390    * {@link #getConnection(Configuration)}. The caller is responsible for
391    * calling {@link HConnection#close()} on the returned connection instance.
392    * This is the recommended way to create HConnections.
393    * <pre>{@code
394    * User user = ...;
395    * HConnection connection = HConnectionManager.createConnection(conf, user);
396    * HTableInterface table = connection.getTable("mytable");
397    * table.get(...);
398    * ...
399    * table.close();
400    * connection.close();
401    * }</pre>
402    * @param conf configuration
403    * @param user the user the connection is for
404    * @return HConnection object for <code>conf</code>
405    * @throws ZooKeeperConnectionException
406    */
407   public static HConnection createConnection(Configuration conf, User user)
408   throws IOException {
409     return createConnection(conf, false, null, user);
410   }
411 
412   /**
413    * Create a new HConnection instance using the passed <code>conf</code> instance.
414    * <p>Note: This bypasses the usual HConnection life cycle management done by
415    * {@link #getConnection(Configuration)}. The caller is responsible for
416    * calling {@link HConnection#close()} on the returned connection instance.
417    * This is the recommended way to create HConnections.
418    * <pre>{@code
419    * ExecutorService pool = ...;
420    * User user = ...;
421    * HConnection connection = HConnectionManager.createConnection(conf, pool, user);
422    * HTableInterface table = connection.getTable("mytable");
423    * table.get(...);
424    * table.close();
425    * connection.close();
426    * }</pre>
427    * @param conf configuration
428    * @param pool the thread pool to use for batch operation in HTables used via this HConnection
429    * @param user the user the connection is for
430    * @return HConnection object for <code>conf</code>
431    * @throws ZooKeeperConnectionException
432    */
433   public static HConnection createConnection(Configuration conf, ExecutorService pool, User user)
434   throws IOException {
435     return createConnection(conf, false, pool, user);
436   }
437 
438   @Deprecated
439   static HConnection createConnection(final Configuration conf, final boolean managed)
440       throws IOException {
441     UserProvider provider = UserProvider.instantiate(conf);
442     return createConnection(conf, managed, null, provider.getCurrent());
443   }
444 
445   @Deprecated
446   static HConnection createConnection(final Configuration conf, final boolean managed,
447       final ExecutorService pool, final User user)
448   throws IOException {
449     String className = conf.get("hbase.client.connection.impl",
450       HConnectionManager.HConnectionImplementation.class.getName());
451     Class<?> clazz = null;
452     try {
453       clazz = Class.forName(className);
454     } catch (ClassNotFoundException e) {
455       throw new IOException(e);
456     }
457     try {
458       // Default HCM#HCI is not accessible; make it so before invoking.
459       Constructor<?> constructor =
460         clazz.getDeclaredConstructor(Configuration.class,
461           boolean.class, ExecutorService.class, User.class);
462       constructor.setAccessible(true);
463       return (HConnection) constructor.newInstance(conf, managed, pool, user);
464     } catch (Exception e) {
465       throw new IOException(e);
466     }
467   }
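
      // The reflective lookup above makes the connection implementation pluggable
      // through configuration. A sketch; MyConnectionImplementation is hypothetical
      // and must declare the (Configuration, boolean, ExecutorService, User)
      // constructor used above:
      //
      //   conf.set("hbase.client.connection.impl", MyConnectionImplementation.class.getName());
      //   HConnection connection = HConnectionManager.createConnection(conf);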
468 
469   /**
470    * Delete connection information for the instance specified by the passed configuration.
471    * If there are no more references to the designated connection, this method will
472    * then close the connection to the zookeeper ensemble and let go of all associated resources.
473    *
474    * @param conf configuration whose identity is used to find {@link HConnection} instance.
475    * @deprecated
476    */
477   public static void deleteConnection(Configuration conf) {
478     deleteConnection(new HConnectionKey(conf), false);
479   }
480 
481   /**
482    * Cleanup a known stale connection.
483    * This will then close connection to the zookeeper ensemble and let go of all resources.
484    *
485    * @param connection the known-stale connection to delete and close
486    * @deprecated
487    */
488   public static void deleteStaleConnection(HConnection connection) {
489     deleteConnection(connection, true);
490   }
491 
492   /**
493    * Delete information for all connections. Whether each connection is actually closed
494    * depends on the staleConnection boolean and its reference count. In general you
495    * should call this with staleConnection set to true.
496    * @deprecated
497    */
498   public static void deleteAllConnections(boolean staleConnection) {
499     synchronized (CONNECTION_INSTANCES) {
500       Set<HConnectionKey> connectionKeys = new HashSet<HConnectionKey>();
501       connectionKeys.addAll(CONNECTION_INSTANCES.keySet());
502       for (HConnectionKey connectionKey : connectionKeys) {
503         deleteConnection(connectionKey, staleConnection);
504       }
505       CONNECTION_INSTANCES.clear();
506     }
507   }
508 
509   /**
510    * Delete information for all connections.
511    * @deprecated kept for backward compatibility, but the behavior is broken. HBASE-8983
512    */
513   @Deprecated
514   public static void deleteAllConnections() {
515     deleteAllConnections(false);
516   }
517 
518 
519   @Deprecated
520   private static void deleteConnection(HConnection connection, boolean staleConnection) {
521     synchronized (CONNECTION_INSTANCES) {
522       for (Entry<HConnectionKey, HConnectionImplementation> e: CONNECTION_INSTANCES.entrySet()) {
523         if (e.getValue() == connection) {
524           deleteConnection(e.getKey(), staleConnection);
525           break;
526         }
527       }
528     }
529   }
530 
531   @Deprecated
532   private static void deleteConnection(HConnectionKey connectionKey, boolean staleConnection) {
533     synchronized (CONNECTION_INSTANCES) {
534       HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
535       if (connection != null) {
536         connection.decCount();
537         if (connection.isZeroReference() || staleConnection) {
538           CONNECTION_INSTANCES.remove(connectionKey);
539           connection.internalClose();
540         }
541       } else {
542         LOG.error("Connection not found in the list, can't delete it " +
543           "(connection key=" + connectionKey + "). Maybe the key was modified?", new Exception());
544       }
545     }
546   }
547 
548   /**
549    * Provided for unit tests that verify the behavior of the region
550    * location cache prefetch.
551    * @return Number of cached regions for the table.
552    * @throws ZooKeeperConnectionException
553    */
554   static int getCachedRegionCount(Configuration conf, final TableName tableName)
555   throws IOException {
556     return execute(new HConnectable<Integer>(conf) {
557       @Override
558       public Integer connect(HConnection connection) {
559         return ((HConnectionImplementation)connection).getNumberOfCachedRegionLocations(tableName);
560       }
561     });
562   }
563 
564   /**
565    * This convenience method invokes the given {@link HConnectable#connect}
566    * implementation using a {@link HConnection} instance that lasts just for the
567    * duration of the invocation.
568    *
569    * @param <T> the return type of the connect method
570    * @param connectable the {@link HConnectable} instance
571    * @return the value returned by the connect method
572    * @throws IOException
573    */
574   @InterfaceAudience.Private
575   public static <T> T execute(HConnectable<T> connectable) throws IOException {
576     if (connectable == null || connectable.conf == null) {
577       return null;
578     }
579     Configuration conf = connectable.conf;
580     HConnection connection = HConnectionManager.getConnection(conf);
581     boolean connectSucceeded = false;
582     try {
583       T returnValue = connectable.connect(connection);
584       connectSucceeded = true;
585       return returnValue;
586     } finally {
587       try {
588         connection.close();
589       } catch (Exception e) {
590         ExceptionUtil.rethrowIfInterrupt(e);
591         if (connectSucceeded) {
592           throw new IOException("The connection to " + connection
593               + " could not be deleted.", e);
594         }
595       }
596     }
597   }
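
      // A sketch of using execute(...) with a short-lived connection; the anonymous
      // HConnectable supplies the work to run while the connection is open ("t1" is
      // an illustrative table name):
      //
      //   List<HRegionLocation> locs = HConnectionManager.execute(
      //       new HConnectable<List<HRegionLocation>>(conf) {
      //         @Override
      //         public List<HRegionLocation> connect(HConnection connection) throws IOException {
      //           return connection.locateRegions(TableName.valueOf("t1"));
      //         }
      //       });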
598 
599   /** Encapsulates connection to zookeeper and regionservers. */
600   @InterfaceAudience.Private
601   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
602       value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
603       justification="Access to the concurrent hash map is under a lock so should be fine.")
604   public static class HConnectionImplementation implements HConnection, Closeable {
605     static final Log LOG = LogFactory.getLog(HConnectionImplementation.class);
606     private final long pause;
607     private final int numTries;
608     final int rpcTimeout;
609     private NonceGenerator nonceGenerator = null;
610     private final boolean usePrefetch;
611     private final int prefetchRegionLimit;
612 
613     private volatile boolean closed;
614     private volatile boolean aborted;
615 
616     // package protected for the tests
617     ClusterStatusListener clusterStatusListener;
618 
619     private final Object userRegionLock = new Object();
620     private final Object metaRegionLock = new Object();
621 
622     // We have a single lock for master & zk to prevent deadlocks. Having
623     //  one lock for ZK and one lock for master is not possible:
624     //  When creating a connection to master, we need a connection to ZK to get
625     //  its address. But another thread could have taken the ZK lock, and could
626     //  be waiting for the master lock => deadlock.
627     private final Object masterAndZKLock = new Object();
628 
629     private long keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
630     private final DelayedClosing delayedClosing =
631       DelayedClosing.createAndStart(this);
632 
633     // thread executor shared by all HTableInterface instances created
634     // by this connection
635     private volatile ExecutorService batchPool = null;
636     private volatile boolean cleanupPool = false;
637 
638     private final Configuration conf;
639 
640     // Cache the configuration values for tables so that we can avoid calling
641     // into the expensive Configuration object to fetch the values multiple times.
642     private final TableConfiguration tableConfig;
643 
644     // Client rpc instance.
645     private RpcClient rpcClient;
646 
647     private final MetricsConnection metrics;
648 
649     /**
650      * Map of table to that table's {@link HRegionLocation}s.
651      */
652     private final ConcurrentMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>
653         cachedRegionLocations =
654       new ConcurrentHashMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>();
655 
656     // The presence of a server in the map implies it's likely that there is an
657     // entry in cachedRegionLocations that maps to this server; but the absence
658     // of a server in this map guarantees that there is no entry in the cache that
659     // maps to the absent server.
660     // Access to this attribute must be protected by a lock on cachedRegionLocations.
661     private final Set<ServerName> cachedServers = new ConcurrentSkipListSet<ServerName>();
662 
663     // Region cache prefetch is enabled by default. This set contains all
664     // tables whose region cache prefetch is disabled.
665     private final Set<Integer> regionCachePrefetchDisabledTables =
666       new CopyOnWriteArraySet<Integer>();
667 
668     private int refCount;
669 
670     // indicates whether this connection's life cycle is managed (by us)
671     private boolean managed;
672 
673     protected User user;
674 
675     private RpcRetryingCallerFactory rpcCallerFactory;
676 
677     private RpcControllerFactory rpcControllerFactory;
678 
679     // single tracker per connection
680     private final ServerStatisticTracker stats;
681 
682     private final ClientBackoffPolicy backoffPolicy;
683 
684     /**
685      * Cluster registry of basic info such as cluster id and meta region location.
686      */
687     Registry registry;
688 
689     HConnectionImplementation(Configuration conf, boolean managed) throws IOException {
690       this(conf, managed, null, null);
691     }
692 
693     /**
694      * Constructor.
695      * @param conf Configuration object
696      * @param managed If true, does not do full shutdown on close; i.e. cleanup of connection
697      * to zk and shutdown of all services; we just close down the resources this connection was
698      * responsible for and decrement usage counters.  It is up to the caller to do the full
699      * cleanup.  It is set when we want connection sharing going on -- reuse of the zk connection,
700      * cached region locations, established regionserver connections, etc.  When connections
701      * are shared, we do reference counting and only do full cleanup when there are no more
702      * users of an HConnectionImplementation instance.
703      */
704     HConnectionImplementation(Configuration conf, boolean managed,
705         ExecutorService pool, User user) throws IOException {
706       this(conf);
707       this.user = user;
708       this.batchPool = pool;
709       this.managed = managed;
710       this.registry = setupRegistry();
711       retrieveClusterId();
712 
713       this.rpcClient = new RpcClient(this.conf, this.clusterId, this.metrics);
714 
715       // Do we publish the status?
716       boolean shouldListen = conf.getBoolean(HConstants.STATUS_PUBLISHED,
717           HConstants.STATUS_PUBLISHED_DEFAULT);
718       Class<? extends ClusterStatusListener.Listener> listenerClass =
719           conf.getClass(ClusterStatusListener.STATUS_LISTENER_CLASS,
720               ClusterStatusListener.DEFAULT_STATUS_LISTENER_CLASS,
721               ClusterStatusListener.Listener.class);
722       if (shouldListen) {
723         if (listenerClass == null) {
724           LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
725               ClusterStatusListener.STATUS_LISTENER_CLASS + " is not set - not listening for status");
726         } else {
727           clusterStatusListener = new ClusterStatusListener(
728               new ClusterStatusListener.DeadServerHandler() {
729                 @Override
730                 public void newDead(ServerName sn) {
731                   clearCaches(sn);
732                   rpcClient.cancelConnections(sn.getHostname(), sn.getPort(),
733                       new SocketException(sn.getServerName() +
734                           " is dead: closing its connection."));
735                 }
736               }, conf, listenerClass);
737         }
738       }
739 
740       this.rpcCallerFactory = RpcRetryingCallerFactory.instantiate(conf, this.stats);
741       this.rpcControllerFactory = RpcControllerFactory.instantiate(conf);
742     }
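
        // Status publishing above is configuration-driven. A sketch of enabling the
        // dead-server listener before creating the connection, using the same
        // constants the constructor reads:
        //
        //   conf.setBoolean(HConstants.STATUS_PUBLISHED, true);
        //   conf.setClass(ClusterStatusListener.STATUS_LISTENER_CLASS,
        //       ClusterStatusListener.DEFAULT_STATUS_LISTENER_CLASS,
        //       ClusterStatusListener.Listener.class);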
743 
744     /** Dummy nonce generator for disabled nonces. */
745     private static class NoNonceGenerator implements NonceGenerator {
746       @Override
747       public long getNonceGroup() {
748         return HConstants.NO_NONCE;
749       }
750       @Override
751       public long newNonce() {
752         return HConstants.NO_NONCE;
753       }
754     }
755 
756     /**
757      * For tests.
758      */
759     protected HConnectionImplementation(Configuration conf) {
760       this.conf = conf;
761       this.tableConfig = new TableConfiguration(conf);
762       this.closed = false;
763       this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
764           HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
765       this.numTries = tableConfig.getRetriesNumber();
766       this.rpcTimeout = conf.getInt(
767           HConstants.HBASE_RPC_TIMEOUT_KEY,
768           HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
769       if (conf.getBoolean(CLIENT_NONCES_ENABLED_KEY, true)) {
770         synchronized (HConnectionManager.nonceGeneratorCreateLock) {
771           if (HConnectionManager.nonceGenerator == null) {
772             HConnectionManager.nonceGenerator = new PerClientRandomNonceGenerator();
773           }
774           this.nonceGenerator = HConnectionManager.nonceGenerator;
775         }
776       } else {
777         this.nonceGenerator = new NoNonceGenerator();
778       }
779 
780       this.stats = ServerStatisticTracker.create(conf);
781       this.usePrefetch = conf.getBoolean(HConstants.HBASE_CLIENT_PREFETCH,
782           HConstants.DEFAULT_HBASE_CLIENT_PREFETCH);
783       this.prefetchRegionLimit = conf.getInt(
784           HConstants.HBASE_CLIENT_PREFETCH_LIMIT,
785           HConstants.DEFAULT_HBASE_CLIENT_PREFETCH_LIMIT);
786       this.rpcControllerFactory = RpcControllerFactory.instantiate(conf);
787       this.rpcCallerFactory = RpcRetryingCallerFactory.instantiate(conf, this.stats);
788       this.backoffPolicy = ClientBackoffPolicyFactory.create(conf);
789       if (conf.getBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, false)) {
790         this.metrics = new MetricsConnection(this);
791       } else {
792         this.metrics = null;
793       }
794     }
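
        // Client-side metrics stay off unless enabled in the configuration. A sketch
        // of turning them on before the connection is built, using the statically
        // imported MetricsConnection key:
        //
        //   Configuration conf = HBaseConfiguration.create();
        //   conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true);
        //   HConnection connection = HConnectionManager.createConnection(conf);
        //   MetricsConnection metrics = connection.getConnectionMetrics(); // non-null now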
795 
796     @Override
797     public HTableInterface getTable(String tableName) throws IOException {
798       return getTable(TableName.valueOf(tableName));
799     }
800 
801     @Override
802     public HTableInterface getTable(byte[] tableName) throws IOException {
803       return getTable(TableName.valueOf(tableName));
804     }
805 
806     @Override
807     public HTableInterface getTable(TableName tableName) throws IOException {
808       return getTable(tableName, getBatchPool());
809     }
810 
811     @Override
812     public HTableInterface getTable(String tableName, ExecutorService pool) throws IOException {
813       return getTable(TableName.valueOf(tableName), pool);
814     }
815 
816     @Override
817     public HTableInterface getTable(byte[] tableName, ExecutorService pool) throws IOException {
818       return getTable(TableName.valueOf(tableName), pool);
819     }
820 
821     @Override
822     public HTableInterface getTable(TableName tableName, ExecutorService pool) throws IOException {
823       if (managed) {
824         throw new IOException("The connection has to be unmanaged.");
825       }
826       return new HTable(tableName, this, tableConfig, rpcCallerFactory, rpcControllerFactory,
827         pool);
828     }
829 
830     @Override
831     public MetricsConnection getConnectionMetrics() {
832       return this.metrics;
833     }
834 
835     private ExecutorService getBatchPool() {
836       if (batchPool == null) {
837         // shared HTable thread executor not yet initialized
838         synchronized (this) {
839           if (batchPool == null) {
840             int maxThreads = conf.getInt("hbase.hconnection.threads.max", 256);
841             int coreThreads = conf.getInt("hbase.hconnection.threads.core", 256);
842             if (maxThreads == 0) {
843               maxThreads = Runtime.getRuntime().availableProcessors() * 8;
844             }
845             if (coreThreads == 0) {
846               coreThreads = Runtime.getRuntime().availableProcessors() * 8;
847             }
848             long keepAliveTime = conf.getLong("hbase.hconnection.threads.keepalivetime", 60);
849             LinkedBlockingQueue<Runnable> workQueue =
850               new LinkedBlockingQueue<Runnable>(maxThreads *
851                 conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
852                   HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
853             ThreadPoolExecutor tpe = new ThreadPoolExecutor(
854                 coreThreads,
855                 maxThreads,
856                 keepAliveTime,
857                 TimeUnit.SECONDS,
858                 workQueue,
859                 Threads.newDaemonThreadFactory(toString() + "-shared-"));
860             tpe.allowCoreThreadTimeOut(true);
861             this.batchPool = tpe;
862           }
863           this.cleanupPool = true;
864         }
865       }
866       return this.batchPool;
867     }
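
        // The shared pool above is sized entirely from configuration. A sketch of
        // tuning it with the keys read in getBatchPool (the values are illustrative,
        // not recommendations):
        //
        //   conf.setInt("hbase.hconnection.threads.max", 64);
        //   conf.setInt("hbase.hconnection.threads.core", 8);
        //   conf.setLong("hbase.hconnection.threads.keepalivetime", 30); // seconds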
868 
869     protected ExecutorService getCurrentBatchPool() {
870       return batchPool;
871     }
872 
873     private void shutdownBatchPool() {
874       if (this.cleanupPool && this.batchPool != null && !this.batchPool.isShutdown()) {
875         this.batchPool.shutdown();
876         try {
877           if (!this.batchPool.awaitTermination(10, TimeUnit.SECONDS)) {
878             this.batchPool.shutdownNow();
879           }
880         } catch (InterruptedException e) {
881           this.batchPool.shutdownNow();
882         }
883       }
884     }
885 
886     /**
887      * @return The cluster registry implementation to use.
888      * @throws IOException
889      */
890     private Registry setupRegistry() throws IOException {
891       String registryClass = this.conf.get("hbase.client.registry.impl",
892         ZooKeeperRegistry.class.getName());
893       Registry registry = null;
894       try {
895         registry = (Registry)Class.forName(registryClass).newInstance();
896       } catch (Throwable t) {
897         throw new IOException(t);
898       }
899       registry.init(this);
900       return registry;
901     }
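
        // Like the connection implementation, the registry is pluggable. A sketch;
        // MyRegistry is hypothetical and must implement Registry with a no-arg
        // constructor, as required by the reflective instantiation above:
        //
        //   conf.set("hbase.client.registry.impl", MyRegistry.class.getName());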
902 
903     /**
904      * For tests only.
905      * @param rpcClient Client we should use instead.
906      * @return Previous rpcClient
907      */
908     RpcClient setRpcClient(final RpcClient rpcClient) {
909       RpcClient oldRpcClient = this.rpcClient;
910       this.rpcClient = rpcClient;
911       return oldRpcClient;
912     }
913 
914     /**
915      * An identifier that will remain the same for a given connection instance.
916      * @return a stable identifier for this connection
917      */
918     public String toString() {
919       return "hconnection-0x" + Integer.toHexString(hashCode());
920     }
921 
922     protected String clusterId = null;
923 
924     void retrieveClusterId() {
925       if (clusterId != null) return;
926       this.clusterId = this.registry.getClusterId();
927       if (clusterId == null) {
928         clusterId = HConstants.CLUSTER_ID_DEFAULT;
929         LOG.debug("clusterid came back null, using default " + clusterId);
930       }
931     }
932 
933     @Override
934     public Configuration getConfiguration() {
935       return this.conf;
936     }
937 
938     private void checkIfBaseNodeAvailable(ZooKeeperWatcher zkw)
939       throws MasterNotRunningException {
940       String errorMsg;
941       try {
942         if (ZKUtil.checkExists(zkw, zkw.baseZNode) == -1) {
943           errorMsg = "The node " + zkw.baseZNode + " is not in ZooKeeper. "
944             + "It should have been written by the master. "
945             + "Check the value configured in 'zookeeper.znode.parent'. "
946             + "There could be a mismatch with the one configured in the master.";
947           LOG.error(errorMsg);
948           throw new MasterNotRunningException(errorMsg);
949         }
950       } catch (KeeperException e) {
951         errorMsg = "Can't get connection to ZooKeeper: " + e.getMessage();
952         LOG.error(errorMsg);
953         throw new MasterNotRunningException(errorMsg, e);
954       }
955     }
956 
957     /**
958      * @return true if the master is running, throws an exception otherwise
959      * @throws MasterNotRunningException - if the master is not running
960      * @throws ZooKeeperConnectionException
961      */
962     @Override
963     public boolean isMasterRunning()
964     throws MasterNotRunningException, ZooKeeperConnectionException {
965       // When getting the master connection, we check it's running,
966       // so if there is no exception, it means we've been able to get a
967       // connection on a running master
968       MasterKeepAliveConnection m = getKeepAliveMasterService();
969       m.close();
970       return true;
971     }
972 
973     @Override
974     public HRegionLocation getRegionLocation(final TableName tableName,
975         final byte [] row, boolean reload)
976     throws IOException {
977       return reload ? relocateRegion(tableName, row) : locateRegion(tableName, row);
978     }
979 
980     @Override
981     public HRegionLocation getRegionLocation(final byte[] tableName,
982         final byte [] row, boolean reload)
983     throws IOException {
984       return getRegionLocation(TableName.valueOf(tableName), row, reload);
985     }
986 
987     @Override
988     public boolean isTableEnabled(TableName tableName) throws IOException {
989       return this.registry.isTableOnlineState(tableName, true);
990     }
991 
992     @Override
993     public boolean isTableEnabled(byte[] tableName) throws IOException {
994       return isTableEnabled(TableName.valueOf(tableName));
995     }
996 
997     @Override
998     public boolean isTableDisabled(TableName tableName) throws IOException {
999       return this.registry.isTableOnlineState(tableName, false);
1000     }
1001 
1002     @Override
1003     public boolean isTableDisabled(byte[] tableName) throws IOException {
1004       return isTableDisabled(TableName.valueOf(tableName));
1005     }
1006 
1007     @Override
1008     public boolean isTableAvailable(final TableName tableName) throws IOException {
1009       final AtomicBoolean available = new AtomicBoolean(true);
1010       final AtomicInteger regionCount = new AtomicInteger(0);
1011       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
1012         @Override
1013         public boolean processRow(Result row) throws IOException {
1014           HRegionInfo info = MetaScanner.getHRegionInfo(row);
1015           if (info != null && !info.isSplitParent()) {
1016             if (tableName.equals(info.getTable())) {
1017               ServerName server = HRegionInfo.getServerName(row);
1018               if (server == null) {
1019                 available.set(false);
1020                 return false;
1021               }
1022               regionCount.incrementAndGet();
1023             } else if (tableName.compareTo(info.getTable()) < 0) {
1024               // Return if we are done with the current table
1025               return false;
1026             }
1027           }
1028           return true;
1029         }
1030       };
1031       MetaScanner.metaScan(conf, this, visitor, tableName);
1032       return available.get() && (regionCount.get() > 0);
1033     }
1034 
1035     @Override
1036     public boolean isTableAvailable(final byte[] tableName) throws IOException {
1037       return isTableAvailable(TableName.valueOf(tableName));
1038     }
1039 
1040     @Override
1041     public boolean isTableAvailable(final TableName tableName, final byte[][] splitKeys)
1042         throws IOException {
1043       final AtomicBoolean available = new AtomicBoolean(true);
1044       final AtomicInteger regionCount = new AtomicInteger(0);
1045       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
1046         @Override
1047         public boolean processRow(Result row) throws IOException {
1048           HRegionInfo info = MetaScanner.getHRegionInfo(row);
1049           if (info != null && !info.isSplitParent()) {
1050             if (tableName.equals(info.getTable())) {
1051               ServerName server = HRegionInfo.getServerName(row);
1052               if (server == null) {
1053                 available.set(false);
1054                 return false;
1055               }
1056               if (!Bytes.equals(info.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
1057                 for (byte[] splitKey : splitKeys) {
1058                   // Just check if the splitkey is available
1059                   if (Bytes.equals(info.getStartKey(), splitKey)) {
1060                     regionCount.incrementAndGet();
1061                     break;
1062                   }
1063                 }
1064               } else {
1065                 // Always empty start row should be counted
1066                 regionCount.incrementAndGet();
1067               }
1068             } else if (tableName.compareTo(info.getTable()) < 0) {
1069               // Return if we are done with the current table
1070               return false;
1071             }
1072           }
1073           return true;
1074         }
1075       };
1076       MetaScanner.metaScan(conf, this, visitor, tableName);
1077       // +1 needs to be added so that the empty start row is also taken into account
1078       return available.get() && (regionCount.get() == splitKeys.length + 1);
1079     }
1080 
1081     @Override
1082     public boolean isTableAvailable(final byte[] tableName, final byte[][] splitKeys)
1083         throws IOException {
1084       return isTableAvailable(TableName.valueOf(tableName), splitKeys);
1085     }
1086 
1087     @Override
1088     public HRegionLocation locateRegion(final byte[] regionName) throws IOException {
1089       return locateRegion(HRegionInfo.getTable(regionName),
1090           HRegionInfo.getStartKey(regionName), false, true);
1091     }
1092 
1093     @Override
1094     public boolean isDeadServer(ServerName sn) {
1095       if (clusterStatusListener == null) {
1096         return false;
1097       } else {
1098         return clusterStatusListener.isDeadServer(sn);
1099       }
1100     }
1101 
1102     @Override
1103     public List<HRegionLocation> locateRegions(final TableName tableName)
1104     throws IOException {
1105       return locateRegions(tableName, false, true);
1106     }
1107 
1108     @Override
1109     public List<HRegionLocation> locateRegions(final byte[] tableName)
1110     throws IOException {
1111       return locateRegions(TableName.valueOf(tableName));
1112     }
1113 
1114     @Override
1115     public List<HRegionLocation> locateRegions(final TableName tableName,
1116         final boolean useCache, final boolean offlined) throws IOException {
1117       NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(conf, this,
1118           tableName, offlined);
1119       final List<HRegionLocation> locations = new ArrayList<HRegionLocation>();
1120       for (HRegionInfo regionInfo : regions.keySet()) {
1121         locations.add(locateRegion(tableName, regionInfo.getStartKey(), useCache, true));
1122       }
1123       return locations;
1124     }
1125 
1126     @Override
1127     public List<HRegionLocation> locateRegions(final byte[] tableName,
1128        final boolean useCache, final boolean offlined) throws IOException {
1129       return locateRegions(TableName.valueOf(tableName), useCache, offlined);
1130     }
1131 
1132     @Override
1133     public HRegionLocation locateRegion(final TableName tableName,
1134         final byte [] row)
1135     throws IOException {
1136       return locateRegion(tableName, row, true, true);
1137     }
1138 
1139     @Override
1140     public HRegionLocation locateRegion(final byte[] tableName,
1141         final byte [] row)
1142     throws IOException {
1143       return locateRegion(TableName.valueOf(tableName), row);
1144     }
1145 
1146     @Override
1147     public HRegionLocation relocateRegion(final TableName tableName,
1148         final byte [] row) throws IOException{
1149       // Since this is an explicit request not to use any caching, finding
1150       // regions of a disabled table is not desirable.  This check ensures that an
1151       // exception is thrown the first time a disabled table is interacted with.
1152       if (isTableDisabled(tableName)) {
1153         throw new TableNotEnabledException(tableName.getNameAsString() + " is disabled.");
1154       }
1155 
1156       return locateRegion(tableName, row, false, true);
1157     }
1158 
1159     @Override
1160     public HRegionLocation relocateRegion(final byte[] tableName,
1161         final byte [] row) throws IOException {
1162       return relocateRegion(TableName.valueOf(tableName), row);
1163     }
1164 
1165 
1166     private HRegionLocation locateRegion(final TableName tableName,
1167       final byte [] row, boolean useCache, boolean retry)
1168     throws IOException {
1169       if (this.closed) throw new IOException(toString() + " closed");
1170       if (tableName == null || tableName.getName().length == 0) {
1171         throw new IllegalArgumentException(
1172             "table name cannot be null or zero length");
1173       }
1174 
1175       if (tableName.equals(TableName.META_TABLE_NAME)) {
1176         return locateMeta(tableName, useCache);
1177       } else {
1178         // Region not in the cache - have to go to the meta RS
1179         return locateRegionInMeta(TableName.META_TABLE_NAME, tableName, row,
1180           useCache, userRegionLock, retry);
1181       }
1182     }
1183 
1184     private HRegionLocation locateMeta(final TableName tableName,
1185         boolean useCache) throws IOException {
1186       // HBASE-10785: We cache the location of the META itself, so that we are not overloading
1187       // zookeeper with one request for every region lookup. We cache the META with empty row
1188       // key in MetaCache.
1189       byte[] metaCacheKey = HConstants.EMPTY_START_ROW; // use byte[0] as the row for meta
1190       HRegionLocation location = null;
1191       if (useCache) {
1192         location = getCachedLocation(tableName, metaCacheKey);
1193         if (location != null) {
1194           return location;
1195         }
1196       }
1197 
1198       // only one thread should do the lookup.
1199       synchronized (metaRegionLock) {
1200         // Check the cache again for a hit in case some other thread made the
1201         // same query while we were waiting on the lock.
1202         if (useCache) {
1203           location = getCachedLocation(tableName, metaCacheKey);
1204           if (location != null) {
1205             return location;
1206           }
1207         }
1208 
1209         // Look up from zookeeper
1210         location = this.registry.getMetaRegionLocation();
1211         if (location != null) {
1212           cacheLocation(tableName, null, location);
1213         }
1214       }
1215       return location;
1216     }
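         // Note on the pattern above: check / lock / re-check. Because every meta lookup
         // shares the single cache key HConstants.EMPTY_START_ROW, at most one thread pays
         // the ZooKeeper round trip; the rest find the freshly cached HRegionLocation on
         // their second getCachedLocation call.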
1217 
1218     /*
1219      * Search hbase:meta for the HRegionLocation info that contains the table and
1220      * row we're seeking. It prefetches a certain number of region infos and
1221      * saves them to the global region cache.
1222      */
1223     private void prefetchRegionCache(final TableName tableName,
1224         final byte[] row) {
1225       // Implement a new visitor for MetaScanner, and use it to walk through
1226       // the hbase:meta
1227       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
1228         public boolean processRow(Result result) throws IOException {
1229           try {
1230             HRegionInfo regionInfo = MetaScanner.getHRegionInfo(result);
1231             if (regionInfo == null) {
1232               return true;
1233             }
1234 
1235             // possible we got a region of a different table...
1236             if (!regionInfo.getTable().equals(tableName)) {
1237               return false; // stop scanning
1238             }
1239             if (regionInfo.isOffline()) {
1240               // don't cache offline regions
1241               return true;
1242             }
1243 
1244             ServerName serverName = HRegionInfo.getServerName(result);
1245             if (serverName == null) {
1246               return true; // don't cache it
1247             }
1248             // instantiate the location
1249             long seqNum = HRegionInfo.getSeqNumDuringOpen(result);
1250             HRegionLocation loc = new HRegionLocation(regionInfo, serverName, seqNum);
1251             // cache this meta entry
1252             cacheLocation(tableName, null, loc);
1253             return true;
1254           } catch (RuntimeException e) {
1255             throw new IOException(e);
1256           }
1257         }
1258       };
1259       try {
1260         // pre-fetch a certain number of region infos into the region cache.
1261         MetaScanner.metaScan(conf, this, visitor, tableName, row,
1262             this.prefetchRegionLimit, TableName.META_TABLE_NAME);
1263       } catch (IOException e) {
1264         if (ExceptionUtil.isInterrupt(e)) {
1265           Thread.currentThread().interrupt();
1266         }
1267       }
1268     }
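         // The early exit in the visitor above relies on hbase:meta rows being sorted by
         // region name, i.e. "<tableName>,<startKey>,<regionId>". Once a row for a
         // different table appears, no later row can belong to tableName, so returning
         // false stops the scan.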
1269 
1270     /*
1271      * Search the hbase:meta table for the HRegionLocation
1272      * info that contains the table and row we're seeking.
1273      */
1274     private HRegionLocation locateRegionInMeta(final TableName parentTable,
1275       final TableName tableName, final byte [] row, boolean useCache,
1276       Object regionLockObject, boolean retry)
1277     throws IOException {
1278       HRegionLocation location;
1279       // If we are supposed to be using the cache, look in the cache to see if
1280       // we already have the region.
1281       if (useCache) {
1282         location = getCachedLocation(tableName, row);
1283         if (location != null) {
1284           return location;
1285         }
1286       }
1287       int localNumRetries = retry ? numTries : 1;
1288       // build the key of the meta region we should be looking for.
1289       // the extra 9's on the end are necessary to allow "exact" matches
1290       // without knowing the precise region names.
1291       byte [] metaKey = HRegionInfo.createRegionName(tableName, row,
1292         HConstants.NINES, false);
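           // Worked example (hypothetical names): for table "t1" and row "abc" the probe key
           // is "t1,abc,99999999999999" (HConstants.NINES). A closest-row-at-or-before lookup
           // on that key lands on the hbase:meta row of the region whose start key is the
           // greatest one sorting <= "abc", i.e. the region that contains the row.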
1293       for (int tries = 0; true; tries++) {
1294         if (tries >= localNumRetries) {
1295           throw new NoServerForRegionException("Unable to find region for "
1296             + Bytes.toStringBinary(row) + " after " + localNumRetries + " tries.");
1297         }
1298 
1299         HRegionLocation metaLocation = null;
1300         try {
1301           // locate the meta region
1302           metaLocation = locateRegion(parentTable, metaKey, true, false);
1303           // If null still, go around again.
1304           if (metaLocation == null) continue;
1305           ClientService.BlockingInterface service = getClient(metaLocation.getServerName());
1306 
1307           Result regionInfoRow;
1308           // This block guards against two threads trying to load the meta
1309           // region at the same time. The first will load the meta region and
1310           // the second will use the value that the first one found.
1311           if (useCache) {
1312             if (TableName.META_TABLE_NAME.equals(parentTable) && usePrefetch &&
1313                 getRegionCachePrefetch(tableName)) {
1314               synchronized (regionLockObject) {
1315                 // Check the cache again for a hit in case some other thread made the
1316                 // same query while we were waiting on the lock.
1317                 location = getCachedLocation(tableName, row);
1318                 if (location != null) {
1319                   return location;
1320                 }
1321                 // If the parent table is META, we may want to pre-fetch some
1322                 // region info into the global region cache for this table.
1323                 prefetchRegionCache(tableName, row);
1324               }
1325             }
1326             location = getCachedLocation(tableName, row);
1327             if (location != null) {
1328               return location;
1329             }
1330           } else {
1331             // If we are not supposed to be using the cache, delete any existing cached location
1332             // so it won't interfere.
1333             forceDeleteCachedLocation(tableName, row);
1334           }
1335 
1336           // Query the meta region for the location of the region containing our row
1337           regionInfoRow =
1338               ProtobufUtil.getRowOrBefore(service, metaLocation.getRegionInfo().getRegionName(),
1339                 metaKey, HConstants.CATALOG_FAMILY);
1340 
1341           if (regionInfoRow == null) {
1342             throw new TableNotFoundException(tableName);
1343           }
1344 
1345           // convert the row result into the HRegionLocation we need!
1346           HRegionInfo regionInfo = MetaScanner.getHRegionInfo(regionInfoRow);
1347           if (regionInfo == null) {
1348             throw new IOException("HRegionInfo was null or empty in " +
1349               parentTable + ", row=" + regionInfoRow);
1350           }
1351 
1352           // possible we got a region of a different table...
1353           if (!regionInfo.getTable().equals(tableName)) {
1354             throw new TableNotFoundException(
1355                   "Table '" + tableName + "' was not found, got: " +
1356                   regionInfo.getTable() + ".");
1357           }
1358           if (regionInfo.isSplit()) {
1359             throw new RegionOfflineException("the only available region for" +
1360               " the required row is a split parent," +
1361               " the daughters should be online soon: " +
1362               regionInfo.getRegionNameAsString());
1363           }
1364           if (regionInfo.isOffline()) {
1365             throw new RegionOfflineException("the region is offline, could" +
1366               " be caused by a disable table call: " +
1367               regionInfo.getRegionNameAsString());
1368           }
1369 
1370           ServerName serverName = HRegionInfo.getServerName(regionInfoRow);
1371           if (serverName == null) {
1372             throw new NoServerForRegionException("No server address listed " +
1373               "in " + parentTable + " for region " +
1374               regionInfo.getRegionNameAsString() + " containing row " +
1375               Bytes.toStringBinary(row));
1376           }
1377 
1378           if (isDeadServer(serverName)){
1379             throw new RegionServerStoppedException("hbase:meta says the region "+
1380                 regionInfo.getRegionNameAsString()+" is managed by the server " + serverName +
1381                 ", but it is dead.");
1382           }
1383 
1384           // Instantiate the location
1385           location = new HRegionLocation(regionInfo, serverName,
1386             HRegionInfo.getSeqNumDuringOpen(regionInfoRow));
1387           cacheLocation(tableName, null, location);
1388           return location;
1389         } catch (TableNotFoundException e) {
1390           // if we got this error, probably means the table just plain doesn't
1391           // exist. rethrow the error immediately. this should always be coming
1392           // from the HTable constructor.
1393           throw e;
1394         } catch (IOException e) {
1395           ExceptionUtil.rethrowIfInterrupt(e);
1396 
1397           if (e instanceof RemoteException) {
1398             e = ((RemoteException)e).unwrapRemoteException();
1399           }
1400           if (tries < numTries - 1) {
1401             if (LOG.isDebugEnabled()) {
1402               LOG.debug("locateRegionInMeta parentTable=" +
1403                 parentTable + ", metaLocation=" +
1404                 ((metaLocation == null)? "null": "{" + metaLocation + "}") +
1405                 ", attempt=" + tries + " of " +
1406                 this.numTries + " failed; retrying after sleep of " +
1407                 ConnectionUtils.getPauseTime(this.pause, tries) + " because: " + e.getMessage());
1408             }
1409           } else {
1410             throw e;
1411           }
1412           // Only relocate the parent region if necessary
1413           if(!(e instanceof RegionOfflineException ||
1414               e instanceof NoServerForRegionException)) {
1415             relocateRegion(parentTable, metaKey);
1416           }
1417         }
1418         try{
1419           Thread.sleep(ConnectionUtils.getPauseTime(this.pause, tries));
1420         } catch (InterruptedException e) {
1421           throw new InterruptedIOException("Giving up trying to locate region in " +
1422             "meta: thread is interrupted.");
1423         }
1424       }
1425     }
1426 
1427     /*
1428      * Search the cache for a location that fits our table and row key.
1429      * Return null if no suitable region is located.
1430      *
1431      * @param tableName
1432      * @param row
1433      * @return Null or region location found in cache.
1434      */
1435     HRegionLocation getCachedLocation(final TableName tableName,
1436         final byte [] row) {
1437       ConcurrentSkipListMap<byte[], HRegionLocation> tableLocations =
1438         getTableLocations(tableName);
1439 
1440       Entry<byte[], HRegionLocation> e = tableLocations.floorEntry(row);
1441       if (e == null) {
1442         if (metrics != null) metrics.incrMetaCacheMiss();
1443         return null;
1444       }
1445       HRegionLocation possibleRegion = e.getValue();
1446 
1447       // make sure that the end key is greater than the row we're looking
1448       // for, otherwise the row actually belongs in the next region, not
1449       // this one. the exception case is when the endkey is
1450       // HConstants.EMPTY_END_ROW, signifying that the region we're
1451       // checking is actually the last region in the table.
1452       byte[] endKey = possibleRegion.getRegionInfo().getEndKey();
1453       if (Bytes.equals(endKey, HConstants.EMPTY_END_ROW) ||
1454           tableName.getRowComparator().compareRows(
1455               endKey, 0, endKey.length, row, 0, row.length) > 0) {
1456         if (metrics != null) metrics.incrMetaCacheHit();
1457         return possibleRegion;
1458       }
1459 
1460       // Passed all the way through, so we got nothing - complete cache miss
1461       if (metrics != null) metrics.incrMetaCacheMiss();
1462       return null;
1463     }
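         // floorEntry() semantics, by example: with cached regions whose start keys are ""
         // and "m", a lookup for row "k" floors to the "" region and is a hit only because
         // that region's end key "m" sorts after "k"; row "q" floors to the "m" region.
         // A row past a region's end key still gets a floor entry, which is exactly what
         // the end-key check above guards against.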
1464 
1465     /**
1466      * Delete a cached location, no matter what it is. Called when we were told to not use cache.
1467      * @param tableName table name
1468      * @param row row key
1469      */
1470     void forceDeleteCachedLocation(final TableName tableName, final byte [] row) {
1471       HRegionLocation rl = null;
1472       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1473       // start to examine the cache. we can only do cache actions
1474       // if there's something in the cache for this table.
1475       rl = getCachedLocation(tableName, row);
1476       if (rl != null) {
1477         tableLocations.remove(rl.getRegionInfo().getStartKey());
1478       }
1479       if ((rl != null) && LOG.isDebugEnabled()) {
1480         LOG.debug("Removed " + rl.getHostname() + ":" + rl.getPort()
1481           + " as a location of " + rl.getRegionInfo().getRegionNameAsString() +
1482           " for tableName=" + tableName + " from cache");
1483       }
1484     }
1485 
1486     /*
1487      * Delete all cached entries of a table that maps to a specific location.
1488      */
1489     @Override
1490     public void clearCaches(final ServerName serverName) {
1491       if (!this.cachedServers.contains(serverName)) {
1492         return;
1493       }
1494 
1495       boolean deletedSomething = false;
1496       synchronized (this.cachedServers) {
1497         // We block here, because if there is an error on a server, it's likely that multiple
1498         //  threads will get the error simultaneously. If there are hundreds of thousands of
1499         //  region locations to check, it's better to do this only once. A better pattern would
1500         //  be to check if the server is dead when we get the region location.
1501         if (!this.cachedServers.contains(serverName)) {
1502           return;
1503         }
1504         for (Map<byte[], HRegionLocation> tableLocations : cachedRegionLocations.values()) {
1505           for (Entry<byte[], HRegionLocation> e : tableLocations.entrySet()) {
1506             HRegionLocation value = e.getValue();
1507             if (value != null
1508                 && serverName.equals(value.getServerName())) {
1509               tableLocations.remove(e.getKey());
1510               deletedSomething = true;
1511             }
1512           }
1513         }
1514         this.cachedServers.remove(serverName);
1515       }
1516       if (deletedSomething && LOG.isDebugEnabled()) {
1517         LOG.debug("Removed all cached region locations that map to " + serverName);
1518       }
1519     }
1520 
1521     /*
1522      * @param tableName
1523      * @return Map of cached locations for passed <code>tableName</code>
1524      */
1525     private ConcurrentSkipListMap<byte[], HRegionLocation> getTableLocations(
1526         final TableName tableName) {
1527       // find the map of cached locations for this table
1528       ConcurrentSkipListMap<byte[], HRegionLocation> result;
1529       result = this.cachedRegionLocations.get(tableName);
1530       // if tableLocations for this table isn't built yet, make one
1531       if (result == null) {
1532         result = new ConcurrentSkipListMap<byte[], HRegionLocation>(Bytes.BYTES_COMPARATOR);
1533         ConcurrentSkipListMap<byte[], HRegionLocation> old =
1534             this.cachedRegionLocations.putIfAbsent(tableName, result);
1535         if (old != null) {
1536           return old;
1537         }
1538       }
1539       return result;
1540     }
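         // The putIfAbsent dance above is the usual lock-free create-on-demand pattern:
         // a thread that loses the race discards its freshly built map and adopts the
         // winner's, so all threads end up sharing one ConcurrentSkipListMap per table.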
1541 
1542     @Override
1543     public void clearRegionCache() {
1544       this.cachedRegionLocations.clear();
1545       this.cachedServers.clear();
1546     }
1547 
1548     @Override
1549     public void clearRegionCache(final TableName tableName) {
1550       this.cachedRegionLocations.remove(tableName);
1551     }
1552 
1553     @Override
1554     public void clearRegionCache(final byte[] tableName) {
1555       clearRegionCache(TableName.valueOf(tableName));
1556     }
1557 
1558     /**
1559      * Put a newly discovered HRegionLocation into the cache.
1560      * @param tableName The table name.
1561      * @param source the source of the new location, if it's not coming from meta
1562      * @param location the new location
1563      */
1564     private void cacheLocation(final TableName tableName, final HRegionLocation source,
1565         final HRegionLocation location) {
1566       boolean isFromMeta = (source == null);
1567       byte [] startKey = location.getRegionInfo().getStartKey();
1568       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1569       HRegionLocation oldLocation = tableLocations.putIfAbsent(startKey, location);
1570       boolean isNewCacheEntry = (oldLocation == null);
1571       if (isNewCacheEntry) {
1572         cachedServers.add(location.getServerName());
1573         return;
1574       }
1575       boolean updateCache;
1576       // If the server in cache sends us a redirect, assume it's always valid.
1577       if (oldLocation.equals(source)) {
1578         updateCache = true;
1579       } else {
1580         long newLocationSeqNum = location.getSeqNum();
1581         // Meta record is stale - some (probably the same) server has closed the region
1582         // with later seqNum and told us about the new location.
1583         boolean isStaleMetaRecord = isFromMeta && (oldLocation.getSeqNum() > newLocationSeqNum);
1584         // Same as above for redirect. However, in this case, if the number is equal to previous
1585         // record, the most common case is that first the region was closed with seqNum, and then
1586         // opened with the same seqNum; hence we will ignore the redirect.
1587         // There are so many corner cases with various combinations of opens and closes that
1588         // an additional counter on top of seqNum would be necessary to handle them all.
1589         boolean isStaleRedirect = !isFromMeta && (oldLocation.getSeqNum() >= newLocationSeqNum);
1590         boolean isStaleUpdate = (isStaleMetaRecord || isStaleRedirect);
1591         updateCache = (!isStaleUpdate);
1592       }
1593       if (updateCache) {
1594         tableLocations.replace(startKey, oldLocation, location);
1595         cachedServers.add(location.getServerName());
1596       }
1597     }
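         // The staleness rules above, in table form (old = cached seqNum, new = incoming):
         //   from meta,     old >  new  -> stale, keep cached entry
         //   from meta,     old <= new  -> update
         //   from redirect, old >= new  -> stale, keep cached entry (equal seqNum is ambiguous)
         //   from redirect, old <  new  -> update
         // plus the short-circuit: a redirect from the exact server we had cached is
         // always trusted.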
1598 
1599     // Map keyed by service name + regionserver to service stub implementation
1600     private final ConcurrentHashMap<String, Object> stubs =
1601       new ConcurrentHashMap<String, Object>();
1602     // Map of locks used creating service stubs per regionserver.
1603     private final ConcurrentHashMap<String, String> connectionLock =
1604       new ConcurrentHashMap<String, String>();
1605 
1606     /**
1607      * State of the MasterService connection/setup.
1608      */
1609     static class MasterServiceState {
1610       HConnection connection;
1611       MasterService.BlockingInterface stub;
1612       int userCount;
1613       long keepAliveUntil = Long.MAX_VALUE;
1614 
1615       MasterServiceState (final HConnection connection) {
1616         super();
1617         this.connection = connection;
1618       }
1619 
1620       @Override
1621       public String toString() {
1622         return "MasterService";
1623       }
1624 
1625       Object getStub() {
1626         return this.stub;
1627       }
1628 
1629       void clearStub() {
1630         this.stub = null;
1631       }
1632 
1633       boolean isMasterRunning() throws ServiceException {
1634         IsMasterRunningResponse response =
1635           this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1636         return response != null && response.getIsMasterRunning();
1637       }
1638     }
1639 
1640     /**
1641      * Makes a client-side stub for master services. Sub-class to specialize.
1642      * Depends on hosting class so not static.  Exists so we avoid duplicating a bunch of code
1643      * when setting up the MasterMonitorService and MasterAdminService.
1644      */
1645     abstract class StubMaker {
1646       /**
1647        * Returns the name of the service stub being created.
1648        */
1649       protected abstract String getServiceName();
1650 
1651       /**
1652        * Make the stub and cache it internally so it can be used later for the isMasterRunning call.
1653        * @param channel
1654        */
1655       protected abstract Object makeStub(final BlockingRpcChannel channel);
1656 
1657       /**
1658        * Once setup, check it works by doing isMasterRunning check.
1659        * @throws ServiceException
1660        */
1661       protected abstract void isMasterRunning() throws ServiceException;
1662 
1663       /**
1664        * Create a stub. Try once only.  It is not typed because there is no common type
1665        * across protobuf services or their interfaces.  Let the caller do appropriate casting.
1666        * @return A stub for master services.
1667        * @throws IOException
1668        * @throws KeeperException
1669        * @throws ServiceException
1670        */
1671       private Object makeStubNoRetries() throws IOException, KeeperException, ServiceException {
1672         ZooKeeperKeepAliveConnection zkw;
1673         try {
1674           zkw = getKeepAliveZooKeeperWatcher();
1675         } catch (IOException e) {
1676           ExceptionUtil.rethrowIfInterrupt(e);
1677           throw new ZooKeeperConnectionException("Can't connect to ZooKeeper", e);
1678         }
1679         try {
1680           checkIfBaseNodeAvailable(zkw);
1681           ServerName sn = MasterAddressTracker.getMasterAddress(zkw);
1682           if (sn == null) {
1683             String msg = "ZooKeeper available but no active master location found";
1684             LOG.info(msg);
1685             throw new MasterNotRunningException(msg);
1686           }
1687           if (isDeadServer(sn)) {
1688             throw new MasterNotRunningException(sn + " is dead.");
1689           }
1690           // Use the security info interface name as our stub key
1691           String key = getStubKey(getServiceName(), sn.getHostAndPort());
1692           connectionLock.putIfAbsent(key, key);
1693           Object stub = null;
1694           synchronized (connectionLock.get(key)) {
1695             stub = stubs.get(key);
1696             if (stub == null) {
1697               BlockingRpcChannel channel = rpcClient.createBlockingRpcChannel(sn,
1698                 user, rpcTimeout);
1699               stub = makeStub(channel);
1700               isMasterRunning();
1701               stubs.put(key, stub);
1702             }
1703           }
1704           return stub;
1705         } finally {
1706           zkw.close();
1707         }
1708       }
1709 
1710       /**
1711        * Create a stub against the master.  Retry if necessary.
1712        * @return A stub with which to make RPCs against the master
1713        * @throws MasterNotRunningException
1714        */
1715       @edu.umd.cs.findbugs.annotations.SuppressWarnings (value="SWL_SLEEP_WITH_LOCK_HELD")
1716       Object makeStub() throws MasterNotRunningException {
1717         // The lock must be at the beginning to prevent multiple master creations
1718         //  (and leaks) in a multithread context
1719         synchronized (masterAndZKLock) {
1720           Exception exceptionCaught = null;
1721           Object stub = null;
1722           int tries = 0;
1723           while (!closed && stub == null) {
1724             tries++;
                 exceptionCaught = null; // a failure on a previous attempt must not be re-reported
1725             try {
1726               stub = makeStubNoRetries();
1727             } catch (IOException e) {
1728               exceptionCaught = e;
1729             } catch (KeeperException e) {
1730               exceptionCaught = e;
1731             } catch (ServiceException e) {
1732               exceptionCaught = e;
1733             }
1734 
1735             if (exceptionCaught != null)
1736               // It failed. If it's not the last try, we're going to wait a little
1737               if (tries < numTries && !ExceptionUtil.isInterrupt(exceptionCaught)) {
1738                 // tries at this point is 1 or more; decrement to start from 0.
1739                 long pauseTime = ConnectionUtils.getPauseTime(pause, tries - 1);
1740                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1741                     " failed; retrying after sleep of " + pauseTime + ", exception=" +
1742                   exceptionCaught);
1743 
1744                 try {
1745                   Thread.sleep(pauseTime);
1746                 } catch (InterruptedException e) {
1747                   throw new MasterNotRunningException(
1748                       "Thread was interrupted while trying to connect to master.", e);
1749                 }
1750               } else {
1751                 // Enough tries, we stop now
1752                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1753                     " failed; no more retrying.", exceptionCaught);
1754                 throw new MasterNotRunningException(exceptionCaught);
1755               }
1756           }
1757 
1758           if (stub == null) {
1759             // implies this.closed true
1760             throw new MasterNotRunningException("Connection was closed while trying to get master");
1761           }
1762           return stub;
1763         }
1764       }
1765     }
1766 
1767     /**
1768      * Class to make a MasterServiceStubMaker stub.
1769      */
1770     class MasterServiceStubMaker extends StubMaker {
1771       private MasterService.BlockingInterface stub;
1772       @Override
1773       protected String getServiceName() {
1774         return MasterService.getDescriptor().getName();
1775       }
1776 
1777       @Override
1778       @edu.umd.cs.findbugs.annotations.SuppressWarnings("SWL_SLEEP_WITH_LOCK_HELD")
1779       MasterService.BlockingInterface makeStub() throws MasterNotRunningException {
1780         return (MasterService.BlockingInterface)super.makeStub();
1781       }
1782 
1783       @Override
1784       protected Object makeStub(BlockingRpcChannel channel) {
1785         this.stub = MasterService.newBlockingStub(channel);
1786         return this.stub;
1787       }
1788 
1789       @Override
1790       protected void isMasterRunning() throws ServiceException {
1791         this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1792       }
1793     }
1794 
1795     @Override
1796     public AdminService.BlockingInterface getAdmin(final ServerName serverName)
1797         throws IOException {
1798       return getAdmin(serverName, false);
1799     }
1800 
1801     @Override
1802     // Nothing is done w/ the 'master' parameter.  It is ignored.
1803     public AdminService.BlockingInterface getAdmin(final ServerName serverName,
1804       final boolean master)
1805     throws IOException {
1806       if (isDeadServer(serverName)) {
1807         throw new RegionServerStoppedException(serverName + " is dead.");
1808       }
1809       String key = getStubKey(AdminService.BlockingInterface.class.getName(),
1810         serverName.getHostAndPort());
1811       this.connectionLock.putIfAbsent(key, key);
1812       AdminService.BlockingInterface stub = null;
1813       synchronized (this.connectionLock.get(key)) {
1814         stub = (AdminService.BlockingInterface)this.stubs.get(key);
1815         if (stub == null) {
1816           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(serverName,
1817             user, this.rpcTimeout);
1818           stub = AdminService.newBlockingStub(channel);
1819           this.stubs.put(key, stub);
1820         }
1821       }
1822       return stub;
1823     }
1824 
1825     @Override
1826     public ClientService.BlockingInterface getClient(final ServerName sn)
1827     throws IOException {
1828       if (isDeadServer(sn)) {
1829         throw new RegionServerStoppedException(sn + " is dead.");
1830       }
1831       String key = getStubKey(ClientService.BlockingInterface.class.getName(), sn.getHostAndPort());
1832       this.connectionLock.putIfAbsent(key, key);
1833       ClientService.BlockingInterface stub = null;
1834       synchronized (this.connectionLock.get(key)) {
1835         stub = (ClientService.BlockingInterface)this.stubs.get(key);
1836         if (stub == null) {
1837           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(sn,
1838             user, this.rpcTimeout);
1839           stub = ClientService.newBlockingStub(channel);
1840           // In old days, after getting stub/proxy, we'd make a call.  We are not doing that here.
1841           // Just fail on first actual call rather than in here on setup.
1842           this.stubs.put(key, stub);
1843         }
1844       }
1845       return stub;
1846     }
1847 
1848     static String getStubKey(final String serviceName, final String rsHostnamePort) {
1849       return serviceName + "@" + rsHostnamePort;
1850     }
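         // Example, with a hypothetical regionserver address: getStubKey(
         // ClientService.BlockingInterface.class.getName(), "rs1.example.org:60020") yields
         // "<fully.qualified.ClientService$BlockingInterface>@rs1.example.org:60020"; one
         // stub per (service, regionserver) pair is then cached in the stubs map above.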
1851 
1852     private ZooKeeperKeepAliveConnection keepAliveZookeeper;
1853     private AtomicInteger keepAliveZookeeperUserCount = new AtomicInteger(0);
1854     private boolean canCloseZKW = true;
1855 
1856     // keepAlive time, in ms. No reason to make it configurable.
1857     private static final long keepAlive = 5 * 60 * 1000;
1858 
1859     /**
1860      * Retrieve a shared ZooKeeperWatcher. You must close it once you have finished with it.
1861      * @return The shared instance. Never returns null.
1862      */
1863     ZooKeeperKeepAliveConnection getKeepAliveZooKeeperWatcher()
1864       throws IOException {
1865       synchronized (masterAndZKLock) {
1866         if (keepAliveZookeeper == null) {
1867           if (this.closed) {
1868             throw new IOException(toString() + " closed");
1869           }
1870           // We don't check that our link to ZooKeeper is still valid
1871           // But there is a retry mechanism in the ZooKeeperWatcher itself
1872           keepAliveZookeeper = new ZooKeeperKeepAliveConnection(conf, this.toString(), this);
1873         }
1874         keepAliveZookeeperUserCount.incrementAndGet();
1875         keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1876         return keepAliveZookeeper;
1877       }
1878     }
1879 
1880     void releaseZooKeeperWatcher(final ZooKeeperWatcher zkw) {
1881       if (zkw == null){
1882         return;
1883       }
1884       if (keepAliveZookeeperUserCount.decrementAndGet() <= 0) {
1885         keepZooKeeperWatcherAliveUntil = System.currentTimeMillis() + keepAlive;
1886       }
1887     }
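         // Keep-alive lifecycle sketch: getKeepAliveZooKeeperWatcher() bumps the user count
         // and pins the deadline at Long.MAX_VALUE; each release decrements the count, and
         // once it reaches zero the deadline becomes now + keepAlive (5 minutes). The
         // DelayedClosing chore below does the actual close once the deadline has passed.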
1888 
1889     /**
1890      * Creates a Chore thread to check the connections to master & zookeeper
1891      *  and close them when they reach their closing time (
1892      *  {@link MasterServiceState#keepAliveUntil} and
1893      *  {@link #keepZooKeeperWatcherAliveUntil}). Keep alive time is
1894      *  managed by the release functions and the variable {@link #keepAlive}
1895      */
1896     private static class DelayedClosing extends Chore implements Stoppable {
1897       private HConnectionImplementation hci;
1898       Stoppable stoppable;
1899 
1900       private DelayedClosing(
1901         HConnectionImplementation hci, Stoppable stoppable){
1902         super(
1903           "ZooKeeperWatcher and Master delayed closing for connection "+hci,
1904           60*1000, // We check every minute
1905           stoppable);
1906         this.hci = hci;
1907         this.stoppable = stoppable;
1908       }
1909 
1910       static DelayedClosing createAndStart(HConnectionImplementation hci){
1911         Stoppable stoppable = new Stoppable() {
1912               private volatile boolean isStopped = false;
1913               @Override public void stop(String why) { isStopped = true;}
1914               @Override public boolean isStopped() {return isStopped;}
1915             };
1916 
1917         return new DelayedClosing(hci, stoppable);
1918       }
1919 
1920       protected void closeMasterProtocol(MasterServiceState protocolState) {
1921         if (System.currentTimeMillis() > protocolState.keepAliveUntil) {
1922           hci.closeMasterService(protocolState);
1923           protocolState.keepAliveUntil = Long.MAX_VALUE;
1924         }
1925       }
1926 
1927       @Override
1928       protected void chore() {
1929         synchronized (hci.masterAndZKLock) {
1930           if (hci.canCloseZKW) {
1931             if (System.currentTimeMillis() >
1932               hci.keepZooKeeperWatcherAliveUntil) {
1933 
1934               hci.closeZooKeeperWatcher();
1935               hci.keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1936             }
1937           }
1938           closeMasterProtocol(hci.masterServiceState);
1940         }
1941       }
1942 
1943       @Override
1944       public void stop(String why) {
1945         stoppable.stop(why);
1946       }
1947 
1948       @Override
1949       public boolean isStopped() {
1950         return stoppable.isStopped();
1951       }
1952     }
1953 
1954     private void closeZooKeeperWatcher() {
1955       synchronized (masterAndZKLock) {
1956         if (keepAliveZookeeper != null) {
1957           LOG.info("Closing zookeeper sessionid=0x" +
1958             Long.toHexString(
1959               keepAliveZookeeper.getRecoverableZooKeeper().getSessionId()));
1960           keepAliveZookeeper.internalClose();
1961           keepAliveZookeeper = null;
1962         }
1963         keepAliveZookeeperUserCount.set(0);
1964       }
1965     }
1966 
1967     final MasterServiceState masterServiceState = new MasterServiceState(this);
1968 
1969     @Override
1970     public MasterService.BlockingInterface getMaster() throws MasterNotRunningException {
1971       return getKeepAliveMasterService();
1972     }
1973 
1974     private void resetMasterServiceState(final MasterServiceState mss) {
1975       mss.userCount++;
1976       mss.keepAliveUntil = Long.MAX_VALUE;
1977     }
1978 
1979     @Override
1980     public MasterKeepAliveConnection getKeepAliveMasterService()
1981     throws MasterNotRunningException {
1982       synchronized (masterAndZKLock) {
1983         if (!isKeepAliveMasterConnectedAndRunning(this.masterServiceState)) {
1984           MasterServiceStubMaker stubMaker = new MasterServiceStubMaker();
1985           this.masterServiceState.stub = stubMaker.makeStub();
1986         }
1987         resetMasterServiceState(this.masterServiceState);
1988       }
1989       // Ugly delegation just so we can add in a Close method.
1990       final MasterService.BlockingInterface stub = this.masterServiceState.stub;
1991       return new MasterKeepAliveConnection() {
1992         MasterServiceState mss = masterServiceState;
1993         @Override
1994         public AddColumnResponse addColumn(RpcController controller, AddColumnRequest request)
1995         throws ServiceException {
1996           return stub.addColumn(controller, request);
1997         }
1998 
1999         @Override
2000         public DeleteColumnResponse deleteColumn(RpcController controller,
2001             DeleteColumnRequest request)
2002         throws ServiceException {
2003           return stub.deleteColumn(controller, request);
2004         }
2005 
2006         @Override
2007         public ModifyColumnResponse modifyColumn(RpcController controller,
2008             ModifyColumnRequest request)
2009         throws ServiceException {
2010           return stub.modifyColumn(controller, request);
2011         }
2012 
2013         @Override
2014         public MoveRegionResponse moveRegion(RpcController controller,
2015             MoveRegionRequest request) throws ServiceException {
2016           return stub.moveRegion(controller, request);
2017         }
2018 
2019         @Override
2020         public DispatchMergingRegionsResponse dispatchMergingRegions(
2021             RpcController controller, DispatchMergingRegionsRequest request)
2022             throws ServiceException {
2023           return stub.dispatchMergingRegions(controller, request);
2024         }
2025 
2026         @Override
2027         public AssignRegionResponse assignRegion(RpcController controller,
2028             AssignRegionRequest request) throws ServiceException {
2029           return stub.assignRegion(controller, request);
2030         }
2031 
2032         @Override
2033         public UnassignRegionResponse unassignRegion(RpcController controller,
2034             UnassignRegionRequest request) throws ServiceException {
2035           return stub.unassignRegion(controller, request);
2036         }
2037 
2038         @Override
2039         public OfflineRegionResponse offlineRegion(RpcController controller,
2040             OfflineRegionRequest request) throws ServiceException {
2041           return stub.offlineRegion(controller, request);
2042         }
2043 
2044         @Override
2045         public DeleteTableResponse deleteTable(RpcController controller,
2046             DeleteTableRequest request) throws ServiceException {
2047           return stub.deleteTable(controller, request);
2048         }
2049 
2050         @Override
2051         public EnableTableResponse enableTable(RpcController controller,
2052             EnableTableRequest request) throws ServiceException {
2053           return stub.enableTable(controller, request);
2054         }
2055 
2056         @Override
2057         public DisableTableResponse disableTable(RpcController controller,
2058             DisableTableRequest request) throws ServiceException {
2059           return stub.disableTable(controller, request);
2060         }
2061 
2062         @Override
2063         public ModifyTableResponse modifyTable(RpcController controller,
2064             ModifyTableRequest request) throws ServiceException {
2065           return stub.modifyTable(controller, request);
2066         }
2067 
2068         @Override
2069         public CreateTableResponse createTable(RpcController controller,
2070             CreateTableRequest request) throws ServiceException {
2071           return stub.createTable(controller, request);
2072         }
2073 
2074         @Override
2075         public ShutdownResponse shutdown(RpcController controller,
2076             ShutdownRequest request) throws ServiceException {
2077           return stub.shutdown(controller, request);
2078         }
2079 
2080         @Override
2081         public StopMasterResponse stopMaster(RpcController controller,
2082             StopMasterRequest request) throws ServiceException {
2083           return stub.stopMaster(controller, request);
2084         }
2085 
2086         @Override
2087         public BalanceResponse balance(RpcController controller,
2088             BalanceRequest request) throws ServiceException {
2089           return stub.balance(controller, request);
2090         }
2091 
2092         @Override
2093         public SetBalancerRunningResponse setBalancerRunning(
2094             RpcController controller, SetBalancerRunningRequest request)
2095             throws ServiceException {
2096           return stub.setBalancerRunning(controller, request);
2097         }
2098 
2099         @Override
2100         public IsBalancerEnabledResponse isBalancerEnabled(RpcController controller,
2101             IsBalancerEnabledRequest request) throws ServiceException {
2102           return stub.isBalancerEnabled(controller, request);
2103         }
2104 
2105         @Override
2106         public RunCatalogScanResponse runCatalogScan(RpcController controller,
2107             RunCatalogScanRequest request) throws ServiceException {
2108           return stub.runCatalogScan(controller, request);
2109         }
2110 
2111         @Override
2112         public EnableCatalogJanitorResponse enableCatalogJanitor(
2113             RpcController controller, EnableCatalogJanitorRequest request)
2114             throws ServiceException {
2115           return stub.enableCatalogJanitor(controller, request);
2116         }
2117 
2118         @Override
2119         public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(
2120             RpcController controller, IsCatalogJanitorEnabledRequest request)
2121             throws ServiceException {
2122           return stub.isCatalogJanitorEnabled(controller, request);
2123         }
2124 
2125         @Override
2126         public CoprocessorServiceResponse execMasterService(
2127             RpcController controller, CoprocessorServiceRequest request)
2128             throws ServiceException {
2129           return stub.execMasterService(controller, request);
2130         }
2131 
2132         @Override
2133         public SnapshotResponse snapshot(RpcController controller,
2134             SnapshotRequest request) throws ServiceException {
2135           return stub.snapshot(controller, request);
2136         }
2137 
2138         @Override
2139         public GetCompletedSnapshotsResponse getCompletedSnapshots(
2140             RpcController controller, GetCompletedSnapshotsRequest request)
2141             throws ServiceException {
2142           return stub.getCompletedSnapshots(controller, request);
2143         }
2144 
2145         @Override
2146         public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2147             DeleteSnapshotRequest request) throws ServiceException {
2148           return stub.deleteSnapshot(controller, request);
2149         }
2150 
2151         @Override
2152         public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2153             IsSnapshotDoneRequest request) throws ServiceException {
2154           return stub.isSnapshotDone(controller, request);
2155         }
2156 
2157         @Override
2158         public RestoreSnapshotResponse restoreSnapshot(
2159             RpcController controller, RestoreSnapshotRequest request)
2160             throws ServiceException {
2161           return stub.restoreSnapshot(controller, request);
2162         }
2163 
2164         @Override
2165         public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(
2166             RpcController controller, IsRestoreSnapshotDoneRequest request)
2167             throws ServiceException {
2168           return stub.isRestoreSnapshotDone(controller, request);
2169         }
2170 
2171         @Override
2172         public ExecProcedureResponse execProcedure(
2173             RpcController controller, ExecProcedureRequest request)
2174             throws ServiceException {
2175           return stub.execProcedure(controller, request);
2176         }
2177 
2178         @Override
2179         public IsProcedureDoneResponse isProcedureDone(RpcController controller,
2180             IsProcedureDoneRequest request) throws ServiceException {
2181           return stub.isProcedureDone(controller, request);
2182         }
2183 
2184         @Override
2185         public IsMasterRunningResponse isMasterRunning(
2186             RpcController controller, IsMasterRunningRequest request)
2187             throws ServiceException {
2188           return stub.isMasterRunning(controller, request);
2189         }
2190 
2191         @Override
2192         public ModifyNamespaceResponse modifyNamespace(RpcController controller,
2193             ModifyNamespaceRequest request)
2194         throws ServiceException {
2195           return stub.modifyNamespace(controller, request);
2196         }
2197 
2198         @Override
2199         public CreateNamespaceResponse createNamespace(RpcController controller, CreateNamespaceRequest request) throws ServiceException {
2200           return stub.createNamespace(controller, request);
2201         }
2202 
2203         @Override
2204         public DeleteNamespaceResponse deleteNamespace(RpcController controller, DeleteNamespaceRequest request) throws ServiceException {
2205           return stub.deleteNamespace(controller, request);
2206         }
2207 
2208         @Override
2209         public GetNamespaceDescriptorResponse getNamespaceDescriptor(RpcController controller, GetNamespaceDescriptorRequest request) throws ServiceException {
2210           return stub.getNamespaceDescriptor(controller, request);
2211         }
2212 
2213         @Override
2214         public ListNamespaceDescriptorsResponse listNamespaceDescriptors(RpcController controller, ListNamespaceDescriptorsRequest request) throws ServiceException {
2215           return stub.listNamespaceDescriptors(controller, request);
2216         }
2217 
2218         @Override
2219         public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(RpcController controller, ListTableDescriptorsByNamespaceRequest request) throws ServiceException {
2220           return stub.listTableDescriptorsByNamespace(controller, request);
2221         }
2222 
2223         @Override
2224         public ListTableNamesByNamespaceResponse listTableNamesByNamespace(RpcController controller,
2225               ListTableNamesByNamespaceRequest request) throws ServiceException {
2226           return stub.listTableNamesByNamespace(controller, request);
2227         }
2228 
2229         @Override
2230         public void close() {
2231           release(this.mss);
2232         }
2233 
2234         @Override
2235         public GetSchemaAlterStatusResponse getSchemaAlterStatus(
2236             RpcController controller, GetSchemaAlterStatusRequest request)
2237             throws ServiceException {
2238           return stub.getSchemaAlterStatus(controller, request);
2239         }
2240 
2241         @Override
2242         public GetTableDescriptorsResponse getTableDescriptors(
2243             RpcController controller, GetTableDescriptorsRequest request)
2244             throws ServiceException {
2245           return stub.getTableDescriptors(controller, request);
2246         }
2247 
2248         @Override
2249         public GetTableNamesResponse getTableNames(
2250             RpcController controller, GetTableNamesRequest request)
2251             throws ServiceException {
2252           return stub.getTableNames(controller, request);
2253         }
2254 
2255         @Override
2256         public GetClusterStatusResponse getClusterStatus(
2257             RpcController controller, GetClusterStatusRequest request)
2258             throws ServiceException {
2259           return stub.getClusterStatus(controller, request);
2260         }
2261 
2262         @Override
2263         public TruncateTableResponse truncateTable(RpcController controller,
2264             TruncateTableRequest request) throws ServiceException {
2265           return stub.truncateTable(controller, request);
2266         }
2267 
2268         @Override
2269         public SecurityCapabilitiesResponse getSecurityCapabilities(RpcController controller,
2270             SecurityCapabilitiesRequest request) throws ServiceException {
2271           return stub.getSecurityCapabilities(controller, request);
2272         }
2273       };
2274     }
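         // Usage sketch for the keep-alive wrapper returned above (hypothetical caller):
         //
         //   MasterKeepAliveConnection master = connection.getKeepAliveMasterService();
         //   try {
         //     master.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
         //   } finally {
         //     master.close();  // only releases the shared stub; no connection teardown
         //   }
         //
         // close() just decrements userCount; the DelayedClosing chore clears the stub
         // once keepAliveUntil has passed.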
2275 
2276 
2277     private static void release(MasterServiceState mss) {
2278       if (mss != null && mss.connection != null) {
2279         ((HConnectionImplementation)mss.connection).releaseMaster(mss);
2280       }
2281     }
2282 
2283     private boolean isKeepAliveMasterConnectedAndRunning(MasterServiceState mss) {
2284       if (mss.getStub() == null){
2285         return false;
2286       }
2287       try {
2288         return mss.isMasterRunning();
2289       } catch (UndeclaredThrowableException e) {
2290         // It's somewhat messy, but we can receive exceptions such as
2291         //  java.net.ConnectException but they're not declared. So we catch it...
2292         LOG.info("Master connection is not running anymore", e.getUndeclaredThrowable());
2293         return false;
2294       } catch (ServiceException se) {
2295         LOG.warn("Checking master connection", se);
2296         return false;
2297       }
2298     }
2299 
2300     void releaseMaster(MasterServiceState mss) {
2301       if (mss.getStub() == null) return;
2302       synchronized (masterAndZKLock) {
2303         --mss.userCount;
2304         if (mss.userCount <= 0) {
2305           mss.keepAliveUntil = System.currentTimeMillis() + keepAlive;
2306         }
2307       }
2308     }
2309 
2310     private void closeMasterService(MasterServiceState mss) {
2311       if (mss.getStub() != null) {
2312         LOG.info("Closing master protocol: " + mss);
2313         mss.clearStub();
2314       }
2315       mss.userCount = 0;
2316     }
2317 
2318     /**
2319      * Immediate close of the shared master. Can be called by the delayed close
2320      * chore or when closing the connection itself.
2321      */
2322     private void closeMaster() {
2323       synchronized (masterAndZKLock) {
2324         closeMasterService(masterServiceState);
2325       }
2326     }
2327 
2328     void updateCachedLocation(HRegionInfo hri, HRegionLocation source,
2329                               ServerName serverName, long seqNum) {
2330       HRegionLocation newHrl = new HRegionLocation(hri, serverName, seqNum);
2331       cacheLocation(hri.getTable(), source, newHrl);
2332     }
2333 
2334     /**
2335      * Deletes the cached location of the region if necessary, based on some error from source.
2336      * @param hri The region in question.
2337      * @param source The source of the error that prompts us to invalidate cache.
2338      */
2339     void deleteCachedLocation(HRegionInfo hri, HRegionLocation source) {
2340       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(hri.getTable());
2341       tableLocations.remove(hri.getStartKey(), source);
2342     }
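         // The two-argument remove() above is atomic: the entry is dropped only if the
         // cached value is still `source`; if another thread has already installed a newer
         // HRegionLocation, that newer entry survives.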
2343 
2344     @Override
2345     public void deleteCachedRegionLocation(final HRegionLocation location) {
2346       if (location == null) {
2347         return;
2348       }
2349 
2350       HRegionLocation removedLocation;
2351       TableName tableName = location.getRegionInfo().getTable();
2352       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
2353       removedLocation = tableLocations.remove(location.getRegionInfo().getStartKey());
2354       if (LOG.isDebugEnabled() && removedLocation != null) {
2355         LOG.debug("Removed " +
2356             location.getRegionInfo().getRegionNameAsString() +
2357             " for tableName=" + tableName +
2358             " from cache");
2359       }
2360     }
2361 
2362     /**
2363      * Update the location with the new value (if the exception is a RegionMovedException)
2364      * or delete it from the cache. Does nothing if we can be sure from the exception that
2365      * the location is still accurate, or if the cache has already been updated.
2366      * @param exception an object (to simplify user code) on which we will try to find a nested
2367      *                  or wrapped or both RegionMovedException
2368      * @param source server that is the source of the location update.
2369      */
2370     @Override
2371     public void updateCachedLocations(final TableName tableName, byte[] rowkey,
2372       final Object exception, final HRegionLocation source) {
2373       if (rowkey == null || tableName == null) {
2374         LOG.warn("Coding error, see method javadoc. row=" + (rowkey == null ? "null" : rowkey) +
2375             ", tableName=" + (tableName == null ? "null" : tableName));
2376         return;
2377       }
2378 
2379       if (source == null || source.getServerName() == null){
2380         // This should not happen, but let's secure ourselves.
2381         return;
2382       }
2383 
2384       // Is it something we have already updated?
2385       final HRegionLocation oldLocation = getCachedLocation(tableName, rowkey);
2386       if (oldLocation == null || !source.getServerName().equals(oldLocation.getServerName())) {
2387         // There is no such location in the cache (it's been removed already) or
2388         // the cache has already been refreshed with a different location.  => nothing to do
2389         return;
2390       }
2391 
2392       HRegionInfo regionInfo = oldLocation.getRegionInfo();
2393       Throwable cause = findException(exception);
2394       if (cause != null) {
2395         if (cause instanceof RegionTooBusyException || cause instanceof RegionOpeningException) {
2396           // We know that the region is still on this region server
2397           return;
2398         }
2399 
2400         if (cause instanceof RegionMovedException) {
2401           RegionMovedException rme = (RegionMovedException) cause;
2402           if (LOG.isTraceEnabled()) {
2403             LOG.trace("Region " + regionInfo.getRegionNameAsString() + " moved to " +
2404                 rme.getHostname() + ":" + rme.getPort() +
2405                 " according to " + source.getHostnamePort());
2406           }
2407           // We know that the region is not anymore on this region server, but we know
2408           //  the new location.
2409           updateCachedLocation(
2410               regionInfo, source, rme.getServerName(), rme.getLocationSeqNum());
2411           return;
2412         }
2413       }
2414 
2415       // If we're here, it means that we cannot be sure about the location, so we remove it from
2416       //  the cache.
2417       deleteCachedLocation(regionInfo, source);
2418     }
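         // Decision summary for the method above:
         //   cache already points elsewhere                    -> no-op
         //   RegionTooBusyException / RegionOpeningException   -> keep the cached entry
         //   RegionMovedException                              -> re-point the cache at the
         //                                                        new server and seqNum
         //   anything else                                     -> drop the cached entry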
2419 
2420     @Override
2421     public void updateCachedLocations(final byte[] tableName, byte[] rowkey,
2422       final Object exception, final HRegionLocation source) {
2423       updateCachedLocations(TableName.valueOf(tableName), rowkey, exception, source);
2424     }
2425 
2426     @Override
2427     @Deprecated
2428     public void processBatch(List<? extends Row> list,
2429         final TableName tableName,
2430         ExecutorService pool,
2431         Object[] results) throws IOException, InterruptedException {
2432       // This belongs in HTable!!! Not in here.  St.Ack
2433 
2434       // results must be the same size as list
2435       if (results.length != list.size()) {
2436         throw new IllegalArgumentException(
2437           "argument results must be the same size as argument list");
2438       }
2439       processBatchCallback(list, tableName, pool, results, null);
2440     }
2441 
2442     @Override
2443     @Deprecated
2444     public void processBatch(List<? extends Row> list,
2445         final byte[] tableName,
2446         ExecutorService pool,
2447         Object[] results) throws IOException, InterruptedException {
2448       processBatch(list, TableName.valueOf(tableName), pool, results);
2449     }
2450 
2451     /**
2452      * Send the queries in parallel on the different region servers. Retries on failures.
2453      * If the method returns without throwing, there was no error and the 'results' array
2454      * contains no exceptions. On error, an exception is thrown, and the 'results' array
2455      * contains a mix of results and exceptions.
2456      * @deprecated since 0.96 - Use {@link HTable#processBatchCallback} instead
2457      */
2458     @Override
2459     @Deprecated
2460     public <R> void processBatchCallback(
2461       List<? extends Row> list,
2462       TableName tableName,
2463       ExecutorService pool,
2464       Object[] results,
2465       Batch.Callback<R> callback)
2466       throws IOException, InterruptedException {
2467 
2468       // To fulfill the original contract, we have a special callback. This callback
2469       //  will set the results in the Object array.
2470       ObjectResultFiller<R> cb = new ObjectResultFiller<R>(results, callback);
2471       AsyncProcess<?> asyncProcess = createAsyncProcess(tableName, pool, cb, conf);
2472 
2473       // We're doing a submit all. This way, the originalIndex will match the initial list.
2474       asyncProcess.submitAll(list);
2475       asyncProcess.waitUntilDone();
2476 
2477       if (asyncProcess.hasError()) {
2478         throw asyncProcess.getErrors();
2479       }
2480     }
2481 
2482     @Override
2483     @Deprecated
2484     public <R> void processBatchCallback(
2485       List<? extends Row> list,
2486       byte[] tableName,
2487       ExecutorService pool,
2488       Object[] results,
2489       Batch.Callback<R> callback)
2490       throws IOException, InterruptedException {
2491       processBatchCallback(list, TableName.valueOf(tableName), pool, results, callback);
2492     }
2493 
2494     // For tests.
2495     protected <R> AsyncProcess createAsyncProcess(TableName tableName, ExecutorService pool,
2496            AsyncProcess.AsyncProcessCallback<R> callback, Configuration conf) {
2497       RpcControllerFactory controllerFactory = RpcControllerFactory.instantiate(conf);
2498       RpcRetryingCallerFactory callerFactory = RpcRetryingCallerFactory.instantiate(conf, this.stats);
2499       return new AsyncProcess<R>(this, tableName, pool, callback, conf, callerFactory,
2500         controllerFactory);
2501     }
2502 
2503     /**
2504      * Fills the 'results' array on behalf of the interfaces that use it.
2505      */
2506     private static class ObjectResultFiller<Res>
2507         implements AsyncProcess.AsyncProcessCallback<Res> {
2508 
2509       private final Object[] results;
2510       private Batch.Callback<Res> callback;
2511 
2512       ObjectResultFiller(Object[] results, Batch.Callback<Res> callback) {
2513         this.results = results;
2514         this.callback = callback;
2515       }
2516 
2517       @Override
2518       public void success(int pos, byte[] region, Row row, Res result) {
2519         assert pos < results.length;
2520         results[pos] = result;
2521         if (callback != null) {
2522           callback.update(region, row.getRow(), result);
2523         }
2524       }
2525 
2526       @Override
2527       public boolean failure(int pos, byte[] region, Row row, Throwable t) {
2528         assert pos < results.length;
2529         results[pos] = t;
2530         // Batch.Callback<Res> was not called on failure in 0.94; we keep that behavior.
2531         return true; // we want to have this failure in the failures list.
2532       }
2533 
2534       @Override
2535       public boolean retriableFailure(int originalIndex, Row row, byte[] region,
2536                                       Throwable exception) {
2537         return true; // we retry
2538       }
2539     }
2540 
2541     @Override
2542     public ServerStatisticTracker getStatisticsTracker() {
2543       return this.stats;
2544     }
2545 
2546     @Override
2547     public ClientBackoffPolicy getBackoffPolicy() {
2548       return this.backoffPolicy;
2549     }
2550 
2551     /*
2552      * Return the number of cached regions for a table. Only called
2553      * from unit tests.
2554      */
2555     int getNumberOfCachedRegionLocations(final TableName tableName) {
2556       Map<byte[], HRegionLocation> tableLocs = this.cachedRegionLocations.get(tableName);
2557       if (tableLocs == null) {
2558         return 0;
2559       }
2560       return tableLocs.size();
2561     }
2562 
2563     /**
2564      * Check the region cache to see whether a region is cached yet or not.
2565      * Called by unit tests.
2566      * @param tableName tableName
2567      * @param row row
2568      * @return Region cached or not.
2569      */
2570     boolean isRegionCached(TableName tableName, final byte[] row) {
2571       HRegionLocation location = getCachedLocation(tableName, row);
2572       return location != null;
2573     }
2574 
2575     @Override
2576     public void setRegionCachePrefetch(final TableName tableName,
2577         final boolean enable) {
2578       if (!enable) {
2579         regionCachePrefetchDisabledTables.add(Bytes.mapKey(tableName.getName()));
2580       } else {
2582         regionCachePrefetchDisabledTables.remove(Bytes.mapKey(tableName.getName()));
2583       }
2584     }
2585 
2586     @Override
2587     public void setRegionCachePrefetch(final byte[] tableName,
2588         final boolean enable) {
2589       setRegionCachePrefetch(TableName.valueOf(tableName), enable);
2590     }
2591 
2592     @Override
2593     public boolean getRegionCachePrefetch(TableName tableName) {
2594       return usePrefetch &&
2595           !regionCachePrefetchDisabledTables.contains(Bytes.mapKey(tableName.getName()));
2596     }
2597 
2598     @Override
2599     public boolean getRegionCachePrefetch(byte[] tableName) {
2600       return getRegionCachePrefetch(TableName.valueOf(tableName));
2601     }
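    // A usage sketch for the prefetch switch (hedged; 'connection' is an
    // illustrative name for an existing HConnection):
    //
    //   connection.setRegionCachePrefetch(TableName.valueOf("t"), false);
    //   assert !connection.getRegionCachePrefetch(TableName.valueOf("t"));
    //   // With prefetch disabled, locating a row of 't' caches only the one
    //   // region that was looked up instead of a batch of nearby regions.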
2602 
2603     @Override
2604     public void abort(final String msg, Throwable t) {
2605       if (t instanceof KeeperException.SessionExpiredException
2606         && keepAliveZookeeper != null) {
2607         synchronized (masterAndZKLock) {
2608           if (keepAliveZookeeper != null) {
2609             LOG.warn("This client just lost its session with ZooKeeper," +
2610               " closing it." +
2611               " It will be recreated next time someone needs it", t);
2612             closeZooKeeperWatcher();
2613           }
2614         }
2615       } else {
2616         if (t != null) {
2617           LOG.fatal(msg, t);
2618         } else {
2619           LOG.fatal(msg);
2620         }
2621         this.aborted = true;
2622         close();
2623         this.closed = true;
2624       }
2625     }
2626 
2627     @Override
2628     public boolean isClosed() {
2629       return this.closed;
2630     }
2631 
2632     @Override
2633     public boolean isAborted(){
2634       return this.aborted;
2635     }
2636 
2637     @Override
2638     public int getCurrentNrHRS() throws IOException {
2639       return this.registry.getCurrentNrHRS();
2640     }
2641 
2642     /**
2643      * Increment this client's reference count.
2644      */
2645     void incCount() {
2646       ++refCount;
2647     }
2648 
2649     /**
2650      * Decrement this client's reference count.
2651      */
2652     void decCount() {
2653       if (refCount > 0) {
2654         --refCount;
2655       }
2656     }
2657 
2658     /**
2659      * Return whether this client has no remaining references.
2660      *
2661      * @return true if this client has no references; false otherwise
2662      */
2663     boolean isZeroReference() {
2664       return refCount == 0;
2665     }
2666 
2667     void internalClose() {
2668       if (this.closed) {
2669         return;
2670       }
2671       delayedClosing.stop("Closing connection");
2672       closeMaster();
2673       shutdownBatchPool();
2674       if (this.metrics != null) {
2675         this.metrics.shutdown();
2676       }
2677       this.closed = true;
2678       closeZooKeeperWatcher();
2679       this.stubs.clear();
2680       if (clusterStatusListener != null) {
2681         clusterStatusListener.close();
2682       }
2683       if (rpcClient != null) {
2684         rpcClient.stop();
2685       }
2686     }
2687 
2688     @Override
2689     public void close() {
2690       if (managed) {
2691         if (aborted) {
2692           HConnectionManager.deleteStaleConnection(this);
2693         } else {
2694           HConnectionManager.deleteConnection(this, false);
2695         }
2696       } else {
2697         internalClose();
2698       }
2699     }
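    // A sketch of the intended lifecycle for managed (shared) connections; the
    // calls below are illustrative and assume the same Configuration instance:
    //
    //   HConnection c1 = HConnectionManager.getConnection(conf); // refCount becomes 1
    //   HConnection c2 = HConnectionManager.getConnection(conf); // same instance, refCount 2
    //   c2.close(); // refCount drops to 1; the connection stays open
    //   c1.close(); // refCount drops to 0; the manager releases the connection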
2700 
2701     /**
2702      * Close the connection for good, regardless of what the current value of
2703      * {@link #refCount} is. Ideally, {@link #refCount} should be zero at this
2704      * point, which would be the case if all of its consumers close the
2705      * connection. However, on the off chance that someone is unable to close
2706      * the connection, perhaps because it bailed out prematurely, the method
2707      * below will ensure that this {@link HConnection} instance is cleaned up.
2708      * Caveat: The JVM may take an unknown amount of time to call finalize on an
2709      * unreachable object, so our hope is that every consumer cleans up after
2710      * itself, like any good citizen.
2711      */
2712     @Override
2713     protected void finalize() throws Throwable {
2714       super.finalize();
2715       // Pretend as if we are about to release the last remaining reference
2716       refCount = 1;
2717       close();
2718     }
2719 
2720     @Override
2721     public HTableDescriptor[] listTables() throws IOException {
2722       MasterKeepAliveConnection master = getKeepAliveMasterService();
2723       try {
2724         GetTableDescriptorsRequest req =
2725           RequestConverter.buildGetTableDescriptorsRequest((List<TableName>)null);
2726         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2727       } catch (ServiceException se) {
2728         throw ProtobufUtil.getRemoteException(se);
2729       } finally {
2730         master.close();
2731       }
2732     }
2733 
2734     @Override
2735     public String[] getTableNames() throws IOException {
2736       TableName[] tableNames = listTableNames();
2737       String[] result = new String[tableNames.length];
2738       for (int i = 0; i < tableNames.length; i++) {
2739         result[i] = tableNames[i].getNameAsString();
2740       }
2741       return result;
2742     }
2743 
2744     @Override
2745     public TableName[] listTableNames() throws IOException {
2746       MasterKeepAliveConnection master = getKeepAliveMasterService();
2747       try {
2748         return ProtobufUtil.getTableNameArray(master.getTableNames(null,
2749             GetTableNamesRequest.newBuilder().build())
2750           .getTableNamesList());
2751       } catch (ServiceException se) {
2752         throw ProtobufUtil.getRemoteException(se);
2753       } finally {
2754         master.close();
2755       }
2756     }
2757 
2758     @Override
2759     public HTableDescriptor[] getHTableDescriptorsByTableName(
2760         List<TableName> tableNames) throws IOException {
2761       if (tableNames == null || tableNames.isEmpty()) return new HTableDescriptor[0];
2762       MasterKeepAliveConnection master = getKeepAliveMasterService();
2763       try {
2764         GetTableDescriptorsRequest req =
2765           RequestConverter.buildGetTableDescriptorsRequest(tableNames);
2766         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2767       } catch (ServiceException se) {
2768         throw ProtobufUtil.getRemoteException(se);
2769       } finally {
2770         master.close();
2771       }
2772     }
2773 
2774     @Override
2775     public HTableDescriptor[] getHTableDescriptors(
2776         List<String> names) throws IOException {
2777       List<TableName> tableNames = new ArrayList<TableName>(names.size());
2778       for(String name : names) {
2779         tableNames.add(TableName.valueOf(name));
2780       }
2781 
2782       return getHTableDescriptorsByTableName(tableNames);
2783     }
2784 
2785     @Override
2786     public NonceGenerator getNonceGenerator() {
2787       return this.nonceGenerator;
2788     }
2789 
2790     /**
2791      * Connects to the master to get the table descriptor.
2792      * @param tableName table name
2793      * @return the table descriptor for the given table, or null if tableName is null
2794      * @throws IOException if the connection to master fails or if the table
2795      *  is not found.
2796      */
2797     @Override
2798     public HTableDescriptor getHTableDescriptor(final TableName tableName)
2799     throws IOException {
2800       if (tableName == null) return null;
2801       MasterKeepAliveConnection master = getKeepAliveMasterService();
2802       GetTableDescriptorsResponse htds;
2803       try {
2804         GetTableDescriptorsRequest req =
2805           RequestConverter.buildGetTableDescriptorsRequest(tableName);
2806         htds = master.getTableDescriptors(null, req);
2807       } catch (ServiceException se) {
2808         throw ProtobufUtil.getRemoteException(se);
2809       } finally {
2810         master.close();
2811       }
2812       if (!htds.getTableSchemaList().isEmpty()) {
2813         return HTableDescriptor.convert(htds.getTableSchemaList().get(0));
2814       }
2815       throw new TableNotFoundException(tableName.getNameAsString());
2816     }
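    // A usage sketch (hedged; 'connection' is an illustrative name for an
    // existing HConnection):
    //
    //   HTableDescriptor htd = connection.getHTableDescriptor(TableName.valueOf("t"));
    //   for (HColumnDescriptor family : htd.getColumnFamilies()) {
    //     System.out.println(family.getNameAsString());
    //   }
    //   // Throws TableNotFoundException when the master returns no schema for 't'.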
2817 
2818     @Override
2819     public HTableDescriptor getHTableDescriptor(final byte[] tableName)
2820     throws IOException {
2821       return getHTableDescriptor(TableName.valueOf(tableName));
2822     }
2823 
2824     /**
2825      * @return true when this connection uses a {@link org.apache.hadoop.hbase.codec.Codec} and so
2826      *         supports cell blocks.
2827      */
2828     public boolean hasCellBlockSupport() {
2829       return this.rpcClient.hasCellBlockSupport();
2830     }
2831   }
2832 
2833   /**
2834    * The record of errors for servers.
2835    */
2836   static class ServerErrorTracker {
2837     // We need a concurrent map here, as we could have multiple threads updating it in parallel.
2838     private final ConcurrentMap<HRegionLocation, ServerErrors> errorsByServer =
2839         new ConcurrentHashMap<HRegionLocation, ServerErrors>();
2840     private final long canRetryUntil;
2841     private final int maxRetries;
2842     private final String startTrackingTime;
2843 
2844     public ServerErrorTracker(long timeout, int maxRetries) {
2845       this.maxRetries = maxRetries;
2846       this.canRetryUntil = EnvironmentEdgeManager.currentTimeMillis() + timeout;
2847       this.startTrackingTime = new Date().toString();
2848     }
2849 
2850     /**
2851      * We stop retrying when we have exhausted BOTH the number of retries and the time allocated.
2852      */
2853     boolean canRetryMore(int numRetry) {
2854       // If only a single try is allowed, we must not take the time limit into account.
2855       return numRetry < maxRetries || (maxRetries > 1 &&
2856           EnvironmentEdgeManager.currentTimeMillis() < this.canRetryUntil);
2857     }
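    // Worked example of the rule above: with maxRetries = 3 and a 60s timeout,
    // numRetry = 2 can always retry (2 < 3), and numRetry = 3 can still retry
    // while less than 60s has elapsed; once both budgets are spent, we stop.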
2858 
2859     /**
2860      * Calculates the back-off time for a retrying request to a particular server.
2861      *
2862      * @param server    The server in question.
2863      * @param basePause The default client pause.
2864      * @return The time to wait before sending next request.
2865      */
2866     long calculateBackoffTime(HRegionLocation server, long basePause) {
2867       long result;
2868       ServerErrors errorStats = errorsByServer.get(server);
2869       if (errorStats != null) {
2870         result = ConnectionUtils.getPauseTime(basePause, errorStats.retries.get());
2871       } else {
2872         result = 0; // yes, if the server is not in our list we don't wait before retrying.
2873       }
2874       return result;
2875     }
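    // Illustration (the exact values depend on ConnectionUtils.getPauseTime and
    // its backoff table): a server with no recorded errors gets a 0ms wait, the
    // first retry waits roughly basePause, and each further recorded error maps
    // to a progressively longer pause, so a failing server is probed less often.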
2876 
2877     /**
2878      * Reports that there was an error on the server, so we can do whatever bean-counting is necessary.
2879      *
2880      * @param server The server in question.
2881      */
2882     void reportServerError(HRegionLocation server) {
2883       ServerErrors errors = errorsByServer.get(server);
2884       if (errors != null) {
2885         errors.addError();
2886       } else {
2887         errors = errorsByServer.putIfAbsent(server, new ServerErrors());
2888         if (errors != null){
2889           errors.addError();
2890         }
2891       }
2892     }
2893 
2894     String getStartTrackingTime() {
2895       return startTrackingTime;
2896     }
2897 
2898     /**
2899      * The record of errors for a server.
2900      */
2901     private static class ServerErrors {
2902       public final AtomicInteger retries = new AtomicInteger(0);
2903 
2904       public void addError() {
2905         retries.incrementAndGet();
2906       }
2907     }
2908   }
2909 
2910   /**
2911    * Look for a known exception inside the remote exception:
2912    * - hadoop.ipc wrapped exceptions
2913    * - nested exceptions
2914    *
2915    * Looks for: RegionMovedException / RegionOpeningException / RegionTooBusyException
2916    * @return the exception if we found one we know; null otherwise.
2917    */
2918   public static Throwable findException(Object exception) {
2919     if (!(exception instanceof Throwable)) { // instanceof is false for null, too
2920       return null;
2921     }
2922     Throwable cur = (Throwable) exception;
2923     while (cur != null) {
2924       if (cur instanceof RegionMovedException || cur instanceof RegionOpeningException
2925           || cur instanceof RegionTooBusyException) {
2926         return cur;
2927       }
2928       if (cur instanceof RemoteException) {
2929         RemoteException re = (RemoteException) cur;
2930         cur = re.unwrapRemoteException(
2931             RegionOpeningException.class, RegionMovedException.class,
2932             RegionTooBusyException.class);
2933         if (cur == null) {
2934           cur = re.unwrapRemoteException();
2935         }
2936         // unwrapRemoteException can return the exception given as a parameter when it cannot
2937         //  unwrap it. In this case, there is no need to look further
2938         // noinspection ObjectEquality
2939         if (cur == re) {
2940           return null;
2941         }
2942       } else {
2943         cur = cur.getCause();
2944       }
2945     }
2946 
2947     return null;
2948   }
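  // A usage sketch (hedged; 't' stands for an arbitrary Throwable received from
  // a region server):
  //
  //   Throwable cause = findException(t);
  //   if (cause instanceof RegionMovedException) {
  //     // the server told us where the region went: update the cached location
  //   } else if (cause instanceof RegionTooBusyException) {
  //     // the location is still valid, the region is just overloaded: keep the
  //     // cache entry and back off before retrying
  //   } else if (cause == null) {
  //     // unknown failure: drop the cached location to force a fresh lookup
  //   }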
2949 
2950   /**
2951    * Set the number of retries to use server-side when trying to communicate
2952    * with another server over {@link HConnection}.  Used when updating catalog
2953    * tables, etc.  Call this method before we create any Connections.
2954    * @param c The Configuration instance to set the retries into.
        * @param sn The server name; used as a prefix in the log message.
2955    * @param log Used to log what we set in here.
2956    */
2957   public static void setServerSideHConnectionRetries(final Configuration c, final String sn,
2958       final Log log) {
2959     int hcRetries = c.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
2960       HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
2961     // Go big.  Multiply by 10.  If we can't get to meta after this many retries
2962     // then something is seriously wrong.
2963     int serversideMultiplier = c.getInt("hbase.client.serverside.retries.multiplier", 10);
2964     int retries = hcRetries * serversideMultiplier;
2965     c.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
2966     log.debug(sn + " HConnection server-to-server retries=" + retries);
2967   }
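  // A usage sketch (hedged; 'conf' and the server-name string are illustrative):
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   HConnectionManager.setServerSideHConnectionRetries(conf, "regionserver/rs1", LOG);
  //   // If hbase.client.retries.number resolves to N, the configuration now
  //   // holds N * 10 under the default multiplier.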
2968 }