/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import static org.apache.hadoop.hbase.client.MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY;

import java.io.Closeable;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.UndeclaredThrowableException;
import java.net.SocketException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotEnabledException;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.backoff.ClientBackoffPolicy;
import org.apache.hadoop.hbase.client.backoff.ClientBackoffPolicyFactory;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.exceptions.RegionMovedException;
import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
import org.apache.hadoop.hbase.ipc.RpcClient;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsBalancerEnabledRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsBalancerEnabledResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SecurityCapabilitiesRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SecurityCapabilitiesResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.ExceptionUtil;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.zookeeper.KeeperException;

import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.BlockingRpcChannel;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;

/**
 * A non-instantiable class that manages creation of {@link HConnection}s.
 * <p>The simplest way to use this class is by using {@link #createConnection(Configuration)}.
 * This creates a new {@link HConnection} to the cluster that is managed by the caller.
 * From this {@link HConnection} {@link HTableInterface} implementations are retrieved
 * with {@link HConnection#getTable(byte[])}. Example:
 * <pre>
 * {@code
 * HConnection connection = HConnectionManager.createConnection(config);
 * HTableInterface table = connection.getTable("table1");
 * try {
 *   // Use the table as needed, for a single operation and a single thread
 * } finally {
 *   table.close();
 *   connection.close();
 * }
 * }</pre>
 * <p>This class has a static Map of {@link HConnection} instances keyed by
 * {@link HConnectionKey}; a {@link HConnectionKey} is identified by a set of
 * {@link Configuration} properties. Invocations of {@link #getConnection(Configuration)}
 * that pass the same {@link Configuration} instance will return the same
 * {@link HConnection} instance ONLY WHEN the set of properties is the same
 * (i.e. if you change properties in your {@link Configuration} instance, such as the RPC timeout
 * or the codec used, HBase will create a new {@link HConnection} instance. For more details on
 * how this is done see {@link HConnectionKey}).
 * <p>Sharing {@link HConnection} instances is usually what you want; all clients
 * of the {@link HConnection} instances share the HConnections' cache of Region
 * locations rather than each having to discover for itself the location of meta, etc.
 * But sharing connections makes clean up of {@link HConnection} instances a little awkward.
 * Currently, clients clean up by calling {@link #deleteConnection(Configuration)}. This will
 * shut down the zookeeper connection the HConnection was using and clean up all
 * HConnection resources as well as stopping proxies to servers out on the
 * cluster. Not running the cleanup will not end the world; it'll
 * just stall the closeup some and spew some zookeeper connection failed
 * messages into the log.  Running the cleanup on a {@link HConnection} that is
 * subsequently used by another will cause breakage so be careful running
 * cleanup.
 * <p>To create a {@link HConnection} that is not shared by others, you can
 * set property "hbase.client.instance.id" to a unique value for your {@link Configuration}
 * instance, like the following:
 * <pre>
 * {@code
 * conf.set("hbase.client.instance.id", "12345");
 * HConnection connection = HConnectionManager.getConnection(conf);
 * // Use the connection to your hearts' delight and then when done...
 * HConnectionManager.deleteConnection(conf);
 * }
 * </pre>
 * <p>Cleanup used to be done inside a shutdown hook.  On startup we'd
 * register a shutdown hook that called {@link #deleteAllConnections()}
 * on its way out, but the order in which shutdown hooks run is not defined, which
 * was problematic for clients of HConnection that wanted to register their
 * own shutdown hooks, so we removed ours, though this shifts the onus for
 * cleanup to the client.
 */
@SuppressWarnings("serial")
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class HConnectionManager {
  static final Log LOG = LogFactory.getLog(HConnectionManager.class);

  public static final String RETRIES_BY_SERVER_KEY = "hbase.client.retries.by.server";
  private static final String CLIENT_NONCES_ENABLED_KEY = "hbase.client.nonces.enabled";

  // An LRU Map of HConnectionKey -> HConnection (TableServer).  All
  // access must be synchronized.  This map is not private because tests
  // need to be able to tinker with it.
  static final Map<HConnectionKey, HConnectionImplementation> CONNECTION_INSTANCES;

  public static final int MAX_CACHED_CONNECTION_INSTANCES;

  /**
   * Global nonceGenerator shared per client. Currently there's no reason to limit its scope.
   * Once it's set under nonceGeneratorCreateLock, it is never unset or changed.
   */
  private static volatile NonceGenerator nonceGenerator = null;
  /** The nonce generator lock. Only taken when creating HConnection, which gets a private copy. */
  private static Object nonceGeneratorCreateLock = new Object();

  static {
    // We set instances to one more than the value specified for {@link
    // HConstants#ZOOKEEPER_MAX_CLIENT_CNXNS}. By default, the zk max number of
    // connections to the ensemble from one client is 30, so in that case we
    // should run into zk issues before the LRU hits this value of 31.
    MAX_CACHED_CONNECTION_INSTANCES = HBaseConfiguration.create().getInt(
      HConstants.ZOOKEEPER_MAX_CLIENT_CNXNS, HConstants.DEFAULT_ZOOKEPER_MAX_CLIENT_CNXNS) + 1;
    CONNECTION_INSTANCES = new LinkedHashMap<HConnectionKey, HConnectionImplementation>(
        (int) (MAX_CACHED_CONNECTION_INSTANCES / 0.75F) + 1, 0.75F, true) {
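      // LRU eviction: returning true here drops the eldest (least recently
      // accessed) connection once the cache grows past the configured cap.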
      @Override
      protected boolean removeEldestEntry(
          Map.Entry<HConnectionKey, HConnectionImplementation> eldest) {
        return size() > MAX_CACHED_CONNECTION_INSTANCES;
      }
    };
  }

  /*
   * Non-instantiable.
   */
  private HConnectionManager() {
    super();
  }

  /**
   * @param conn The connection for which to replace the generator.
   * @param cnm Replaces the nonce generator used, for testing.
   * @return old nonce generator.
   */
  @VisibleForTesting
  public static NonceGenerator injectNonceGeneratorForTesting(
      HConnection conn, NonceGenerator cnm) {
    NonceGenerator ng = conn.getNonceGenerator();
    LOG.warn("Nonce generator is being replaced by test code for " + cnm.getClass().getName());
    ((HConnectionImplementation)conn).nonceGenerator = cnm;
    return ng;
  }

  /**
   * Get the connection that goes with the passed <code>conf</code> configuration instance.
   * If no current connection exists, method creates a new connection and keys it using
   * connection-specific properties from the passed {@link Configuration}; see
   * {@link HConnectionKey}.
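   * <p>A minimal sketch of the intended pairing (illustrative only):
   * <pre>
   * {@code
   * HConnection shared = HConnectionManager.getConnection(conf);
   * try {
   *   // use the shared, reference-counted connection
   * } finally {
   *   HConnectionManager.deleteConnection(conf); // drops the reference taken above
   * }
   * }</pre>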
   * @param conf configuration
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  @Deprecated
  public static HConnection getConnection(final Configuration conf)
  throws IOException {
    HConnectionKey connectionKey = new HConnectionKey(conf);
    synchronized (CONNECTION_INSTANCES) {
      HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
      if (connection == null) {
        connection = (HConnectionImplementation)createConnection(conf, true);
        CONNECTION_INSTANCES.put(connectionKey, connection);
      } else if (connection.isClosed()) {
        HConnectionManager.deleteConnection(connectionKey, true);
        connection = (HConnectionImplementation)createConnection(conf, true);
        CONNECTION_INSTANCES.put(connectionKey, connection);
      }
      connection.incCount();
      return connection;
    }
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   *
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * HConnection connection = HConnectionManager.createConnection(conf);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }</pre>
   *
   * @param conf configuration
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf)
  throws IOException {
    UserProvider provider = UserProvider.instantiate(conf);
    return createConnection(conf, false, null, provider.getCurrent());
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * ExecutorService pool = ...;
   * HConnection connection = HConnectionManager.createConnection(conf, pool);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }</pre>
   * @param conf configuration
   * @param pool the thread pool to use for batch operations in HTables used via this HConnection
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf, ExecutorService pool)
  throws IOException {
    UserProvider provider = UserProvider.instantiate(conf);
    return createConnection(conf, false, pool, provider.getCurrent());
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * User user = ...;
   * HConnection connection = HConnectionManager.createConnection(conf, user);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }</pre>
   * @param conf configuration
   * @param user the user the connection is for
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf, User user)
  throws IOException {
    return createConnection(conf, false, null, user);
  }

  /**
   * Create a new HConnection instance using the passed <code>conf</code> instance.
   * <p>Note: This bypasses the usual HConnection life cycle management done by
   * {@link #getConnection(Configuration)}. The caller is responsible for
   * calling {@link HConnection#close()} on the returned connection instance.
   * This is the recommended way to create HConnections.
   * <pre>
   * {@code
   * ExecutorService pool = ...;
   * User user = ...;
   * HConnection connection = HConnectionManager.createConnection(conf, pool, user);
   * HTableInterface table = connection.getTable("mytable");
   * table.get(...);
   * ...
   * table.close();
   * connection.close();
   * }</pre>
   * @param conf configuration
   * @param pool the thread pool to use for batch operations in HTables used via this HConnection
   * @param user the user the connection is for
   * @return HConnection object for <code>conf</code>
   * @throws ZooKeeperConnectionException
   */
  public static HConnection createConnection(Configuration conf, ExecutorService pool, User user)
  throws IOException {
    return createConnection(conf, false, pool, user);
  }

  @Deprecated
  static HConnection createConnection(final Configuration conf, final boolean managed)
      throws IOException {
    UserProvider provider = UserProvider.instantiate(conf);
    return createConnection(conf, managed, null, provider.getCurrent());
  }

  @Deprecated
  static HConnection createConnection(final Configuration conf, final boolean managed,
      final ExecutorService pool, final User user)
  throws IOException {
    String className = conf.get("hbase.client.connection.impl",
      HConnectionManager.HConnectionImplementation.class.getName());
    Class<?> clazz = null;
    try {
      clazz = Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
    try {
      // Default HCM#HCI is not accessible; make it so before invoking.
      Constructor<?> constructor =
        clazz.getDeclaredConstructor(Configuration.class,
          boolean.class, ExecutorService.class, User.class);
      constructor.setAccessible(true);
      return (HConnection) constructor.newInstance(conf, managed, pool, user);
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

  /**
   * Delete connection information for the instance specified by the passed configuration.
   * If there are no more references to the designated connection, this method will
   * then close the connection to the zookeeper ensemble and let go of all associated resources.
   *
   * @param conf configuration whose identity is used to find the {@link HConnection} instance.
   * @deprecated
   */
  public static void deleteConnection(Configuration conf) {
    deleteConnection(new HConnectionKey(conf), false);
  }

  /**
   * Clean up a known stale connection.
   * This will then close the connection to the zookeeper ensemble and let go of all resources.
   *
   * @param connection
   * @deprecated
   */
  public static void deleteStaleConnection(HConnection connection) {
    deleteConnection(connection, true);
  }

  /**
   * Delete information for all connections. Whether each connection is closed depends on the
   * staleConnection flag and its reference count. In general, you should call this with
   * staleConnection set to true.
   * @deprecated
   */
  public static void deleteAllConnections(boolean staleConnection) {
    synchronized (CONNECTION_INSTANCES) {
      Set<HConnectionKey> connectionKeys = new HashSet<HConnectionKey>();
      connectionKeys.addAll(CONNECTION_INSTANCES.keySet());
      for (HConnectionKey connectionKey : connectionKeys) {
        deleteConnection(connectionKey, staleConnection);
      }
      CONNECTION_INSTANCES.clear();
    }
  }

  /**
   * Delete information for all connections.
   * @deprecated kept for backward compatibility, but the behavior is broken. HBASE-8983
   */
  @Deprecated
  public static void deleteAllConnections() {
    deleteAllConnections(false);
  }

  @Deprecated
  private static void deleteConnection(HConnection connection, boolean staleConnection) {
    synchronized (CONNECTION_INSTANCES) {
      for (Entry<HConnectionKey, HConnectionImplementation> e: CONNECTION_INSTANCES.entrySet()) {
        if (e.getValue() == connection) {
          deleteConnection(e.getKey(), staleConnection);
          break;
        }
      }
    }
  }

  @Deprecated
  private static void deleteConnection(HConnectionKey connectionKey, boolean staleConnection) {
    synchronized (CONNECTION_INSTANCES) {
      HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
      if (connection != null) {
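        // Drop one reference; fully close only when no references remain or the
        // caller has explicitly flagged the connection as stale.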
        connection.decCount();
        if (connection.isZeroReference() || staleConnection) {
          CONNECTION_INSTANCES.remove(connectionKey);
          connection.internalClose();
        }
      } else {
        LOG.error("Connection not found in the list, can't delete it " +
          "(connection key=" + connectionKey + "). Maybe the key was modified?", new Exception());
      }
    }
  }

  /**
   * Provided for unit tests that verify the behavior of the region
   * location cache prefetch.
   * @return Number of cached regions for the table.
   * @throws ZooKeeperConnectionException
   */
  static int getCachedRegionCount(Configuration conf, final TableName tableName)
  throws IOException {
    return execute(new HConnectable<Integer>(conf) {
      @Override
      public Integer connect(HConnection connection) {
        return ((HConnectionImplementation)connection).getNumberOfCachedRegionLocations(tableName);
      }
    });
  }

  /**
   * This convenience method invokes the given {@link HConnectable#connect}
   * implementation using a {@link HConnection} instance that lasts just for the
   * duration of the invocation.
   *
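   * <p>A sketch of typical usage (illustrative only):
   * <pre>
   * {@code
   * Boolean running = HConnectionManager.execute(new HConnectable<Boolean>(conf) {
   *   public Boolean connect(HConnection connection) throws IOException {
   *     return connection.isMasterRunning();
   *   }
   * });
   * }</pre>
   *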
   * @param <T> the return type of the connect method
   * @param connectable the {@link HConnectable} instance
   * @return the value returned by the connect method
   * @throws IOException
   */
  @InterfaceAudience.Private
  public static <T> T execute(HConnectable<T> connectable) throws IOException {
    if (connectable == null || connectable.conf == null) {
      return null;
    }
    Configuration conf = connectable.conf;
    HConnection connection = HConnectionManager.getConnection(conf);
    boolean connectSucceeded = false;
    try {
      T returnValue = connectable.connect(connection);
      connectSucceeded = true;
      return returnValue;
    } finally {
      try {
        connection.close();
      } catch (Exception e) {
        ExceptionUtil.rethrowIfInterrupt(e);
        if (connectSucceeded) {
          throw new IOException("The connection to " + connection
              + " could not be deleted.", e);
        }
      }
    }
  }

  /** Encapsulates connection to zookeeper and regionservers. */
  @InterfaceAudience.Private
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
      justification="Access to the concurrent hash map is under a lock so should be fine.")
  public static class HConnectionImplementation implements HConnection, Closeable {
    static final Log LOG = LogFactory.getLog(HConnectionImplementation.class);
    private final long pause;
    private final int numTries;
    final int rpcTimeout;
    private NonceGenerator nonceGenerator = null;
    private final boolean usePrefetch;
    private final int prefetchRegionLimit;

    private volatile boolean closed;
    private volatile boolean aborted;

    // package protected for the tests
    ClusterStatusListener clusterStatusListener;

    private final Object userRegionLock = new Object();
    private final Object metaRegionLock = new Object();

    // We have a single lock for master & zk to prevent deadlocks. Having
    //  one lock for ZK and one lock for master is not possible:
    //  When creating a connection to master, we need a connection to ZK to get
    //  its address. But another thread could have taken the ZK lock, and could
    //  be waiting for the master lock => deadlock.
    private final Object masterAndZKLock = new Object();

    private long keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
    private final DelayedClosing delayedClosing =
      DelayedClosing.createAndStart(this);

    // thread executor shared by all HTableInterface instances created
    // by this connection
    private volatile ExecutorService batchPool = null;
    private volatile boolean cleanupPool = false;

    private final Configuration conf;

    // cache the configuration value for tables so that we can avoid calling
    // the expensive Configuration to fetch the value multiple times.
    private final TableConfiguration tableConfig;

    // Client rpc instance.
    private RpcClient rpcClient;

    private final MetricsConnection metrics;

    /**
     * Map of table to table {@link HRegionLocation}s.
     */
    private final ConcurrentMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>
        cachedRegionLocations =
      new ConcurrentHashMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>();

    // The presence of a server in the map implies it's likely that there is an
    // entry in cachedRegionLocations that maps to this server; but the absence
    // of a server in this map guarantees that there is no entry in the cache that
    // maps to the absent server.
    // Access to this attribute must be protected by a lock on cachedRegionLocations.
    private final Set<ServerName> cachedServers = new ConcurrentSkipListSet<ServerName>();

    // Region cache prefetch is enabled by default. This set contains all
    // tables whose region cache prefetch is disabled.
    private final Set<Integer> regionCachePrefetchDisabledTables =
      new CopyOnWriteArraySet<Integer>();

    private int refCount;

    // indicates whether this connection's life cycle is managed (by us)
    private boolean managed;

    protected User user;

    private RpcRetryingCallerFactory rpcCallerFactory;

    private RpcControllerFactory rpcControllerFactory;

    // single tracker per connection
    private final ServerStatisticTracker stats;

    private final ClientBackoffPolicy backoffPolicy;

    /**
     * Cluster registry of basic info such as clusterid and meta region location.
     */
    Registry registry;

    HConnectionImplementation(Configuration conf, boolean managed) throws IOException {
      this(conf, managed, null, null);
    }

    /**
     * Constructor.
     * @param conf Configuration object
     * @param managed If true, does not do full shutdown on close; i.e. cleanup of connection
     * to zk and shutdown of all services; we just close down the resources this connection was
     * responsible for and decrement usage counters.  It is up to the caller to do the full
     * cleanup.  It is set when we want to have connection sharing going on -- reuse of the zk
     * connection, cached region locations, established regionserver connections, etc.  When
     * connections are shared, we do reference counting and only do the full cleanup when there
     * are no more users of an HConnectionImplementation instance.
     */
    HConnectionImplementation(Configuration conf, boolean managed,
        ExecutorService pool, User user) throws IOException {
      this(conf);
      this.user = user;
      this.batchPool = pool;
      this.managed = managed;
      this.registry = setupRegistry();
      retrieveClusterId();

      this.rpcClient = new RpcClient(this.conf, this.clusterId, this.metrics);

      // Do we publish the status?
      boolean shouldListen = conf.getBoolean(HConstants.STATUS_PUBLISHED,
          HConstants.STATUS_PUBLISHED_DEFAULT);
      Class<? extends ClusterStatusListener.Listener> listenerClass =
          conf.getClass(ClusterStatusListener.STATUS_LISTENER_CLASS,
              ClusterStatusListener.DEFAULT_STATUS_LISTENER_CLASS,
              ClusterStatusListener.Listener.class);
      if (shouldListen) {
        if (listenerClass == null) {
          LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
              ClusterStatusListener.STATUS_LISTENER_CLASS + " is not set - not listening status");
        } else {
          clusterStatusListener = new ClusterStatusListener(
              new ClusterStatusListener.DeadServerHandler() {
                @Override
                public void newDead(ServerName sn) {
                  clearCaches(sn);
                  rpcClient.cancelConnections(sn.getHostname(), sn.getPort(),
                      new SocketException(sn.getServerName() +
                          " is dead: closing its connection."));
                }
              }, conf, listenerClass);
        }
      }

      this.rpcCallerFactory = RpcRetryingCallerFactory.instantiate(conf, this.stats);
      this.rpcControllerFactory = RpcControllerFactory.instantiate(conf);
    }

    /** Dummy nonce generator for disabled nonces. */
    private static class NoNonceGenerator implements NonceGenerator {
      @Override
      public long getNonceGroup() {
        return HConstants.NO_NONCE;
      }
      @Override
      public long newNonce() {
        return HConstants.NO_NONCE;
      }
    }

    /**
     * For tests.
     */
    protected HConnectionImplementation(Configuration conf) {
      this.conf = conf;
      this.tableConfig = new TableConfiguration(conf);
      this.closed = false;
      this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
          HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
      this.numTries = tableConfig.getRetriesNumber();
      this.rpcTimeout = conf.getInt(
          HConstants.HBASE_RPC_TIMEOUT_KEY,
          HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
      if (conf.getBoolean(CLIENT_NONCES_ENABLED_KEY, true)) {
        synchronized (HConnectionManager.nonceGeneratorCreateLock) {
          if (HConnectionManager.nonceGenerator == null) {
            HConnectionManager.nonceGenerator = new PerClientRandomNonceGenerator();
          }
          this.nonceGenerator = HConnectionManager.nonceGenerator;
        }
      } else {
        this.nonceGenerator = new NoNonceGenerator();
      }

      this.stats = ServerStatisticTracker.create(conf);
      this.usePrefetch = conf.getBoolean(HConstants.HBASE_CLIENT_PREFETCH,
          HConstants.DEFAULT_HBASE_CLIENT_PREFETCH);
      this.prefetchRegionLimit = conf.getInt(
          HConstants.HBASE_CLIENT_PREFETCH_LIMIT,
          HConstants.DEFAULT_HBASE_CLIENT_PREFETCH_LIMIT);
      this.rpcControllerFactory = RpcControllerFactory.instantiate(conf);
      this.rpcCallerFactory = RpcRetryingCallerFactory.instantiate(conf, this.stats);
      this.backoffPolicy = ClientBackoffPolicyFactory.create(conf);
      if (conf.getBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, false)) {
        this.metrics = new MetricsConnection(this);
      } else {
        this.metrics = null;
      }
    }

    @Override
    public HTableInterface getTable(String tableName) throws IOException {
      return getTable(TableName.valueOf(tableName));
    }

    @Override
    public HTableInterface getTable(byte[] tableName) throws IOException {
      return getTable(TableName.valueOf(tableName));
    }

    @Override
    public HTableInterface getTable(TableName tableName) throws IOException {
      return getTable(tableName, getBatchPool());
    }

    @Override
    public HTableInterface getTable(String tableName, ExecutorService pool) throws IOException {
      return getTable(TableName.valueOf(tableName), pool);
    }

    @Override
    public HTableInterface getTable(byte[] tableName, ExecutorService pool) throws IOException {
      return getTable(TableName.valueOf(tableName), pool);
    }

    @Override
    public HTableInterface getTable(TableName tableName, ExecutorService pool) throws IOException {
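      // Managed connections (the shared, reference-counted kind handed out by
      // getConnection()) do not hand out tables; callers must use an unmanaged
      // connection obtained from createConnection().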
      if (managed) {
        throw new IOException("The connection has to be unmanaged.");
      }
      return new HTable(tableName, this, tableConfig, rpcCallerFactory, rpcControllerFactory,
        pool);
    }

    @Override
    public MetricsConnection getConnectionMetrics() {
      return this.metrics;
    }

    private ExecutorService getBatchPool() {
      if (batchPool == null) {
        // shared HTable thread executor not yet initialized
        synchronized (this) {
          if (batchPool == null) {
            int maxThreads = conf.getInt("hbase.hconnection.threads.max", 256);
            int coreThreads = conf.getInt("hbase.hconnection.threads.core", 256);
            if (maxThreads == 0) {
              maxThreads = Runtime.getRuntime().availableProcessors() * 8;
            }
            if (coreThreads == 0) {
              coreThreads = Runtime.getRuntime().availableProcessors() * 8;
            }
            long keepAliveTime = conf.getLong("hbase.hconnection.threads.keepalivetime", 60);
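            // Bound the queue at maxThreads * max-total-tasks; submissions beyond
            // pool and queue capacity are rejected rather than buffered without limit.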
            LinkedBlockingQueue<Runnable> workQueue =
              new LinkedBlockingQueue<Runnable>(maxThreads *
                conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
                  HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
            ThreadPoolExecutor tpe = new ThreadPoolExecutor(
                coreThreads,
                maxThreads,
                keepAliveTime,
                TimeUnit.SECONDS,
                workQueue,
                Threads.newDaemonThreadFactory(toString() + "-shared-"));
            tpe.allowCoreThreadTimeOut(true);
            this.batchPool = tpe;
          }
          this.cleanupPool = true;
        }
      }
      return this.batchPool;
    }

    protected ExecutorService getCurrentBatchPool() {
      return batchPool;
    }

    private void shutdownBatchPool() {
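      // Two-phase shutdown: let queued work drain briefly, then force-stop.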
      if (this.cleanupPool && this.batchPool != null && !this.batchPool.isShutdown()) {
        this.batchPool.shutdown();
        try {
          if (!this.batchPool.awaitTermination(10, TimeUnit.SECONDS)) {
            this.batchPool.shutdownNow();
          }
        } catch (InterruptedException e) {
          this.batchPool.shutdownNow();
        }
      }
    }

    /**
     * @return The cluster registry implementation to use.
     * @throws IOException
     */
    private Registry setupRegistry() throws IOException {
      String registryClass = this.conf.get("hbase.client.registry.impl",
        ZooKeeperRegistry.class.getName());
      Registry registry = null;
      try {
        registry = (Registry)Class.forName(registryClass).newInstance();
      } catch (Throwable t) {
        throw new IOException(t);
      }
      registry.init(this);
      return registry;
    }

    /**
     * For tests only.
     * @param rpcClient Client we should use instead.
     * @return Previous rpcClient
     */
    RpcClient setRpcClient(final RpcClient rpcClient) {
      RpcClient oldRpcClient = this.rpcClient;
      this.rpcClient = rpcClient;
      return oldRpcClient;
    }

    /**
     * @return An identifier that will remain the same for a given connection.
     */
    public String toString() {
      return "hconnection-0x" + Integer.toHexString(hashCode());
    }

    protected String clusterId = null;

    void retrieveClusterId() {
      if (clusterId != null) return;
      this.clusterId = this.registry.getClusterId();
      if (clusterId == null) {
        clusterId = HConstants.CLUSTER_ID_DEFAULT;
        LOG.debug("clusterid came back null, using default " + clusterId);
      }
    }

    @Override
    public Configuration getConfiguration() {
      return this.conf;
    }

    private void checkIfBaseNodeAvailable(ZooKeeperWatcher zkw)
      throws MasterNotRunningException {
      String errorMsg;
      try {
        if (ZKUtil.checkExists(zkw, zkw.baseZNode) == -1) {
          errorMsg = "The node " + zkw.baseZNode + " is not in ZooKeeper. "
            + "It should have been written by the master. "
            + "Check the value configured in 'zookeeper.znode.parent'. "
            + "There could be a mismatch with the one configured in the master.";
          LOG.error(errorMsg);
          throw new MasterNotRunningException(errorMsg);
        }
      } catch (KeeperException e) {
        errorMsg = "Can't get connection to ZooKeeper: " + e.getMessage();
        LOG.error(errorMsg);
        throw new MasterNotRunningException(errorMsg, e);
      }
    }

    /**
     * @return true if the master is running, throws an exception otherwise
     * @throws MasterNotRunningException - if the master is not running
     * @throws ZooKeeperConnectionException
     */
    @Override
    public boolean isMasterRunning()
    throws MasterNotRunningException, ZooKeeperConnectionException {
      // When getting the master connection, we check it's running,
      // so if there is no exception, it means we've been able to get a
      // connection on a running master
      MasterKeepAliveConnection m = getKeepAliveMasterService();
      m.close();
      return true;
    }

    @Override
    public HRegionLocation getRegionLocation(final TableName tableName,
        final byte [] row, boolean reload)
    throws IOException {
      return reload ? relocateRegion(tableName, row) : locateRegion(tableName, row);
    }

    @Override
    public HRegionLocation getRegionLocation(final byte[] tableName,
        final byte [] row, boolean reload)
    throws IOException {
      return getRegionLocation(TableName.valueOf(tableName), row, reload);
    }

    @Override
    public boolean isTableEnabled(TableName tableName) throws IOException {
      return this.registry.isTableOnlineState(tableName, true);
    }

    @Override
    public boolean isTableEnabled(byte[] tableName) throws IOException {
      return isTableEnabled(TableName.valueOf(tableName));
    }

    @Override
    public boolean isTableDisabled(TableName tableName) throws IOException {
      return this.registry.isTableOnlineState(tableName, false);
    }

    @Override
    public boolean isTableDisabled(byte[] tableName) throws IOException {
      return isTableDisabled(TableName.valueOf(tableName));
    }

    @Override
    public boolean isTableAvailable(final TableName tableName) throws IOException {
      final AtomicBoolean available = new AtomicBoolean(true);
      final AtomicInteger regionCount = new AtomicInteger(0);
      MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
        @Override
        public boolean processRow(Result row) throws IOException {
          HRegionInfo info = MetaScanner.getHRegionInfo(row);
          if (info != null && !info.isSplitParent()) {
            if (tableName.equals(info.getTable())) {
              ServerName server = HRegionInfo.getServerName(row);
              if (server == null) {
                available.set(false);
                return false;
              }
              regionCount.incrementAndGet();
            } else if (tableName.compareTo(info.getTable()) < 0) {
              // Return if we are done with the current table
              return false;
            }
          }
          return true;
        }
      };
      MetaScanner.metaScan(conf, this, visitor, tableName);
      return available.get() && (regionCount.get() > 0);
    }

    @Override
    public boolean isTableAvailable(final byte[] tableName) throws IOException {
      return isTableAvailable(TableName.valueOf(tableName));
    }

    @Override
    public boolean isTableAvailable(final TableName tableName, final byte[][] splitKeys)
        throws IOException {
      final AtomicBoolean available = new AtomicBoolean(true);
      final AtomicInteger regionCount = new AtomicInteger(0);
      MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
        @Override
        public boolean processRow(Result row) throws IOException {
          HRegionInfo info = MetaScanner.getHRegionInfo(row);
          if (info != null && !info.isSplitParent()) {
            if (tableName.equals(info.getTable())) {
              ServerName server = HRegionInfo.getServerName(row);
              if (server == null) {
                available.set(false);
                return false;
              }
              if (!Bytes.equals(info.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
                for (byte[] splitKey : splitKeys) {
                  // Just check if the splitkey is available
                  if (Bytes.equals(info.getStartKey(), splitKey)) {
                    regionCount.incrementAndGet();
                    break;
                  }
                }
              } else {
                // Always empty start row should be counted
                regionCount.incrementAndGet();
              }
            } else if (tableName.compareTo(info.getTable()) < 0) {
              // Return if we are done with the current table
              return false;
            }
          }
          return true;
        }
      };
      MetaScanner.metaScan(conf, this, visitor, tableName);
      // +1 needs to be added so that the empty start row is also taken into account
      return available.get() && (regionCount.get() == splitKeys.length + 1);
    }

    @Override
    public boolean isTableAvailable(final byte[] tableName, final byte[][] splitKeys)
        throws IOException {
      return isTableAvailable(TableName.valueOf(tableName), splitKeys);
    }

    @Override
    public HRegionLocation locateRegion(final byte[] regionName) throws IOException {
      return locateRegion(HRegionInfo.getTable(regionName),
          HRegionInfo.getStartKey(regionName), false, true);
    }

    @Override
    public boolean isDeadServer(ServerName sn) {
      if (clusterStatusListener == null) {
        return false;
      } else {
        return clusterStatusListener.isDeadServer(sn);
      }
    }

    @Override
    public List<HRegionLocation> locateRegions(final TableName tableName)
    throws IOException {
      return locateRegions(tableName, false, true);
    }

    @Override
    public List<HRegionLocation> locateRegions(final byte[] tableName)
    throws IOException {
      return locateRegions(TableName.valueOf(tableName));
    }

    @Override
    public List<HRegionLocation> locateRegions(final TableName tableName,
        final boolean useCache, final boolean offlined) throws IOException {
      NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(conf, this,
          tableName, offlined);
      final List<HRegionLocation> locations = new ArrayList<HRegionLocation>();
      for (HRegionInfo regionInfo : regions.keySet()) {
        locations.add(locateRegion(tableName, regionInfo.getStartKey(), useCache, true));
      }
      return locations;
    }

    @Override
    public List<HRegionLocation> locateRegions(final byte[] tableName,
        final boolean useCache, final boolean offlined) throws IOException {
      return locateRegions(TableName.valueOf(tableName), useCache, offlined);
    }

    @Override
    public HRegionLocation locateRegion(final TableName tableName,
        final byte [] row)
    throws IOException {
      return locateRegion(tableName, row, true, true);
    }

    @Override
    public HRegionLocation locateRegion(final byte[] tableName,
        final byte [] row)
    throws IOException {
      return locateRegion(TableName.valueOf(tableName), row);
    }

    @Override
    public HRegionLocation relocateRegion(final TableName tableName,
        final byte [] row) throws IOException {
      // Since this is an explicit request not to use any caching, finding
      // disabled tables is not desirable.  This ensures that an exception is thrown
      // the first time a disabled table is interacted with.
      if (isTableDisabled(tableName)) {
        throw new TableNotEnabledException(tableName.getNameAsString() + " is disabled.");
      }

      return locateRegion(tableName, row, false, true);
    }

    @Override
    public HRegionLocation relocateRegion(final byte[] tableName,
        final byte [] row) throws IOException {
      return relocateRegion(TableName.valueOf(tableName), row);
    }

    private HRegionLocation locateRegion(final TableName tableName,
      final byte [] row, boolean useCache, boolean retry)
    throws IOException {
      if (this.closed) throw new IOException(toString() + " closed");
      if (tableName == null || tableName.getName().length == 0) {
        throw new IllegalArgumentException(
            "table name cannot be null or zero length");
      }

      if (tableName.equals(TableName.META_TABLE_NAME)) {
        return locateMeta(tableName, useCache);
      } else {
        // Region not in the cache - have to go to the meta RS
        return locateRegionInMeta(TableName.META_TABLE_NAME, tableName, row,
          useCache, userRegionLock, retry);
      }
    }

    private HRegionLocation locateMeta(final TableName tableName,
        boolean useCache) throws IOException {
      // HBASE-10785: We cache the location of the META itself, so that we are not overloading
      // zookeeper with one request for every region lookup. We cache the META with empty row
      // key in MetaCache.
      byte[] metaCacheKey = HConstants.EMPTY_START_ROW; // use byte[0] as the row for meta
      HRegionLocation location = null;
      if (useCache) {
        location = getCachedLocation(tableName, metaCacheKey);
        if (location != null) {
          return location;
        }
      }

      // only one thread should do the lookup.
      synchronized (metaRegionLock) {
        // Check the cache again for a hit in case some other thread made the
        // same query while we were waiting on the lock.
        if (useCache) {
          location = getCachedLocation(tableName, metaCacheKey);
          if (location != null) {
            return location;
          }
        }

        // Look up from zookeeper
        location = this.registry.getMetaRegionLocation();
        if (location != null) {
          cacheLocation(tableName, null, location);
        }
      }
      return location;
    }
1217 
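         // The lookup above is a double-checked cache read: consult the cache,
         // take metaRegionLock, consult the cache again, and only then go to
         // ZooKeeper. A minimal sketch of the same pattern (names hypothetical,
         // for illustration only):
         //
         //   HRegionLocation loc = cache.get(key);          // lock-free fast path
         //   if (loc == null) {
         //     synchronized (lock) {
         //       loc = cache.get(key);                      // re-check under the lock
         //       if (loc == null) {
         //         loc = registry.getMetaRegionLocation();  // one ZK round-trip
         //         if (loc != null) cache.put(key, loc);
         //       }
         //     }
         //   }
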
1218     /*
1219      * Search hbase:meta for the HRegionLocation info that contains the table and
1220      * row we're seeking. It will prefetch a certain number of region infos and
1221      * save them to the global region cache.
1222      */
1223     private void prefetchRegionCache(final TableName tableName,
1224         final byte[] row) {
1225       // Implement a new visitor for MetaScanner, and use it to walk through
1226       // the hbase:meta
1227       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
1228         public boolean processRow(Result result) throws IOException {
1229           try {
1230             HRegionInfo regionInfo = MetaScanner.getHRegionInfo(result);
1231             if (regionInfo == null) {
1232               return true;
1233             }
1234 
1235             // possible we got a region of a different table...
1236             if (!regionInfo.getTable().equals(tableName)) {
1237               return false; // stop scanning
1238             }
1239             if (regionInfo.isOffline()) {
1240               // don't cache offline regions
1241               return true;
1242             }
1243 
1244             ServerName serverName = HRegionInfo.getServerName(result);
1245             if (serverName == null) {
1246               return true; // don't cache it
1247             }
1248             // instantiate the location
1249             long seqNum = HRegionInfo.getSeqNumDuringOpen(result);
1250             HRegionLocation loc = new HRegionLocation(regionInfo, serverName, seqNum);
1251             // cache this meta entry
1252             cacheLocation(tableName, null, loc);
1253             return true;
1254           } catch (RuntimeException e) {
1255             throw new IOException(e);
1256           }
1257         }
1258       };
1259       try {
1260         // pre-fetch a certain number of region infos into the region cache.
1261         MetaScanner.metaScan(conf, this, visitor, tableName, row,
1262             this.prefetchRegionLimit, TableName.META_TABLE_NAME);
1263       } catch (IOException e) {
1264         if (ExceptionUtil.isInterrupt(e)) {
1265           Thread.currentThread().interrupt();
1266         }
1267       }
1268     }
1269 
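         // The visitor contract used above: processRow returning true continues
         // the meta scan and returning false stops it. A minimal visitor that
         // just counts a table's regions could look like this (hypothetical,
         // for illustration only):
         //
         //   final AtomicInteger regionCount = new AtomicInteger();
         //   MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
         //     public boolean processRow(Result r) throws IOException {
         //       HRegionInfo hri = MetaScanner.getHRegionInfo(r);
         //       if (hri == null) return true;                        // skip junk rows
         //       if (!hri.getTable().equals(tableName)) return false; // past our table
         //       regionCount.incrementAndGet();
         //       return true;                                         // keep scanning
         //     }
         //   };
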
1270     /*
1271      * Search the hbase:meta table for the HRegionLocation
1272      * info that contains the table and row we're seeking.
1273      */
1274     private HRegionLocation locateRegionInMeta(final TableName parentTable,
1275       final TableName tableName, final byte [] row, boolean useCache,
1276       Object regionLockObject, boolean retry)
1277     throws IOException {
1278       HRegionLocation location;
1279       // If we are supposed to be using the cache, look in the cache to see if
1280       // we already have the region.
1281       if (useCache) {
1282         location = getCachedLocation(tableName, row);
1283         if (location != null) {
1284           return location;
1285         }
1286       }
1287       int localNumRetries = retry ? numTries : 1;
1288       // build the key of the meta region we should be looking for.
1289       // the extra 9's on the end are necessary to allow "exact" matches
1290       // without knowing the precise region names.
1291       byte [] metaKey = HRegionInfo.createRegionName(tableName, row,
1292         HConstants.NINES, false);
1293       for (int tries = 0; true; tries++) {
1294         if (tries >= localNumRetries) {
1295           throw new NoServerForRegionException("Unable to find region for "
1296             + Bytes.toStringBinary(row) + " after " + localNumRetries + " tries.");
1297         }
1298 
1299         HRegionLocation metaLocation = null;
1300         try {
1301           // locate the meta region
1302           metaLocation = locateRegion(parentTable, metaKey, true, false);
1303           // If null still, go around again.
1304           if (metaLocation == null) continue;
1305           ClientService.BlockingInterface service = getClient(metaLocation.getServerName());
1306 
1307           Result regionInfoRow;
1308           // This block guards against two threads trying to prefetch the region
1309           // locations of the same table at the same time. The first will do the
1310           // prefetch and the second will use the value that the first one cached.
1311           if (useCache) {
1312             if (TableName.META_TABLE_NAME.equals(parentTable) && usePrefetch &&
1313                 getRegionCachePrefetch(tableName)) {
1314               synchronized (regionLockObject) {
1315                 // Check the cache again for a hit in case some other thread made the
1316                 // same query while we were waiting on the lock.
1317                 location = getCachedLocation(tableName, row);
1318                 if (location != null) {
1319                   return location;
1320                 }
1321                 // If the parent table is META, we may want to pre-fetch some
1322                 // region info into the global region cache for this table.
1323                 prefetchRegionCache(tableName, row);
1324               }
1325             }
1326             location = getCachedLocation(tableName, row);
1327             if (location != null) {
1328               return location;
1329             }
1330           } else {
1331             // If we are not supposed to be using the cache, delete any existing cached location
1332             // so it won't interfere.
1333             forceDeleteCachedLocation(tableName, row);
1334           }
1335 
1336           // Query the parent (meta) region for the location of the region containing our row
1337           regionInfoRow =
1338               ProtobufUtil.getRowOrBefore(service, metaLocation.getRegionInfo().getRegionName(),
1339                 metaKey, HConstants.CATALOG_FAMILY);
1340 
1341           if (regionInfoRow == null) {
1342             throw new TableNotFoundException(tableName);
1343           }
1344 
1345           // convert the row result into the HRegionLocation we need!
1346           HRegionInfo regionInfo = MetaScanner.getHRegionInfo(regionInfoRow);
1347           if (regionInfo == null) {
1348             throw new IOException("HRegionInfo was null or empty in " +
1349               parentTable + ", row=" + regionInfoRow);
1350           }
1351 
1352           // possible we got a region of a different table...
1353           if (!regionInfo.getTable().equals(tableName)) {
1354             throw new TableNotFoundException(
1355                   "Table '" + tableName + "' was not found, got: " +
1356                   regionInfo.getTable() + ".");
1357           }
1358           if (regionInfo.isSplit()) {
1359             throw new RegionOfflineException("the only available region for" +
1360               " the required row is a split parent," +
1361               " the daughters should be online soon: " +
1362               regionInfo.getRegionNameAsString());
1363           }
1364           if (regionInfo.isOffline()) {
1365             throw new RegionOfflineException("the region is offline, could" +
1366               " be caused by a disable table call: " +
1367               regionInfo.getRegionNameAsString());
1368           }
1369 
1370           ServerName serverName = HRegionInfo.getServerName(regionInfoRow);
1371           if (serverName == null) {
1372             throw new NoServerForRegionException("No server address listed " +
1373               "in " + parentTable + " for region " +
1374               regionInfo.getRegionNameAsString() + " containing row " +
1375               Bytes.toStringBinary(row));
1376           }
1377 
1378           if (isDeadServer(serverName)){
1379             throw new RegionServerStoppedException("hbase:meta says the region "+
1380                 regionInfo.getRegionNameAsString()+" is managed by the server " + serverName +
1381                 ", but it is dead.");
1382           }
1383 
1384           // Instantiate the location
1385           location = new HRegionLocation(regionInfo, serverName,
1386             HRegionInfo.getSeqNumDuringOpen(regionInfoRow));
1387           cacheLocation(tableName, null, location);
1388           return location;
1389         } catch (TableNotFoundException e) {
1390           // if we got this error, probably means the table just plain doesn't
1391           // exist. rethrow the error immediately. this should always be coming
1392           // from the HTable constructor.
1393           throw e;
1394         } catch (IOException e) {
1395           ExceptionUtil.rethrowIfInterrupt(e);
1396 
1397           if (e instanceof RemoteException) {
1398             e = ((RemoteException)e).unwrapRemoteException();
1399           }
1400           if (tries < numTries - 1) {
1401             if (LOG.isDebugEnabled()) {
1402               LOG.debug("locateRegionInMeta parentTable=" +
1403                 parentTable + ", metaLocation=" +
1404                 ((metaLocation == null)? "null": "{" + metaLocation + "}") +
1405                 ", attempt=" + tries + " of " +
1406                 this.numTries + " failed; retrying after sleep of " +
1407                 ConnectionUtils.getPauseTime(this.pause, tries) + " because: " + e.getMessage());
1408             }
1409           } else {
1410             throw e;
1411           }
1412           // Only relocate the parent region if necessary
1413           if(!(e instanceof RegionOfflineException ||
1414               e instanceof NoServerForRegionException)) {
1415             relocateRegion(parentTable, metaKey);
1416           }
1417         }
1418         try{
1419           Thread.sleep(ConnectionUtils.getPauseTime(this.pause, tries));
1420         } catch (InterruptedException e) {
1421             throw new InterruptedIOException("Giving up trying to locate region in " +
1422             "meta: thread is interrupted.");
1423         }
1424       }
1425     }
1426 
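         // Retries above back off via ConnectionUtils.getPauseTime(pause, tries),
         // which scales the configured base pause by the ramp in
         // HConstants.RETRY_BACKOFF ({1, 2, 3, 5, 10, 20, 40, 100, ...}). With the
         // default 100 ms pause, successive sleeps are therefore roughly:
         //
         //   tries:    0    1    2    3     4     5     6      7
         //   sleep:  100  200  300  500  1000  2000  4000  10000  (ms)
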
1427     /*
1428      * Search the cache for a location that fits our table and row key.
1429      * Return null if no suitable region is located.
1430      *
1431      * @param tableName table whose cached locations we consult
1432      * @param row row key we are trying to locate
1433      * @return Null or region location found in cache.
1434      */
1435     HRegionLocation getCachedLocation(final TableName tableName,
1436         final byte [] row) {
1437       ConcurrentSkipListMap<byte[], HRegionLocation> tableLocations =
1438         getTableLocations(tableName);
1439 
1440       Entry<byte[], HRegionLocation> e = tableLocations.floorEntry(row);
1441       if (e == null) {
1442         if (metrics != null) metrics.incrMetaCacheMiss();
1443         return null;
1444       }
1445       HRegionLocation possibleRegion = e.getValue();
1446 
1447       // make sure that the end key is greater than the row we're looking
1448       // for, otherwise the row actually belongs in the next region, not
1449       // this one. the exception case is when the endkey is
1450       // HConstants.EMPTY_END_ROW, signifying that the region we're
1451       // checking is actually the last region in the table.
1452       byte[] endKey = possibleRegion.getRegionInfo().getEndKey();
1453       if (Bytes.equals(endKey, HConstants.EMPTY_END_ROW) ||
1454           tableName.getRowComparator().compareRows(
1455               endKey, 0, endKey.length, row, 0, row.length) > 0) {
1456         if (metrics != null) metrics.incrMetaCacheHit();
1457         return possibleRegion;
1458       }
1459 
1460       // Passed all the way through, so we got nothing - complete cache miss
1461       if (metrics != null) metrics.incrMetaCacheMiss();
1462       return null;
1463     }
1464 
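         // The lookup relies on regions covering half-open key ranges
         // [startKey, endKey): floorEntry(row) returns the entry with the greatest
         // startKey <= row, and the endKey check above rejects it when the row in
         // fact belongs to a later region. Sketch (hypothetical values):
         //
         //   // cached regions: [""-"b"), ["b"-"m"), ["m"-"")
         //   Entry<byte[], HRegionLocation> e = tableLocations.floorEntry(Bytes.toBytes("c"));
         //   // e is the ["b"-"m") region; its end key "m" sorts after "c", so it's a hit
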
1465     /**
1466      * Delete a cached location, no matter what it is. Called when we were told not to use the cache.
1467      * @param tableName table whose cache entry should be removed
1468      * @param row row key whose containing region we evict from the cache
1469      */
1470     void forceDeleteCachedLocation(final TableName tableName, final byte [] row) {
1471       HRegionLocation rl = null;
1472       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1473       // start to examine the cache. we can only do cache actions
1474       // if there's something in the cache for this table.
1475       rl = getCachedLocation(tableName, row);
1476       if (rl != null) {
1477         tableLocations.remove(rl.getRegionInfo().getStartKey());
1478       }
1479       if ((rl != null) && LOG.isDebugEnabled()) {
1480         LOG.debug("Removed " + rl.getHostname() + ":" + rl.getPort()
1481           + " as a location of " + rl.getRegionInfo().getRegionNameAsString() +
1482           " for tableName=" + tableName + " from cache");
1483       }
1484     }
1485 
1486     /*
1487      * Delete all cached entries of a table that maps to a specific location.
1488      */
1489     @Override
1490     public void clearCaches(final ServerName serverName) {
1491       if (!this.cachedServers.contains(serverName)) {
1492         return;
1493       }
1494 
1495       boolean deletedSomething = false;
1496       synchronized (this.cachedServers) {
1497         // We block here, because if there is an error on a server, it's likely that multiple
1498         //  threads will get the error simultaneously. If there are hundreds of thousands of
1499         //  region locations to check, it's better to do this only once. A better pattern would
1500         //  be to check if the server is dead when we get the region location.
1501         if (!this.cachedServers.contains(serverName)) {
1502           return;
1503         }
1504         for (Map<byte[], HRegionLocation> tableLocations : cachedRegionLocations.values()) {
1505           for (Entry<byte[], HRegionLocation> e : tableLocations.entrySet()) {
1506             HRegionLocation value = e.getValue();
1507             if (value != null
1508                 && serverName.equals(value.getServerName())) {
1509               tableLocations.remove(e.getKey());
1510               deletedSomething = true;
1511             }
1512           }
1513         }
1514         this.cachedServers.remove(serverName);
1515       }
1516       if (deletedSomething && LOG.isDebugEnabled()) {
1517         LOG.debug("Removed all cached region locations that map to " + serverName);
1518       }
1519     }
1520 
1521     /*
1522      * @param tableName table whose location cache we want
1523      * @return Map of cached locations for passed <code>tableName</code>
1524      */
1525     private ConcurrentSkipListMap<byte[], HRegionLocation> getTableLocations(
1526         final TableName tableName) {
1527       // find the map of cached locations for this table
1528       ConcurrentSkipListMap<byte[], HRegionLocation> result;
1529       result = this.cachedRegionLocations.get(tableName);
1530       // if tableLocations for this table isn't built yet, make one
1531       if (result == null) {
1532         result = new ConcurrentSkipListMap<byte[], HRegionLocation>(Bytes.BYTES_COMPARATOR);
1533         ConcurrentSkipListMap<byte[], HRegionLocation> old =
1534             this.cachedRegionLocations.putIfAbsent(tableName, result);
1535         if (old != null) {
1536           return old;
1537         }
1538       }
1539       return result;
1540     }
1541 
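         // This is the usual putIfAbsent idiom for race-free lazy initialization:
         // whichever thread wins the putIfAbsent publishes its map, and a loser
         // throws its own copy away and adopts the winner's. In general form
         // (hypothetical names):
         //
         //   V created = newValue();
         //   V existing = map.putIfAbsent(key, created);  // atomic
         //   return (existing != null) ? existing : created;
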
1542     @Override
1543     public void clearRegionCache() {
1544       this.cachedRegionLocations.clear();
1545       this.cachedServers.clear();
1546     }
1547 
1548     @Override
1549     public void clearRegionCache(final TableName tableName) {
1550       this.cachedRegionLocations.remove(tableName);
1551     }
1552 
1553     @Override
1554     public void clearRegionCache(final byte[] tableName) {
1555       clearRegionCache(TableName.valueOf(tableName));
1556     }
1557 
1558     /**
1559      * Put a newly discovered HRegionLocation into the cache.
1560      * @param tableName The table name.
1561      * @param source the source of the new location, if it's not coming from meta
1562      * @param location the new location
1563      */
1564     private void cacheLocation(final TableName tableName, final HRegionLocation source,
1565         final HRegionLocation location) {
1566       boolean isFromMeta = (source == null);
1567       byte [] startKey = location.getRegionInfo().getStartKey();
1568       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1569       HRegionLocation oldLocation = tableLocations.putIfAbsent(startKey, location);
1570       boolean isNewCacheEntry = (oldLocation == null);
1571       if (isNewCacheEntry) {
1572         cachedServers.add(location.getServerName());
1573         return;
1574       }
1575       boolean updateCache;
1576       // If the server in cache sends us a redirect, assume it's always valid.
1577       if (oldLocation.equals(source)) {
1578         updateCache = true;
1579       } else {
1580         long newLocationSeqNum = location.getSeqNum();
1581         // Meta record is stale - some (probably the same) server has closed the region
1582         // with later seqNum and told us about the new location.
1583         boolean isStaleMetaRecord = isFromMeta && (oldLocation.getSeqNum() > newLocationSeqNum);
1584         // Same as above for redirect. However, in this case, if the number is equal to previous
1585         // record, the most common case is that first the region was closed with seqNum, and then
1586         // opened with the same seqNum; hence we will ignore the redirect.
1587         // There are so many corner cases with various combinations of opens and closes that
1588         // an additional counter on top of seqNum would be necessary to handle them all.
1589         boolean isStaleRedirect = !isFromMeta && (oldLocation.getSeqNum() >= newLocationSeqNum);
1590         boolean isStaleUpdate = (isStaleMetaRecord || isStaleRedirect);
1591         updateCache = (!isStaleUpdate);
1592       }
1593       if (updateCache) {
1594         tableLocations.replace(startKey, oldLocation, location);
1595         cachedServers.add(location.getServerName());
1596       }
1597     }
1598 
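         // The seqNum rules in cacheLocation above, in table form
         // (old = cached entry, new = incoming update):
         //
         //   update comes from    condition                action
         //   -------------------  -----------------------  --------------------------
         //   the cached server    oldLocation == source    replace (trusted redirect)
         //   hbase:meta           oldSeqNum >  newSeqNum   drop (stale meta record)
         //   another server       oldSeqNum >= newSeqNum   drop (stale redirect)
         //   anything else        -                        replace
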
1599     // Map keyed by service name + regionserver to service stub implementation
1600     private final ConcurrentHashMap<String, Object> stubs =
1601       new ConcurrentHashMap<String, Object>();
1602     // Map of locks used when creating service stubs per regionserver.
1603     private final ConcurrentHashMap<String, String> connectionLock =
1604       new ConcurrentHashMap<String, String>();
1605 
1606     /**
1607      * State of the MasterService connection/setup.
1608      */
1609     static class MasterServiceState {
1610       HConnection connection;
1611       MasterService.BlockingInterface stub;
1612       int userCount;
1613       long keepAliveUntil = Long.MAX_VALUE;
1614 
1615       MasterServiceState (final HConnection connection) {
1616         super();
1617         this.connection = connection;
1618       }
1619 
1620       @Override
1621       public String toString() {
1622         return "MasterService";
1623       }
1624 
1625       Object getStub() {
1626         return this.stub;
1627       }
1628 
1629       void clearStub() {
1630         this.stub = null;
1631       }
1632 
1633       boolean isMasterRunning() throws ServiceException {
1634         IsMasterRunningResponse response =
1635           this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1636         return response != null? response.getIsMasterRunning(): false;
1637       }
1638     }
1639 
1640     /**
1641      * Makes a client-side stub for master services. Sub-class to specialize.
1642      * Depends on hosting class so not static.  Exists so we avoid duplicating a bunch of code
1643      * when setting up the MasterMonitorService and MasterAdminService.
1644      */
1645     abstract class StubMaker {
1646       /**
1647        * Returns the name of the service stub being created.
1648        */
1649       protected abstract String getServiceName();
1650 
1651       /**
1652        * Make the stub and cache it internally so it can be used later for the isMasterRunning call.
1653        * @param channel rpc channel the stub should use
1654        */
1655       protected abstract Object makeStub(final BlockingRpcChannel channel);
1656 
1657       /**
1658        * Once setup, check it works by doing isMasterRunning check.
1659        * @throws ServiceException
1660        */
1661       protected abstract void isMasterRunning() throws ServiceException;
1662 
1663       /**
1664        * Create a stub. Try once only.  It is not typed because there is no common type across
1665        * protobuf services or their interfaces.  Let the caller do appropriate casting.
1666        * @return A stub for master services.
1667        * @throws IOException
1668        * @throws KeeperException
1669        * @throws ServiceException
1670        */
1671       private Object makeStubNoRetries() throws IOException, KeeperException, ServiceException {
1672         ZooKeeperKeepAliveConnection zkw;
1673         try {
1674           zkw = getKeepAliveZooKeeperWatcher();
1675         } catch (IOException e) {
1676           ExceptionUtil.rethrowIfInterrupt(e);
1677           throw new ZooKeeperConnectionException("Can't connect to ZooKeeper", e);
1678         }
1679         try {
1680           checkIfBaseNodeAvailable(zkw);
1681           ServerName sn = MasterAddressTracker.getMasterAddress(zkw);
1682           if (sn == null) {
1683             String msg = "ZooKeeper available but no active master location found";
1684             LOG.info(msg);
1685             throw new MasterNotRunningException(msg);
1686           }
1687           if (isDeadServer(sn)) {
1688             throw new MasterNotRunningException(sn + " is dead.");
1689           }
1690           // Use the security info interface name as our stub key
1691           String key = getStubKey(getServiceName(), sn.getHostAndPort());
1692           connectionLock.putIfAbsent(key, key);
1693           Object stub = null;
1694           synchronized (connectionLock.get(key)) {
1695             stub = stubs.get(key);
1696             if (stub == null) {
1697               BlockingRpcChannel channel = rpcClient.createBlockingRpcChannel(sn,
1698                 user, rpcTimeout);
1699               stub = makeStub(channel);
1700               isMasterRunning();
1701               stubs.put(key, stub);
1702             }
1703           }
1704           return stub;
1705         } finally {
1706           zkw.close();
1707         }
1708       }
1709 
1710       /**
1711        * Create a stub against the master.  Retry if necessary.
1712        * @return A stub to do <code>intf</code> against the master
1713        * @throws MasterNotRunningException
1714        */
1715       @edu.umd.cs.findbugs.annotations.SuppressWarnings (value="SWL_SLEEP_WITH_LOCK_HELD")
1716       Object makeStub() throws MasterNotRunningException {
1717         // The lock must be at the beginning to prevent multiple master creations
1718         //  (and leaks) in a multithread context
1719         synchronized (masterAndZKLock) {
1720           Exception exceptionCaught = null;
1721           Object stub = null;
1722           int tries = 0;
1723           while (!closed && stub == null) {
1724             tries++;
                 // Reset so a failure from a previous attempt does not linger and
                 // trigger a spurious log and sleep after a later success.
                 exceptionCaught = null;
1725             try {
1726               stub = makeStubNoRetries();
1727             } catch (IOException e) {
1728               exceptionCaught = e;
1729             } catch (KeeperException e) {
1730               exceptionCaught = e;
1731             } catch (ServiceException e) {
1732               exceptionCaught = e;
1733             }
1734 
1735             if (exceptionCaught != null) {
1736               // It failed. If it's not the last try, we're going to wait a little
1737               if (tries < numTries && !ExceptionUtil.isInterrupt(exceptionCaught)) {
1738                 // tries at this point is 1 or more; decrement to start from 0.
1739                 long pauseTime = ConnectionUtils.getPauseTime(pause, tries - 1);
1740                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1741                     " failed; retrying after sleep of " + pauseTime + ", exception=" +
1742                   exceptionCaught);
1743 
1744                 try {
1745                   Thread.sleep(pauseTime);
1746                 } catch (InterruptedException e) {
1747                   throw new MasterNotRunningException(
1748                       "Thread was interrupted while trying to connect to master.", e);
1749                 }
1750               } else {
1751                 // Enough tries, we stop now
1752                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1753                     " failed; no more retrying.", exceptionCaught);
1754                 throw new MasterNotRunningException(exceptionCaught);
1755               }
                 }
1756           }
1757 
1758           if (stub == null) {
1759             // implies this.closed true
1760             throw new MasterNotRunningException("Connection was closed while trying to get master");
1761           }
1762           return stub;
1763         }
1764       }
1765     }
1766 
1767     /**
1768      * Class to make a MasterServiceStubMaker stub.
1769      */
1770     class MasterServiceStubMaker extends StubMaker {
1771       private MasterService.BlockingInterface stub;
1772       @Override
1773       protected String getServiceName() {
1774         return MasterService.getDescriptor().getName();
1775       }
1776 
1777       @Override
1778       @edu.umd.cs.findbugs.annotations.SuppressWarnings("SWL_SLEEP_WITH_LOCK_HELD")
1779       MasterService.BlockingInterface makeStub() throws MasterNotRunningException {
1780         return (MasterService.BlockingInterface)super.makeStub();
1781       }
1782 
1783       @Override
1784       protected Object makeStub(BlockingRpcChannel channel) {
1785         this.stub = MasterService.newBlockingStub(channel);
1786         return this.stub;
1787       }
1788 
1789       @Override
1790       protected void isMasterRunning() throws ServiceException {
1791         this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1792       }
1793     }
1794 
1795     @Override
1796     public AdminService.BlockingInterface getAdmin(final ServerName serverName)
1797         throws IOException {
1798       return getAdmin(serverName, false);
1799     }
1800 
1801     @Override
1802     // Nothing is done w/ the 'master' parameter.  It is ignored.
1803     public AdminService.BlockingInterface getAdmin(final ServerName serverName,
1804       final boolean master)
1805     throws IOException {
1806       if (isDeadServer(serverName)) {
1807         throw new RegionServerStoppedException(serverName + " is dead.");
1808       }
1809       String key = getStubKey(AdminService.BlockingInterface.class.getName(),
1810         serverName.getHostAndPort());
1811       this.connectionLock.putIfAbsent(key, key);
1812       AdminService.BlockingInterface stub = null;
1813       synchronized (this.connectionLock.get(key)) {
1814         stub = (AdminService.BlockingInterface)this.stubs.get(key);
1815         if (stub == null) {
1816           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(serverName,
1817             user, this.rpcTimeout);
1818           stub = AdminService.newBlockingStub(channel);
1819           this.stubs.put(key, stub);
1820         }
1821       }
1822       return stub;
1823     }
1824 
1825     @Override
1826     public ClientService.BlockingInterface getClient(final ServerName sn)
1827     throws IOException {
1828       if (isDeadServer(sn)) {
1829         throw new RegionServerStoppedException(sn + " is dead.");
1830       }
1831       String key = getStubKey(ClientService.BlockingInterface.class.getName(), sn.getHostAndPort());
1832       this.connectionLock.putIfAbsent(key, key);
1833       ClientService.BlockingInterface stub = null;
1834       synchronized (this.connectionLock.get(key)) {
1835         stub = (ClientService.BlockingInterface)this.stubs.get(key);
1836         if (stub == null) {
1837           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(sn,
1838             user, this.rpcTimeout);
1839           stub = ClientService.newBlockingStub(channel);
1840           // In old days, after getting stub/proxy, we'd make a call.  We are not doing that here.
1841           // Just fail on first actual call rather than in here on setup.
1842           this.stubs.put(key, stub);
1843         }
1844       }
1845       return stub;
1846     }
1847 
1848     static String getStubKey(final String serviceName, final String rsHostnamePort) {
1849       return serviceName + "@" + rsHostnamePort;
1850     }
1851 
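         // A stub key therefore looks something like the following (the exact class
         // name depends on the generated protobuf interface; example only):
         //
         //   "org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$BlockingInterface@rs1.example.com:60020"
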
1852     private ZooKeeperKeepAliveConnection keepAliveZookeeper;
1853     private AtomicInteger keepAliveZookeeperUserCount = new AtomicInteger(0);
1854     private boolean canCloseZKW = true;
1855 
1856     // keepAlive time, in ms. No reason to make it configurable.
1857     private static final long keepAlive = 5 * 60 * 1000;
1858 
1859     /**
1860      * Retrieve a shared ZooKeeperWatcher. You must close it once you have finished with it.
1861      * @return The shared instance. Never returns null.
1862      */
1863     ZooKeeperKeepAliveConnection getKeepAliveZooKeeperWatcher()
1864       throws IOException {
1865       synchronized (masterAndZKLock) {
1866         if (keepAliveZookeeper == null) {
1867           if (this.closed) {
1868             throw new IOException(toString() + " closed");
1869           }
1870           // We don't check that our link to ZooKeeper is still valid
1871           // But there is a retry mechanism in the ZooKeeperWatcher itself
1872           keepAliveZookeeper = new ZooKeeperKeepAliveConnection(conf, this.toString(), this);
1873         }
1874         keepAliveZookeeperUserCount.incrementAndGet();
1875         keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1876         return keepAliveZookeeper;
1877       }
1878     }
1879 
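         // Callers pair every acquisition with close(), typically in try/finally, so
         // the user count is decremented and DelayedClosing can eventually reclaim
         // the watcher; makeStubNoRetries() above follows exactly this pattern:
         //
         //   ZooKeeperKeepAliveConnection zkw = getKeepAliveZooKeeperWatcher();
         //   try {
         //     // ... use zkw ...
         //   } finally {
         //     zkw.close(); // releases the shared watcher; it is really closed later
         //   }
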
1880     void releaseZooKeeperWatcher(final ZooKeeperWatcher zkw) {
1881       if (zkw == null){
1882         return;
1883       }
1884       synchronized (masterAndZKLock) {
1885         if (keepAliveZookeeperUserCount.decrementAndGet() <= 0 ){
1886           keepZooKeeperWatcherAliveUntil = System.currentTimeMillis() + keepAlive;
1887         }
1888       }
1889     }
1890 
1891     /**
1892      * Creates a Chore thread to check the connections to master and zookeeper
1893      *  and close them when they reach their closing time (
1894      *  {@link MasterServiceState#keepAliveUntil} and
1895      *  {@link #keepZooKeeperWatcherAliveUntil}). Keep alive time is
1896      *  managed by the release functions and the variable {@link #keepAlive}
1897      */
1898     private static class DelayedClosing extends Chore implements Stoppable {
1899       private HConnectionImplementation hci;
1900       Stoppable stoppable;
1901 
1902       private DelayedClosing(
1903         HConnectionImplementation hci, Stoppable stoppable){
1904         super(
1905           "ZooKeeperWatcher and Master delayed closing for connection "+hci,
1906           60*1000, // We check every minute
1907           stoppable);
1908         this.hci = hci;
1909         this.stoppable = stoppable;
1910       }
1911 
1912       static DelayedClosing createAndStart(HConnectionImplementation hci){
1913         Stoppable stoppable = new Stoppable() {
1914               private volatile boolean isStopped = false;
1915               @Override public void stop(String why) { isStopped = true;}
1916               @Override public boolean isStopped() {return isStopped;}
1917             };
1918 
1919         return new DelayedClosing(hci, stoppable);
1920       }
1921 
1922       protected void closeMasterProtocol(MasterServiceState protocolState) {
1923         if (System.currentTimeMillis() > protocolState.keepAliveUntil) {
1924           hci.closeMasterService(protocolState);
1925           protocolState.keepAliveUntil = Long.MAX_VALUE;
1926         }
1927       }
1928 
1929       @Override
1930       protected void chore() {
1931         synchronized (hci.masterAndZKLock) {
1932           if (hci.canCloseZKW) {
1933             if (System.currentTimeMillis() >
1934               hci.keepZooKeeperWatcherAliveUntil) {
1935 
1936               hci.closeZooKeeperWatcher();
1937               hci.keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1938             }
1939           }
1940           closeMasterProtocol(hci.masterServiceState);
1942         }
1943       }
1944 
1945       @Override
1946       public void stop(String why) {
1947         stoppable.stop(why);
1948       }
1949 
1950       @Override
1951       public boolean isStopped() {
1952         return stoppable.isStopped();
1953       }
1954     }
1955 
1956     private void closeZooKeeperWatcher() {
1957       synchronized (masterAndZKLock) {
1958         if (keepAliveZookeeper != null) {
1959           LOG.info("Closing zookeeper sessionid=0x" +
1960             Long.toHexString(
1961               keepAliveZookeeper.getRecoverableZooKeeper().getSessionId()));
1962           keepAliveZookeeper.internalClose();
1963           keepAliveZookeeper = null;
1964         }
1965         keepAliveZookeeperUserCount.set(0);
1966       }
1967     }
1968 
1969     final MasterServiceState masterServiceState = new MasterServiceState(this);
1970 
1971     @Override
1972     public MasterService.BlockingInterface getMaster() throws MasterNotRunningException {
1973       return getKeepAliveMasterService();
1974     }
1975 
1976     private void resetMasterServiceState(final MasterServiceState mss) {
1977       mss.userCount++;
1978       mss.keepAliveUntil = Long.MAX_VALUE;
1979     }
1980 
1981     @Override
1982     public MasterKeepAliveConnection getKeepAliveMasterService()
1983     throws MasterNotRunningException {
1984       synchronized (masterAndZKLock) {
1985         if (!isKeepAliveMasterConnectedAndRunning(this.masterServiceState)) {
1986           MasterServiceStubMaker stubMaker = new MasterServiceStubMaker();
1987           this.masterServiceState.stub = stubMaker.makeStub();
1988         }
1989         resetMasterServiceState(this.masterServiceState);
1990       }
1991       // Ugly delegation just so we can add in a Close method.
1992       final MasterService.BlockingInterface stub = this.masterServiceState.stub;
1993       return new MasterKeepAliveConnection() {
1994         MasterServiceState mss = masterServiceState;
1995         @Override
1996         public AddColumnResponse addColumn(RpcController controller, AddColumnRequest request)
1997         throws ServiceException {
1998           return stub.addColumn(controller, request);
1999         }
2000 
2001         @Override
2002         public DeleteColumnResponse deleteColumn(RpcController controller,
2003             DeleteColumnRequest request)
2004         throws ServiceException {
2005           return stub.deleteColumn(controller, request);
2006         }
2007 
2008         @Override
2009         public ModifyColumnResponse modifyColumn(RpcController controller,
2010             ModifyColumnRequest request)
2011         throws ServiceException {
2012           return stub.modifyColumn(controller, request);
2013         }
2014 
2015         @Override
2016         public MoveRegionResponse moveRegion(RpcController controller,
2017             MoveRegionRequest request) throws ServiceException {
2018           return stub.moveRegion(controller, request);
2019         }
2020 
2021         @Override
2022         public DispatchMergingRegionsResponse dispatchMergingRegions(
2023             RpcController controller, DispatchMergingRegionsRequest request)
2024             throws ServiceException {
2025           return stub.dispatchMergingRegions(controller, request);
2026         }
2027 
2028         @Override
2029         public AssignRegionResponse assignRegion(RpcController controller,
2030             AssignRegionRequest request) throws ServiceException {
2031           return stub.assignRegion(controller, request);
2032         }
2033 
2034         @Override
2035         public UnassignRegionResponse unassignRegion(RpcController controller,
2036             UnassignRegionRequest request) throws ServiceException {
2037           return stub.unassignRegion(controller, request);
2038         }
2039 
2040         @Override
2041         public OfflineRegionResponse offlineRegion(RpcController controller,
2042             OfflineRegionRequest request) throws ServiceException {
2043           return stub.offlineRegion(controller, request);
2044         }
2045 
2046         @Override
2047         public DeleteTableResponse deleteTable(RpcController controller,
2048             DeleteTableRequest request) throws ServiceException {
2049           return stub.deleteTable(controller, request);
2050         }
2051 
2052         @Override
2053         public EnableTableResponse enableTable(RpcController controller,
2054             EnableTableRequest request) throws ServiceException {
2055           return stub.enableTable(controller, request);
2056         }
2057 
2058         @Override
2059         public DisableTableResponse disableTable(RpcController controller,
2060             DisableTableRequest request) throws ServiceException {
2061           return stub.disableTable(controller, request);
2062         }
2063 
2064         @Override
2065         public ModifyTableResponse modifyTable(RpcController controller,
2066             ModifyTableRequest request) throws ServiceException {
2067           return stub.modifyTable(controller, request);
2068         }
2069 
2070         @Override
2071         public CreateTableResponse createTable(RpcController controller,
2072             CreateTableRequest request) throws ServiceException {
2073           return stub.createTable(controller, request);
2074         }
2075 
2076         @Override
2077         public ShutdownResponse shutdown(RpcController controller,
2078             ShutdownRequest request) throws ServiceException {
2079           return stub.shutdown(controller, request);
2080         }
2081 
2082         @Override
2083         public StopMasterResponse stopMaster(RpcController controller,
2084             StopMasterRequest request) throws ServiceException {
2085           return stub.stopMaster(controller, request);
2086         }
2087 
2088         @Override
2089         public BalanceResponse balance(RpcController controller,
2090             BalanceRequest request) throws ServiceException {
2091           return stub.balance(controller, request);
2092         }
2093 
2094         @Override
2095         public SetBalancerRunningResponse setBalancerRunning(
2096             RpcController controller, SetBalancerRunningRequest request)
2097             throws ServiceException {
2098           return stub.setBalancerRunning(controller, request);
2099         }
2100 
2101         @Override
2102         public IsBalancerEnabledResponse isBalancerEnabled(RpcController controller,
2103             IsBalancerEnabledRequest request) throws ServiceException {
2104           return stub.isBalancerEnabled(controller, request);
2105         }
2106 
2107         @Override
2108         public RunCatalogScanResponse runCatalogScan(RpcController controller,
2109             RunCatalogScanRequest request) throws ServiceException {
2110           return stub.runCatalogScan(controller, request);
2111         }
2112 
2113         @Override
2114         public EnableCatalogJanitorResponse enableCatalogJanitor(
2115             RpcController controller, EnableCatalogJanitorRequest request)
2116             throws ServiceException {
2117           return stub.enableCatalogJanitor(controller, request);
2118         }
2119 
2120         @Override
2121         public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(
2122             RpcController controller, IsCatalogJanitorEnabledRequest request)
2123             throws ServiceException {
2124           return stub.isCatalogJanitorEnabled(controller, request);
2125         }
2126 
2127         @Override
2128         public CoprocessorServiceResponse execMasterService(
2129             RpcController controller, CoprocessorServiceRequest request)
2130             throws ServiceException {
2131           return stub.execMasterService(controller, request);
2132         }
2133 
2134         @Override
2135         public SnapshotResponse snapshot(RpcController controller,
2136             SnapshotRequest request) throws ServiceException {
2137           return stub.snapshot(controller, request);
2138         }
2139 
2140         @Override
2141         public GetCompletedSnapshotsResponse getCompletedSnapshots(
2142             RpcController controller, GetCompletedSnapshotsRequest request)
2143             throws ServiceException {
2144           return stub.getCompletedSnapshots(controller, request);
2145         }
2146 
2147         @Override
2148         public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2149             DeleteSnapshotRequest request) throws ServiceException {
2150           return stub.deleteSnapshot(controller, request);
2151         }
2152 
2153         @Override
2154         public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2155             IsSnapshotDoneRequest request) throws ServiceException {
2156           return stub.isSnapshotDone(controller, request);
2157         }
2158 
2159         @Override
2160         public RestoreSnapshotResponse restoreSnapshot(
2161             RpcController controller, RestoreSnapshotRequest request)
2162             throws ServiceException {
2163           return stub.restoreSnapshot(controller, request);
2164         }
2165 
2166         @Override
2167         public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(
2168             RpcController controller, IsRestoreSnapshotDoneRequest request)
2169             throws ServiceException {
2170           return stub.isRestoreSnapshotDone(controller, request);
2171         }
2172 
2173         @Override
2174         public ExecProcedureResponse execProcedure(
2175             RpcController controller, ExecProcedureRequest request)
2176             throws ServiceException {
2177           return stub.execProcedure(controller, request);
2178         }
2179 
2180         @Override
2181         public IsProcedureDoneResponse isProcedureDone(RpcController controller,
2182             IsProcedureDoneRequest request) throws ServiceException {
2183           return stub.isProcedureDone(controller, request);
2184         }
2185 
2186         @Override
2187         public IsMasterRunningResponse isMasterRunning(
2188             RpcController controller, IsMasterRunningRequest request)
2189             throws ServiceException {
2190           return stub.isMasterRunning(controller, request);
2191         }
2192 
2193         @Override
2194         public ModifyNamespaceResponse modifyNamespace(RpcController controller,
2195             ModifyNamespaceRequest request)
2196         throws ServiceException {
2197           return stub.modifyNamespace(controller, request);
2198         }
2199 
2200         @Override
2201         public CreateNamespaceResponse createNamespace(
                 RpcController controller, CreateNamespaceRequest request)
                 throws ServiceException {
2202           return stub.createNamespace(controller, request);
2203         }
2204 
2205         @Override
2206         public DeleteNamespaceResponse deleteNamespace(
                 RpcController controller, DeleteNamespaceRequest request)
                 throws ServiceException {
2207           return stub.deleteNamespace(controller, request);
2208         }
2209 
2210         @Override
2211         public GetNamespaceDescriptorResponse getNamespaceDescriptor(
                 RpcController controller, GetNamespaceDescriptorRequest request)
                 throws ServiceException {
2212           return stub.getNamespaceDescriptor(controller, request);
2213         }
2214 
2215         @Override
2216         public ListNamespaceDescriptorsResponse listNamespaceDescriptors(
                 RpcController controller, ListNamespaceDescriptorsRequest request)
                 throws ServiceException {
2217           return stub.listNamespaceDescriptors(controller, request);
2218         }
2219 
2220         @Override
2221         public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(
                 RpcController controller, ListTableDescriptorsByNamespaceRequest request)
                 throws ServiceException {
2222           return stub.listTableDescriptorsByNamespace(controller, request);
2223         }
2224 
2225         @Override
2226         public ListTableNamesByNamespaceResponse listTableNamesByNamespace(RpcController controller,
2227               ListTableNamesByNamespaceRequest request) throws ServiceException {
2228           return stub.listTableNamesByNamespace(controller, request);
2229         }
2230 
2231         @Override
2232         public void close() {
2233           release(this.mss);
2234         }
2235 
2236         @Override
2237         public GetSchemaAlterStatusResponse getSchemaAlterStatus(
2238             RpcController controller, GetSchemaAlterStatusRequest request)
2239             throws ServiceException {
2240           return stub.getSchemaAlterStatus(controller, request);
2241         }
2242 
2243         @Override
2244         public GetTableDescriptorsResponse getTableDescriptors(
2245             RpcController controller, GetTableDescriptorsRequest request)
2246             throws ServiceException {
2247           return stub.getTableDescriptors(controller, request);
2248         }
2249 
2250         @Override
2251         public GetTableNamesResponse getTableNames(
2252             RpcController controller, GetTableNamesRequest request)
2253             throws ServiceException {
2254           return stub.getTableNames(controller, request);
2255         }
2256 
2257         @Override
2258         public GetClusterStatusResponse getClusterStatus(
2259             RpcController controller, GetClusterStatusRequest request)
2260             throws ServiceException {
2261           return stub.getClusterStatus(controller, request);
2262         }
2263 
2264         @Override
2265         public TruncateTableResponse truncateTable(RpcController controller,
2266             TruncateTableRequest request) throws ServiceException {
2267           return stub.truncateTable(controller, request);
2268         }
2269 
2270         @Override
2271         public SecurityCapabilitiesResponse getSecurityCapabilities(RpcController controller,
2272             SecurityCapabilitiesRequest request) throws ServiceException {
2273           return stub.getSecurityCapabilities(controller, request);
2274         }
2275       };
2276     }
2277 
2278 
2279     private static void release(MasterServiceState mss) {
2280       if (mss != null && mss.connection != null) {
2281         ((HConnectionImplementation)mss.connection).releaseMaster(mss);
2282       }
2283     }
2284 
2285     private boolean isKeepAliveMasterConnectedAndRunning(MasterServiceState mss) {
2286       if (mss.getStub() == null){
2287         return false;
2288       }
2289       try {
2290         return mss.isMasterRunning();
2291       } catch (UndeclaredThrowableException e) {
2292         // It's somewhat messy, but we can receive undeclared exceptions such as
2293         //  java.net.ConnectException. So we catch them here...
2294         LOG.info("Master connection is not running anymore", e.getUndeclaredThrowable());
2295         return false;
2296       } catch (ServiceException se) {
2297         LOG.warn("Checking master connection", se);
2298         return false;
2299       }
2300     }
2301 
2302     void releaseMaster(MasterServiceState mss) {
2303       if (mss.getStub() == null) return;
2304       synchronized (masterAndZKLock) {
2305         --mss.userCount;
2306         if (mss.userCount <= 0) {
2307           mss.keepAliveUntil = System.currentTimeMillis() + keepAlive;
2308         }
2309       }
2310     }
2311 
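         // The keep-alive master service follows the same acquire/release
         // discipline; a sketch of a caller (for illustration only):
         //
         //   MasterKeepAliveConnection master = getKeepAliveMasterService();
         //   try {
         //     master.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
         //   } finally {
         //     master.close(); // release(mss): userCount--, arms keepAliveUntil
         //   }
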
2312     private void closeMasterService(MasterServiceState mss) {
2313       if (mss.getStub() != null) {
2314         LOG.info("Closing master protocol: " + mss);
2315         mss.clearStub();
2316       }
2317       mss.userCount = 0;
2318     }
2319 
2320     /**
2321      * Immediate close of the shared master. Can be called by the delayed close task or when
2322      * closing the connection itself.
2323      */
2324     private void closeMaster() {
2325       synchronized (masterAndZKLock) {
2326         closeMasterService(masterServiceState);
2327       }
2328     }
2329 
2330     void updateCachedLocation(HRegionInfo hri, HRegionLocation source,
2331                               ServerName serverName, long seqNum) {
2332       HRegionLocation newHrl = new HRegionLocation(hri, serverName, seqNum);
2333       cacheLocation(hri.getTable(), source, newHrl);
2334     }
2335 
2336    /**
2337     * Deletes the cached location of the region if necessary, based on some error from source.
2338     * @param hri The region in question.
2339     * @param source The source of the error that prompts us to invalidate cache.
2340     */
2341    void deleteCachedLocation(HRegionInfo hri, HRegionLocation source) {
2342      ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(hri.getTable());
2343      tableLocations.remove(hri.getStartKey(), source);
2344    }
2345 
2346     @Override
2347     public void deleteCachedRegionLocation(final HRegionLocation location) {
2348       if (location == null) {
2349         return;
2350       }
2351 
2352       HRegionLocation removedLocation;
2353       TableName tableName = location.getRegionInfo().getTable();
2354       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
2355       removedLocation = tableLocations.remove(location.getRegionInfo().getStartKey());
2356       if (LOG.isDebugEnabled() && removedLocation != null) {
2357         LOG.debug("Removed " +
2358             location.getRegionInfo().getRegionNameAsString() +
2359             " for tableName=" + tableName +
2360             " from cache");
2361       }
2362     }
2363 
2364     /**
2365      * Update the location with the new value (if the exception is a RegionMovedException)
2366      * or delete it from the cache. Does nothing if we can be sure from the exception that
2367      * the location is still accurate, or if the cache has already been updated.
2368      * @param exception an object (to simplify user code) in which we will look for a nested
2369      *                  or wrapped RegionMovedException
2370      * @param source server that is the source of the location update.
2371      */
2372     @Override
2373     public void updateCachedLocations(final TableName tableName, byte[] rowkey,
2374       final Object exception, final HRegionLocation source) {
2375       if (rowkey == null || tableName == null) {
2376         LOG.warn("Coding error, see method javadoc. row=" + (rowkey == null ? "null" : Bytes.toStringBinary(rowkey)) +
2377             ", tableName=" + (tableName == null ? "null" : tableName));
2378         return;
2379       }
2380 
2381       if (source == null || source.getServerName() == null){
2382         // This should not happen, but let's secure ourselves.
2383         return;
2384       }
2385 
2386       // Is it something we have already updated?
2387       final HRegionLocation oldLocation = getCachedLocation(tableName, rowkey);
2388       if (oldLocation == null || !source.getServerName().equals(oldLocation.getServerName())) {
2389         // There is no such location in the cache (it's been removed already) or
2390         // the cache has already been refreshed with a different location.  => nothing to do
2391         return;
2392       }
2393 
2394       HRegionInfo regionInfo = oldLocation.getRegionInfo();
2395       Throwable cause = findException(exception);
2396       if (cause != null) {
2397         if (cause instanceof RegionTooBusyException || cause instanceof RegionOpeningException) {
2398           // We know that the region is still on this region server
2399           return;
2400         }
2401 
2402         if (cause instanceof RegionMovedException) {
2403           RegionMovedException rme = (RegionMovedException) cause;
2404           if (LOG.isTraceEnabled()) {
2405             LOG.trace("Region " + regionInfo.getRegionNameAsString() + " moved to " +
2406                 rme.getHostname() + ":" + rme.getPort() +
2407                 " according to " + source.getHostnamePort());
2408           }
2409           // We know that the region is not anymore on this region server, but we know
2410           //  the new location.
2411           updateCachedLocation(
2412               regionInfo, source, rme.getServerName(), rme.getLocationSeqNum());
2413           return;
2414         }
2415       }
2416 
2417       // If we're here, it means that we cannot be sure about the location, so we remove it from
2418       //  the cache.
2419       deleteCachedLocation(regionInfo, source);
2420     }
2421 
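         // Decision summary for updateCachedLocations above:
         //   - entry gone or already refreshed ............. do nothing
         //   - RegionTooBusy / RegionOpening exception ...... keep the entry (region did not move)
         //   - RegionMovedException ......................... update to the new server and seqNum
         //   - any other error .............................. evict the entry
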
2422     @Override
2423     public void updateCachedLocations(final byte[] tableName, byte[] rowkey,
2424       final Object exception, final HRegionLocation source) {
2425       updateCachedLocations(TableName.valueOf(tableName), rowkey, exception, source);
2426     }
2427 
2428     @Override
2429     @Deprecated
2430     public void processBatch(List<? extends Row> list,
2431         final TableName tableName,
2432         ExecutorService pool,
2433         Object[] results) throws IOException, InterruptedException {
2434       // This belongs in HTable!!! Not in here.  St.Ack
2435 
2436       // results must be the same size as list
2437       if (results.length != list.size()) {
2438         throw new IllegalArgumentException(
2439           "argument results must be the same size as argument list");
2440       }
2441       processBatchCallback(list, tableName, pool, results, null);
2442     }
2443 
2444     @Override
2445     @Deprecated
2446     public void processBatch(List<? extends Row> list,
2447         final byte[] tableName,
2448         ExecutorService pool,
2449         Object[] results) throws IOException, InterruptedException {
2450       processBatch(list, TableName.valueOf(tableName), pool, results);
2451     }
2452 
2453     /**
2454      * Send the queries in parallel on the different region servers. Retries on failures.
2455      * If the method returns without throwing, there was no error, and the 'results' array will
2456      * contain no exceptions. On error, an exception is thrown, and the 'results' array will
2457      * contain results and exceptions.
2458      * @deprecated since 0.96 - Use {@link HTable#processBatchCallback} instead
2459      */
2460     @Override
2461     @Deprecated
2462     public <R> void processBatchCallback(
2463       List<? extends Row> list,
2464       TableName tableName,
2465       ExecutorService pool,
2466       Object[] results,
2467       Batch.Callback<R> callback)
2468       throws IOException, InterruptedException {
2469 
2470       // To fulfill the original contract, we have a special callback. This callback
2471       //  will set the results in the Object array.
2472       ObjectResultFiller<R> cb = new ObjectResultFiller<R>(results, callback);
2473       AsyncProcess<?> asyncProcess = createAsyncProcess(tableName, pool, cb, conf);
2474 
2475       // We're doing a submit all. This way, the originalIndex will match the initial list.
2476       asyncProcess.submitAll(list);
2477       asyncProcess.waitUntilDone();
2478 
2479       if (asyncProcess.hasError()) {
2480         throw asyncProcess.getErrors();
2481       }
2482     }
2483 
2484     @Override
2485     @Deprecated
2486     public <R> void processBatchCallback(
2487       List<? extends Row> list,
2488       byte[] tableName,
2489       ExecutorService pool,
2490       Object[] results,
2491       Batch.Callback<R> callback)
2492       throws IOException, InterruptedException {
2493       processBatchCallback(list, TableName.valueOf(tableName), pool, results, callback);
2494     }
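         // A usage sketch, added for illustration only (not in the original source).
         // It assumes an existing HConnection 'connection' and an ExecutorService 'pool';
         // the batch path above fills results[i] with either the action's result or the
         // Throwable that finally failed action i.
         //
         //   List<Row> actions = new ArrayList<Row>();
         //   actions.add(new Get(Bytes.toBytes("row-1")));
         //   Object[] results = new Object[actions.size()];
         //   try {
         //     connection.processBatchCallback(
         //         actions, TableName.valueOf("t"), pool, results, null);
         //   } catch (RetriesExhaustedWithDetailsException e) {
         //     // inspect results[i] to see which actions failed and why
         //   }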
2495 
2496     // For tests.
2497     protected <R> AsyncProcess createAsyncProcess(TableName tableName, ExecutorService pool,
2498            AsyncProcess.AsyncProcessCallback<R> callback, Configuration conf) {
2499       RpcControllerFactory controllerFactory = RpcControllerFactory.instantiate(conf);
2500       RpcRetryingCallerFactory callerFactory = RpcRetryingCallerFactory.instantiate(conf, this.stats);
2501       return new AsyncProcess<R>(this, tableName, pool, callback, conf, callerFactory,
2502         controllerFactory);
2503     }
2504 
2505     /**
2506      * Fill the result array for the interfaces using it.
2507      */
2508     private static class ObjectResultFiller<Res>
2509         implements AsyncProcess.AsyncProcessCallback<Res> {
2510 
2511       private final Object[] results;
2512       private Batch.Callback<Res> callback;
2513 
2514       ObjectResultFiller(Object[] results, Batch.Callback<Res> callback) {
2515         this.results = results;
2516         this.callback = callback;
2517       }
2518 
2519       @Override
2520       public void success(int pos, byte[] region, Row row, Res result) {
2521         assert pos < results.length;
2522         results[pos] = result;
2523         if (callback != null) {
2524           callback.update(region, row.getRow(), result);
2525         }
2526       }
2527 
2528       @Override
2529       public boolean failure(int pos, byte[] region, Row row, Throwable t) {
2530         assert pos < results.length;
2531         results[pos] = t;
2532         // Batch.Callback<Res> was not called on failure in 0.94. We keep that behavior.
2533         return true; // we want to have this failure in the failures list.
2534       }
2535 
2536       @Override
2537       public boolean retriableFailure(int originalIndex, Row row, byte[] region,
2538                                       Throwable exception) {
2539         return true; // we retry
2540       }
2541     }
2542 
2543     @Override
2544     public ServerStatisticTracker getStatisticsTracker() {
2545       return this.stats;
2546     }
2547 
2548     @Override
2549     public ClientBackoffPolicy getBackoffPolicy() {
2550       return this.backoffPolicy;
2551     }
2552 
2553     /*
2554      * Return the number of cached region locations for a table. It will only be
2555      * called from a unit test.
2556      */
2557     int getNumberOfCachedRegionLocations(final TableName tableName) {
2558       Map<byte[], HRegionLocation> tableLocs = this.cachedRegionLocations.get(tableName);
2559       if (tableLocs == null) {
2560         return 0;
2561       }
2562       return tableLocs.values().size();
2563     }
2564 
2565     /**
2566      * Check the region cache to see whether a region is cached yet or not.
2567      * Called by unit tests.
2568      * @param tableName tableName
2569      * @param row row
2570      * @return Region cached or not.
2571      */
2572     boolean isRegionCached(TableName tableName, final byte[] row) {
2573       HRegionLocation location = getCachedLocation(tableName, row);
2574       return location != null;
2575     }
2576 
2577     @Override
2578     public void setRegionCachePrefetch(final TableName tableName,
2579         final boolean enable) {
2580       if (!enable) {
2581         regionCachePrefetchDisabledTables.add(Bytes.mapKey(tableName.getName()));
2582       } else {
2584         regionCachePrefetchDisabledTables.remove(Bytes.mapKey(tableName.getName()));
2585       }
2586     }
2587 
2588     @Override
2589     public void setRegionCachePrefetch(final byte[] tableName,
2590         final boolean enable) {
2591       setRegionCachePrefetch(TableName.valueOf(tableName), enable);
2592     }
2593 
2594     @Override
2595     public boolean getRegionCachePrefetch(TableName tableName) {
2596       return usePrefetch &&
2597           !regionCachePrefetchDisabledTables.contains(Bytes.mapKey(tableName.getName()));
2598     }
2599 
2600     @Override
2601     public boolean getRegionCachePrefetch(byte[] tableName) {
2602       return getRegionCachePrefetch(TableName.valueOf(tableName));
2603     }
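         // Illustrative toggling, not part of the original source; it assumes an
         // HConnection 'connection'. Note that getRegionCachePrefetch also requires
         // the connection-wide usePrefetch flag, so the last check below assumes
         // prefetching was not disabled globally in the Configuration.
         //
         //   connection.setRegionCachePrefetch(TableName.valueOf("t"), false);
         //   assert !connection.getRegionCachePrefetch(TableName.valueOf("t"));
         //   connection.setRegionCachePrefetch(TableName.valueOf("t"), true);
         //   assert connection.getRegionCachePrefetch(TableName.valueOf("t"));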
2604 
2605     @Override
2606     public void abort(final String msg, Throwable t) {
2607       if (t instanceof KeeperException.SessionExpiredException
2608         && keepAliveZookeeper != null) {
2609         synchronized (masterAndZKLock) {
2610           if (keepAliveZookeeper != null) {
2611             LOG.warn("This client just lost its session with ZooKeeper," +
2612               " closing it." +
2613               " It will be recreated next time someone needs it.", t);
2614             closeZooKeeperWatcher();
2615           }
2616         }
2617       } else {
2618         if (t != null) {
2619           LOG.fatal(msg, t);
2620         } else {
2621           LOG.fatal(msg);
2622         }
2623         this.aborted = true;
2624         close();
2625         this.closed = true;
2626       }
2627     }
2628 
2629     @Override
2630     public boolean isClosed() {
2631       return this.closed;
2632     }
2633 
2634     @Override
2635     public boolean isAborted() {
2636       return this.aborted;
2637     }
2638 
2639     @Override
2640     public int getCurrentNrHRS() throws IOException {
2641       return this.registry.getCurrentNrHRS();
2642     }
2643 
2644     /**
2645      * Increment this client's reference count.
2646      */
2647     void incCount() {
2648       ++refCount;
2649     }
2650 
2651     /**
2652      * Decrement this client's reference count.
2653      */
2654     void decCount() {
2655       if (refCount > 0) {
2656         --refCount;
2657       }
2658     }
2659 
2660     /**
2661      * Return whether this client has no references left.
2662      *
2663      * @return true if this client has no references; false otherwise
2664      */
2665     boolean isZeroReference() {
2666       return refCount == 0;
2667     }
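         // A hedged sketch of the intended life cycle (the authoritative logic lives
         // in HConnectionManager's getConnection/deleteConnection): every consumer of
         // a shared, managed connection bumps refCount, and the connection is really
         // closed only when the count drops back to zero.
         //
         //   HConnection c1 = HConnectionManager.getConnection(conf); // incCount() -> 1
         //   HConnection c2 = HConnectionManager.getConnection(conf); // same instance -> 2
         //   c1.close();  // decCount() -> 1, stays open
         //   c2.close();  // decCount() -> 0, isZeroReference(), internalClose() runs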
2668 
2669     void internalClose() {
2670       if (this.closed) {
2671         return;
2672       }
2673       delayedClosing.stop("Closing connection");
2674       closeMaster();
2675       shutdownBatchPool();
2676       if (this.metrics != null) {
2677         this.metrics.shutdown();
2678       }
2679       this.closed = true;
2680       closeZooKeeperWatcher();
2681       this.stubs.clear();
2682       if (clusterStatusListener != null) {
2683         clusterStatusListener.close();
2684       }
2685       if (rpcClient != null) {
2686         rpcClient.stop();
2687       }
2688     }
2689 
2690     @Override
2691     public void close() {
2692       if (managed) {
2693         if (aborted) {
2694           HConnectionManager.deleteStaleConnection(this);
2695         } else {
2696           HConnectionManager.deleteConnection(this, false);
2697         }
2698       } else {
2699         internalClose();
2700       }
2701     }
2702 
2703     /**
2704      * Close the connection for good, regardless of what the current value of
2705      * {@link #refCount} is. Ideally, {@link #refCount} should be zero at this
2706      * point, which would be the case if all of its consumers close the
2707      * connection. However, on the off chance that someone is unable to close
2708      * the connection, perhaps because it bailed out prematurely, the method
2709      * below will ensure that this {@link HConnection} instance is cleaned up.
2710      * Caveat: The JVM may take an unknown amount of time to call finalize on an
2711      * unreachable object, so our hope is that every consumer cleans up after
2712      * itself, like any good citizen.
2713      */
2714     @Override
2715     protected void finalize() throws Throwable {
2716       super.finalize();
2717       // Pretend as if we are about to release the last remaining reference
2718       refCount = 1;
2719       close();
2720     }
2721 
2722     @Override
2723     public HTableDescriptor[] listTables() throws IOException {
2724       MasterKeepAliveConnection master = getKeepAliveMasterService();
2725       try {
2726         GetTableDescriptorsRequest req =
2727           RequestConverter.buildGetTableDescriptorsRequest((List<TableName>)null);
2728         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2729       } catch (ServiceException se) {
2730         throw ProtobufUtil.getRemoteException(se);
2731       } finally {
2732         master.close();
2733       }
2734     }
2735 
2736     @Override
2737     public String[] getTableNames() throws IOException {
2738       TableName[] tableNames = listTableNames();
2739       String[] result = new String[tableNames.length];
2740       for (int i = 0; i < tableNames.length; i++) {
2741         result[i] = tableNames[i].getNameAsString();
2742       }
2743       return result;
2744     }
2745 
2746     @Override
2747     public TableName[] listTableNames() throws IOException {
2748       MasterKeepAliveConnection master = getKeepAliveMasterService();
2749       try {
2750         return ProtobufUtil.getTableNameArray(master.getTableNames(null,
2751             GetTableNamesRequest.newBuilder().build())
2752           .getTableNamesList());
2753       } catch (ServiceException se) {
2754         throw ProtobufUtil.getRemoteException(se);
2755       } finally {
2756         master.close();
2757       }
2758     }
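         // Usage sketch (illustrative, assuming an HConnection 'connection'): the
         // listing flavors above differ only in their return types; all of them go to
         // the master through a keep-alive stub.
         //
         //   HTableDescriptor[] descriptors = connection.listTables();
         //   TableName[] names = connection.listTableNames();
         //   String[] namesAsStrings = connection.getTableNames();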
2759 
2760     @Override
2761     public HTableDescriptor[] getHTableDescriptorsByTableName(
2762         List<TableName> tableNames) throws IOException {
2763       if (tableNames == null || tableNames.isEmpty()) return new HTableDescriptor[0];
2764       MasterKeepAliveConnection master = getKeepAliveMasterService();
2765       try {
2766         GetTableDescriptorsRequest req =
2767           RequestConverter.buildGetTableDescriptorsRequest(tableNames);
2768         return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
2769       } catch (ServiceException se) {
2770         throw ProtobufUtil.getRemoteException(se);
2771       } finally {
2772         master.close();
2773       }
2774     }
2775 
2776     @Override
2777     public HTableDescriptor[] getHTableDescriptors(
2778         List<String> names) throws IOException {
2779       List<TableName> tableNames = new ArrayList<TableName>(names.size());
2780       for(String name : names) {
2781         tableNames.add(TableName.valueOf(name));
2782       }
2783 
2784       return getHTableDescriptorsByTableName(tableNames);
2785     }
2786 
2787     @Override
2788     public NonceGenerator getNonceGenerator() {
2789       return this.nonceGenerator;
2790     }
2791 
2792     /**
2793      * Connects to the master to get the table descriptor.
2794      * @param tableName table name
2795      * @return the {@link HTableDescriptor} for the given table
2796      * @throws IOException if the connection to master fails or if the table
2797      *  is not found.
2798      */
2799     @Override
2800     public HTableDescriptor getHTableDescriptor(final TableName tableName)
2801     throws IOException {
2802       if (tableName == null) return null;
2803       MasterKeepAliveConnection master = getKeepAliveMasterService();
2804       GetTableDescriptorsResponse htds;
2805       try {
2806         GetTableDescriptorsRequest req =
2807           RequestConverter.buildGetTableDescriptorsRequest(tableName);
2808         htds = master.getTableDescriptors(null, req);
2809       } catch (ServiceException se) {
2810         throw ProtobufUtil.getRemoteException(se);
2811       } finally {
2812         master.close();
2813       }
2814       if (!htds.getTableSchemaList().isEmpty()) {
2815         return HTableDescriptor.convert(htds.getTableSchemaList().get(0));
2816       }
2817       throw new TableNotFoundException(tableName.getNameAsString());
2818     }
2819 
2820     @Override
2821     public HTableDescriptor getHTableDescriptor(final byte[] tableName)
2822     throws IOException {
2823       return getHTableDescriptor(TableName.valueOf(tableName));
2824     }
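         // Illustrative call (not in the original source, assuming an HConnection
         // 'connection'): a missing table surfaces as a TableNotFoundException rather
         // than a null descriptor.
         //
         //   HTableDescriptor htd = connection.getHTableDescriptor(TableName.valueOf("t"));
         //   for (HColumnDescriptor family : htd.getColumnFamilies()) {
         //     System.out.println(family.getNameAsString());
         //   }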
2825   }
2826 
2827   /**
2828    * The record of errors for servers.
2829    */
2830   static class ServerErrorTracker {
2831     // We need a concurrent map here, as we could have multiple threads updating it in parallel.
2832     private final ConcurrentMap<HRegionLocation, ServerErrors> errorsByServer =
2833         new ConcurrentHashMap<HRegionLocation, ServerErrors>();
2834     private final long canRetryUntil;
2835     private final int maxRetries;
2836     private final String startTrackingTime;
2837 
2838     public ServerErrorTracker(long timeout, int maxRetries) {
2839       this.maxRetries = maxRetries;
2840       this.canRetryUntil = EnvironmentEdgeManager.currentTimeMillis() + timeout;
2841       this.startTrackingTime = new Date().toString();
2842     }
2843 
2844     /**
2845      * We stop retrying when we have exhausted BOTH the number of retries and the time allocated.
2846      */
2847     boolean canRetryMore(int numRetry) {
2848       // If there is a single try we must not take into account the time.
2849       // If we are only allowed a single try, we must not take the time into account.
2850           EnvironmentEdgeManager.currentTimeMillis() < this.canRetryUntil);
2851     }
2852 
2853     /**
2854      * Calculates the back-off time for a retrying request to a particular server.
2855      *
2856      * @param server    The server in question.
2857      * @param basePause The default client pause ({@code hbase.client.pause}).
2858      * @return The time to wait before sending next request.
2859      */
2860     long calculateBackoffTime(HRegionLocation server, long basePause) {
2861       long result;
2862       ServerErrors errorStats = errorsByServer.get(server);
2863       if (errorStats != null) {
2864         result = ConnectionUtils.getPauseTime(basePause, errorStats.retries.get());
2865       } else {
2866         result = 0; // yes, if the server is not in our list we don't wait before retrying.
2867       }
2868       return result;
2869     }
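         // Worked example (hedged; the exact factors live in ConnectionUtils.getPauseTime
         // and HConstants.RETRY_BACKOFF): with basePause = 100 ms, a server with no
         // recorded errors waits 0 ms, while one with three recorded errors waits about
         // 100 ms * RETRY_BACKOFF[3] (500 ms with the default table), plus a little jitter.
         //
         //   long wait = tracker.calculateBackoffTime(serverLocation, 100);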
2870 
2871     /**
2872      * Reports that there was an error on the server so we can do whatever bean-counting is necessary.
2873      *
2874      * @param server The server in question.
2875      */
2876     void reportServerError(HRegionLocation server) {
2877       ServerErrors errors = errorsByServer.get(server);
2878       if (errors != null) {
2879         errors.addError();
2880       } else {
2881         errors = errorsByServer.putIfAbsent(server, new ServerErrors());
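             // putIfAbsent returns the previous value: non-null means another thread
             //  won the race and installed its ServerErrors first, so count the error
             //  against that instance; null means our fresh (zero-count) entry is in.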
2882         if (errors != null){
2883           errors.addError();
2884         }
2885       }
2886     }
2887 
2888     String getStartTrackingTime() {
2889       return startTrackingTime;
2890     }
2891 
2892     /**
2893      * The record of errors for a server.
2894      */
2895     private static class ServerErrors {
2896       public final AtomicInteger retries = new AtomicInteger(0);
2897 
2898       public void addError() {
2899         retries.incrementAndGet();
2900       }
2901     }
2902   }
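       // A hedged usage sketch (illustrative, not from the original source) of the
       // contract callers follow; 'location' is an HRegionLocation and 'call' stands
       // in for whatever retryable operation is being made.
       //
       //   ServerErrorTracker tracker = new ServerErrorTracker(60000, 10);
       //   for (int tries = 0; tracker.canRetryMore(tries); tries++) {
       //     try {
       //       return call.execute();
       //     } catch (IOException e) {
       //       tracker.reportServerError(location);
       //       Thread.sleep(tracker.calculateBackoffTime(location, 100));
       //     }
       //   }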
2903 
2904   /**
2905    * Look for an exception we know in the remote exception:
2906    * - hadoop.ipc wrapped exceptions
2907    * - nested exceptions
2908    *
2909    * Looks for: RegionMovedException / RegionOpeningException / RegionTooBusyException
2910    * @return null if we didn't find the exception, the exception otherwise.
2911    */
2912   public static Throwable findException(Object exception) {
2913     if (!(exception instanceof Throwable)) { // instanceof is false for null
2914       return null;
2915     }
2916     Throwable cur = (Throwable) exception;
2917     while (cur != null) {
2918       if (cur instanceof RegionMovedException || cur instanceof RegionOpeningException
2919           || cur instanceof RegionTooBusyException) {
2920         return cur;
2921       }
2922       if (cur instanceof RemoteException) {
2923         RemoteException re = (RemoteException) cur;
2924         cur = re.unwrapRemoteException(
2925             RegionOpeningException.class, RegionMovedException.class,
2926             RegionTooBusyException.class);
2927         if (cur == null) {
2928           cur = re.unwrapRemoteException();
2929         }
2930         // unwrapRemoteException can return the exception given as a parameter when it cannot
2931         //  unwrap it. In this case, there is no need to look further
2932         // noinspection ObjectEquality
2933         if (cur == re) {
2934           return null;
2935         }
2936       } else {
2937         cur = cur.getCause();
2938       }
2939     }
2940 
2941     return null;
2942   }
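       // Illustrative only: a RegionMovedException that hadoop.ipc wrapped in a
       // RemoteException is dug back out, while unrelated exceptions yield null.
       //
       //   RemoteException wrapped = new RemoteException(
       //       RegionMovedException.class.getName(), "moved");
       //   Throwable found = findException(wrapped);  // a RegionMovedException
       //   Throwable none = findException(new IOException("unrelated"));  // null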
2943 
2944   /**
2945    * Set the number of retries to use server-side when trying to communicate with
2946    * another server over {@link HConnection} (used when updating catalog tables, etc.). Call this method before we create any Connections.
2947    * @param c The Configuration instance to set the retries into.
2948    * @param sn Server name, used as the prefix of the log message.
2949    * @param log Used to log what we set in here.
2950    */
2951   public static void setServerSideHConnectionRetries(final Configuration c, final String sn,
2952       final Log log) {
2953     int hcRetries = c.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
2954       HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
2955     // Go big.  Multiply by 10.  If we can't get to meta after this many retries
2956     // then something is seriously wrong.
2957     int serversideMultiplier = c.getInt("hbase.client.serverside.retries.multiplier", 10);
2958     int retries = hcRetries * serversideMultiplier;
2959     c.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
2960     log.debug(sn + " HConnection server-to-server retries=" + retries);
2961   }
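     // Worked example (illustrative; 'serverName' and 'LOG' are assumed to exist, and
     // the concrete numbers depend on the configured retry count): if
     // hbase.client.retries.number is 31 and the multiplier is left at its default of
     // 10, the Configuration ends up with 310 retries.
     //
     //   Configuration c = HBaseConfiguration.create();
     //   setServerSideHConnectionRetries(c, serverName.toString(), LOG);
     //   // c now carries 10x the original hbase.client.retries.number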
2962 }