1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.client;
20  
21  import java.io.Closeable;
22  import java.io.IOException;
23  import java.io.InterruptedIOException;
24  import java.lang.reflect.Constructor;
25  import java.lang.reflect.UndeclaredThrowableException;
26  import java.net.SocketException;
27  import java.util.ArrayList;
28  import java.util.Date;
29  import java.util.HashSet;
30  import java.util.LinkedHashMap;
31  import java.util.List;
32  import java.util.Map;
33  import java.util.Map.Entry;
34  import java.util.NavigableMap;
35  import java.util.Set;
36  import java.util.concurrent.ConcurrentHashMap;
37  import java.util.concurrent.ConcurrentMap;
38  import java.util.concurrent.ConcurrentSkipListMap;
39  import java.util.concurrent.ConcurrentSkipListSet;
40  import java.util.concurrent.CopyOnWriteArraySet;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.LinkedBlockingQueue;
43  import java.util.concurrent.ThreadPoolExecutor;
44  import java.util.concurrent.TimeUnit;
45  import java.util.concurrent.atomic.AtomicBoolean;
46  import java.util.concurrent.atomic.AtomicInteger;
47  
48  import org.apache.commons.logging.Log;
49  import org.apache.commons.logging.LogFactory;
50  import org.apache.hadoop.hbase.classification.InterfaceAudience;
51  import org.apache.hadoop.hbase.classification.InterfaceStability;
52  import org.apache.hadoop.conf.Configuration;
53  import org.apache.hadoop.hbase.Chore;
54  import org.apache.hadoop.hbase.HBaseConfiguration;
55  import org.apache.hadoop.hbase.HConstants;
56  import org.apache.hadoop.hbase.HRegionInfo;
57  import org.apache.hadoop.hbase.HRegionLocation;
58  import org.apache.hadoop.hbase.HTableDescriptor;
59  import org.apache.hadoop.hbase.MasterNotRunningException;
60  import org.apache.hadoop.hbase.RegionTooBusyException;
61  import org.apache.hadoop.hbase.ServerName;
62  import org.apache.hadoop.hbase.Stoppable;
63  import org.apache.hadoop.hbase.TableName;
64  import org.apache.hadoop.hbase.TableNotEnabledException;
65  import org.apache.hadoop.hbase.TableNotFoundException;
66  import org.apache.hadoop.hbase.ZooKeeperConnectionException;
67  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
68  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
69  import org.apache.hadoop.hbase.client.coprocessor.Batch;
70  import org.apache.hadoop.hbase.exceptions.RegionMovedException;
71  import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
72  import org.apache.hadoop.hbase.ipc.RpcClient;
73  import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
74  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
75  import org.apache.hadoop.hbase.protobuf.RequestConverter;
76  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
77  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService;
78  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
79  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
80  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnRequest;
81  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
82  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionRequest;
83  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
84  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceRequest;
85  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
86  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceRequest;
87  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceResponse;
88  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableRequest;
89  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
90  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnRequest;
91  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
92  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceRequest;
93  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceResponse;
94  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotRequest;
95  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
96  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableRequest;
97  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
98  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableRequest;
99  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
100 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsRequest;
101 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
102 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorRequest;
103 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
104 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableRequest;
105 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
106 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest;
107 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse;
108 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusRequest;
109 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusResponse;
110 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsRequest;
111 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
112 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorRequest;
113 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorResponse;
114 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusRequest;
115 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusResponse;
116 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
117 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
118 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
119 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesResponse;
120 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledRequest;
121 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
122 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
123 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
124 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest;
125 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse;
126 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest;
127 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
128 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneRequest;
129 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
130 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsRequest;
131 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
132 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
133 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
134 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
135 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
136 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
137 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnRequest;
138 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
139 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceRequest;
140 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
141 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableRequest;
142 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
143 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionRequest;
144 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
145 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionRequest;
146 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
147 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotRequest;
148 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
149 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanRequest;
150 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
151 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
152 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
153 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotRequest;
154 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
155 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownRequest;
156 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
157 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterRequest;
158 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
159 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableRequest;
160 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableResponse;
161 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionRequest;
162 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
163 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
164 import org.apache.hadoop.hbase.security.User;
165 import org.apache.hadoop.hbase.security.UserProvider;
166 import org.apache.hadoop.hbase.util.Bytes;
167 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
168 import org.apache.hadoop.hbase.util.ExceptionUtil;
169 import org.apache.hadoop.hbase.util.Threads;
170 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
171 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
172 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
173 import org.apache.hadoop.ipc.RemoteException;
174 import org.apache.zookeeper.KeeperException;
175 
176 import com.google.common.annotations.VisibleForTesting;
177 import com.google.protobuf.BlockingRpcChannel;
178 import com.google.protobuf.RpcController;
179 import com.google.protobuf.ServiceException;
180 
181 /**
182  * A non-instantiable class that manages creation of {@link HConnection}s.
183  * <p>The simplest way to use this class is by using {@link #createConnection(Configuration)}.
184  * This creates a new {@link HConnection} to the cluster; the connection is managed by the caller.
185  * From this {@link HConnection} {@link HTableInterface} implementations are retrieved
186  * with {@link HConnection#getTable(byte[])}. Example:
187  * <pre>
188  * {@code
189  * HConnection connection = HConnectionManager.createConnection(config);
190  * HTableInterface table = connection.getTable("table1");
191  * try {
192  *   // Use the table as needed, for a single operation and a single thread
193  * } finally {
194  *   table.close();
195  *   connection.close();
196  * }
197  * }</pre>
198  * <p>This class has a static Map of {@link HConnection} instances keyed by
199  * {@link HConnectionKey}; A {@link HConnectionKey} is identified by a set of
200  * {@link Configuration} properties. Invocations of {@link #getConnection(Configuration)}
201  * that pass the same {@link Configuration} instance will return the same
202  * {@link HConnection} instance ONLY WHEN the set of properties is the same
203  * (i.e. if you change properties in your {@link Configuration} instance, such as the RPC
204  * timeout or the codec used, HBase will create a new {@link HConnection} instance; for more
205  * details on how this is done see {@link HConnectionKey}).
206  * <p>Sharing {@link HConnection} instances is usually what you want; all clients
207  * of the {@link HConnection} instances share the HConnections' cache of Region
208  * locations rather than each having to discover for itself the location of meta, etc.
209  * But sharing connections makes clean up of {@link HConnection} instances a little awkward.
210  * Currently, clients clean up by calling {@link #deleteConnection(Configuration)}. This will
211  * shut down the zookeeper connection the HConnection was using and clean up all
212  * HConnection resources as well as stopping proxies to servers out on the
213  * cluster. Not running the cleanup will not end the world; it'll
214  * just delay shutdown a little and spew some "zookeeper connection failed"
215  * messages into the log.  Running the cleanup on a {@link HConnection} that is
216  * subsequently used by another client will cause breakage, so be careful running
217  * cleanup.
218  * <p>To create a {@link HConnection} that is not shared by others, you can
219  * set property "hbase.client.instance.id" to a unique value for your {@link Configuration}
220  * instance, like the following:
221  * <pre>
222  * {@code
223  * conf.set("hbase.client.instance.id", "12345");
224  * HConnection connection = HConnectionManager.getConnection(conf);
225  * // Use the connection to your heart's delight and then, when done,
226  * // pass the same conf (same instance id) to identify the connection:
227  * HConnectionManager.deleteConnection(conf);
228  * }
229  * </pre>
230  * <p>Cleanup used to be done in a shutdown hook.  On startup we'd
231  * register a shutdown hook that called {@link #deleteAllConnections()}
232  * on its way out, but the order in which shutdown hooks run is not defined,
233  * which was problematic for clients of HConnection that wanted to register
234  * their own shutdown hooks, so we removed ours. This shifts the onus for
235  * cleanup to the client.
236  */
237 @SuppressWarnings("serial")
238 @InterfaceAudience.Public
239 @InterfaceStability.Evolving
240 public class HConnectionManager {
241   static final Log LOG = LogFactory.getLog(HConnectionManager.class);
242 
243   public static final String RETRIES_BY_SERVER_KEY = "hbase.client.retries.by.server";
244   private static final String CLIENT_NONCES_ENABLED_KEY = "hbase.client.nonces.enabled";
245 
246   // An LRU Map of HConnectionKey -> HConnection (TableServer).  All
247   // access must be synchronized.  This map is not private because tests
248   // need to be able to tinker with it.
249   static final Map<HConnectionKey, HConnectionImplementation> CONNECTION_INSTANCES;
250 
251   public static final int MAX_CACHED_CONNECTION_INSTANCES;
252 
253   /**
254    * Global nonceGenerator shared per client. Currently there's no reason to limit its scope.
255    * Once it's set under nonceGeneratorCreateLock, it is never unset or changed.
256    */
257   private static volatile NonceGenerator nonceGenerator = null;
258   /** The nonce generator lock. Only taken when creating HConnection, which gets a private copy. */
259   private static Object nonceGeneratorCreateLock = new Object();
260 
261   static {
262     // We set instances to one more than the value specified for {@link
263     // HConstants#ZOOKEEPER_MAX_CLIENT_CNXNS}. By default, the zk default max
264     // connections to the ensemble from one client is 30, so in that case we
265     // should run into zk issues before the LRU hits this value of 31.
266     MAX_CACHED_CONNECTION_INSTANCES = HBaseConfiguration.create().getInt(
267       HConstants.ZOOKEEPER_MAX_CLIENT_CNXNS, HConstants.DEFAULT_ZOOKEPER_MAX_CLIENT_CNXNS) + 1;
268     CONNECTION_INSTANCES = new LinkedHashMap<HConnectionKey, HConnectionImplementation>(
269         (int) (MAX_CACHED_CONNECTION_INSTANCES / 0.75F) + 1, 0.75F, true) {
270       @Override
271       protected boolean removeEldestEntry(
272           Map.Entry<HConnectionKey, HConnectionImplementation> eldest) {
273         return size() > MAX_CACHED_CONNECTION_INSTANCES;
274       }
275     };
276   }
277 
278   /*
279    * Non-instantiable.
280    */
281   private HConnectionManager() {
282     super();
283   }
284 
285   /**
286    * @param conn The connection for which to replace the generator.
287    * @param cnm Replaces the nonce generator used, for testing.
288    * @return old nonce generator.
289    */
290   @VisibleForTesting
291   public static NonceGenerator injectNonceGeneratorForTesting(
292       HConnection conn, NonceGenerator cnm) {
293     NonceGenerator ng = conn.getNonceGenerator();
294     LOG.warn("Nonce generator is being replaced by test code for " + cnm.getClass().getName());
295     ((HConnectionImplementation)conn).nonceGenerator = cnm;
296     return ng;
297   }
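
  // A minimal sketch of how a test might use the hook above; the stub
  // generator choice and the restore step are illustrative, not taken from
  // an existing test:
  //
  //   NonceGenerator old = HConnectionManager.injectNonceGeneratorForTesting(
  //       conn, new PerClientRandomNonceGenerator());
  //   try {
  //     // exercise operations whose nonce handling is under test
  //   } finally {
  //     HConnectionManager.injectNonceGeneratorForTesting(conn, old); // restore
  //   }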
298 
299   /**
300    * Get the connection that goes with the passed <code>conf</code> configuration instance.
301    * If no current connection exists, method creates a new connection and keys it using
302    * connection-specific properties from the passed {@link Configuration}; see
303    * {@link HConnectionKey}.
304    * @param conf configuration
305    * @return HConnection object for <code>conf</code>
306    * @throws ZooKeeperConnectionException
307    */
308   @Deprecated
309   public static HConnection getConnection(final Configuration conf)
310   throws IOException {
311     HConnectionKey connectionKey = new HConnectionKey(conf);
312     synchronized (CONNECTION_INSTANCES) {
313       HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
314       if (connection == null) {
315         connection = (HConnectionImplementation)createConnection(conf, true);
316         CONNECTION_INSTANCES.put(connectionKey, connection);
317       } else if (connection.isClosed()) {
318         HConnectionManager.deleteConnection(connectionKey, true);
319         connection = (HConnectionImplementation)createConnection(conf, true);
320         CONNECTION_INSTANCES.put(connectionKey, connection);
321       }
322       connection.incCount();
323       return connection;
324     }
325   }
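
  // A short sketch of the reference counting done above (assumes a reachable
  // cluster; two calls with equivalent Configurations share one instance):
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   HConnection c1 = HConnectionManager.getConnection(conf); // refCount 1
  //   HConnection c2 = HConnectionManager.getConnection(conf); // same instance, refCount 2
  //   HConnectionManager.deleteConnection(conf); // refCount 1, connection stays open
  //   HConnectionManager.deleteConnection(conf); // refCount 0, resources released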
326 
327   /**
328    * Create a new HConnection instance using the passed <code>conf</code> instance.
329    * <p>Note: This bypasses the usual HConnection life cycle management done by
330    * {@link #getConnection(Configuration)}. The caller is responsible for
331    * calling {@link HConnection#close()} on the returned connection instance.
332    *
333    * This is the recommended way to create HConnections.
334    * <pre>{@code
335    * HConnection connection = HConnectionManager.createConnection(conf);
336    * HTableInterface table = connection.getTable("mytable");
337    * table.get(...);
338    * ...
339    * table.close();
340    * connection.close();
341    * }</pre>
342    *
343    * @param conf configuration
344    * @return HConnection object for <code>conf</code>
345    * @throws ZooKeeperConnectionException
346    */
347   public static HConnection createConnection(Configuration conf)
348   throws IOException {
349     UserProvider provider = UserProvider.instantiate(conf);
350     return createConnection(conf, false, null, provider.getCurrent());
351   }
352 
353   /**
354    * Create a new HConnection instance using the passed <code>conf</code> instance.
355    * <p>Note: This bypasses the usual HConnection life cycle management done by
356    * {@link #getConnection(Configuration)}. The caller is responsible for
357    * calling {@link HConnection#close()} on the returned connection instance.
358    * This is the recommended way to create HConnections.
359    * <pre>{@code
360    * ExecutorService pool = ...;
361    * HConnection connection = HConnectionManager.createConnection(conf, pool);
362    * HTableInterface table = connection.getTable("mytable");
363    * table.get(...);
364    * ...
365    * table.close();
366    * connection.close();
367    * }</pre>
368    * @param conf configuration
369    * @param pool the thread pool to use for batch operation in HTables used via this HConnection
370    * @return HConnection object for <code>conf</code>
371    * @throws ZooKeeperConnectionException
372    */
373   public static HConnection createConnection(Configuration conf, ExecutorService pool)
374   throws IOException {
375     UserProvider provider = UserProvider.instantiate(conf);
376     return createConnection(conf, false, pool, provider.getCurrent());
377   }
378 
379   /**
380    * Create a new HConnection instance using the passed <code>conf</code> instance.
381    * <p>Note: This bypasses the usual HConnection life cycle management done by
382    * {@link #getConnection(Configuration)}. The caller is responsible for
383    * calling {@link HConnection#close()} on the returned connection instance.
384    * This is the recommended way to create HConnections.
385    * <pre>{@code
386    * User user = ...;
387    * HConnection connection = HConnectionManager.createConnection(conf, user);
388    * HTableInterface table = connection.getTable("mytable");
389    * table.get(...);
390    * ...
391    * table.close();
392    * connection.close();
393    * }</pre>
394    * @param conf configuration
395    * @param user the user the connection is for
396    * @return HConnection object for <code>conf</code>
397    * @throws ZooKeeperConnectionException
398    */
399   public static HConnection createConnection(Configuration conf, User user)
400   throws IOException {
401     return createConnection(conf, false, null, user);
402   }
403 
404   /**
405    * Create a new HConnection instance using the passed <code>conf</code> instance.
406    * <p>Note: This bypasses the usual HConnection life cycle management done by
407    * {@link #getConnection(Configuration)}. The caller is responsible for
408    * calling {@link HConnection#close()} on the returned connection instance.
409    * This is the recommended way to create HConnections.
410    * <pre>{@code
411    * ExecutorService pool = ...;
412    * User user = ...;
413    * HConnection connection = HConnectionManager.createConnection(conf, pool, user);
414    * HTableInterface table = connection.getTable("mytable");
415    * table.get(...);
416    * table.close();
417    * connection.close();
418    * }</pre>
419    * @param conf configuration
420    * @param pool the thread pool to use for batch operation in HTables used via this HConnection
421    * @param user the user the connection is for
422    * @return HConnection object for <code>conf</code>
423    * @throws ZooKeeperConnectionException
424    */
425   public static HConnection createConnection(Configuration conf, ExecutorService pool, User user)
426   throws IOException {
427     return createConnection(conf, false, pool, user);
428   }
429 
430   @Deprecated
431   static HConnection createConnection(final Configuration conf, final boolean managed)
432       throws IOException {
433     UserProvider provider = UserProvider.instantiate(conf);
434     return createConnection(conf, managed, null, provider.getCurrent());
435   }
436 
437   @Deprecated
438   static HConnection createConnection(final Configuration conf, final boolean managed,
439       final ExecutorService pool, final User user)
440   throws IOException {
441     String className = conf.get("hbase.client.connection.impl",
442       HConnectionManager.HConnectionImplementation.class.getName());
443     Class<?> clazz = null;
444     try {
445       clazz = Class.forName(className);
446     } catch (ClassNotFoundException e) {
447       throw new IOException(e);
448     }
449     try {
450       // Default HCM#HCI is not accessible; make it so before invoking.
451       Constructor<?> constructor =
452         clazz.getDeclaredConstructor(Configuration.class,
453           boolean.class, ExecutorService.class, User.class);
454       constructor.setAccessible(true);
455       return (HConnection) constructor.newInstance(conf, managed, pool, user);
456     } catch (Exception e) {
457       throw new IOException(e);
458     }
459   }
460 
461   /**
462    * Delete connection information for the instance specified by passed configuration.
463    * If there are no more references to the designated connection, this method will
464    * then close the connection to the zookeeper ensemble and let go of all associated resources.
465    *
466    * @param conf configuration whose identity is used to find {@link HConnection} instance.
467    * @deprecated
468    */
469   public static void deleteConnection(Configuration conf) {
470     deleteConnection(new HConnectionKey(conf), false);
471   }
472 
473   /**
474    * Clean up a known stale connection.
475    * This will close the connection to the zookeeper ensemble and let go of all resources.
476    *
477    * @param connection
478    * @deprecated
479    */
480   public static void deleteStaleConnection(HConnection connection) {
481     deleteConnection(connection, true);
482   }
483 
484   /**
485    * Delete information for all connections. Whether each connection is actually closed
486    *  depends on the staleConnection boolean and its reference count. In general you should
487    *  call this with staleConnection set to true.
488    * @deprecated
489    */
490   public static void deleteAllConnections(boolean staleConnection) {
491     synchronized (CONNECTION_INSTANCES) {
492       Set<HConnectionKey> connectionKeys = new HashSet<HConnectionKey>();
493       connectionKeys.addAll(CONNECTION_INSTANCES.keySet());
494       for (HConnectionKey connectionKey : connectionKeys) {
495         deleteConnection(connectionKey, staleConnection);
496       }
497       CONNECTION_INSTANCES.clear();
498     }
499   }
500 
501   /**
502    * Delete information for all connections.
503    * @deprecated kept for backward compatibility, but the behavior is broken. HBASE-8983
504    */
505   @Deprecated
506   public static void deleteAllConnections() {
507     deleteAllConnections(false);
508   }
509 
510 
511   @Deprecated
512   private static void deleteConnection(HConnection connection, boolean staleConnection) {
513     synchronized (CONNECTION_INSTANCES) {
514       for (Entry<HConnectionKey, HConnectionImplementation> e: CONNECTION_INSTANCES.entrySet()) {
515         if (e.getValue() == connection) {
516           deleteConnection(e.getKey(), staleConnection);
517           break;
518         }
519       }
520     }
521   }
522 
523   @Deprecated
524   private static void deleteConnection(HConnectionKey connectionKey, boolean staleConnection) {
525     synchronized (CONNECTION_INSTANCES) {
526       HConnectionImplementation connection = CONNECTION_INSTANCES.get(connectionKey);
527       if (connection != null) {
528         connection.decCount();
529         if (connection.isZeroReference() || staleConnection) {
530           CONNECTION_INSTANCES.remove(connectionKey);
531           connection.internalClose();
532         }
533       } else {
534         LOG.error("Connection not found in the list, can't delete it "+
535           "(connection key=" + connectionKey + "). Maybe the key was modified?", new Exception());
536       }
537     }
538   }
539 
540   /**
541    * This is provided for unit test cases that verify the behavior of the region
542    * location cache prefetch.
543    * @return Number of cached regions for the table.
544    * @throws ZooKeeperConnectionException
545    */
546   static int getCachedRegionCount(Configuration conf, final TableName tableName)
547   throws IOException {
548     return execute(new HConnectable<Integer>(conf) {
549       @Override
550       public Integer connect(HConnection connection) {
551         return ((HConnectionImplementation)connection).getNumberOfCachedRegionLocations(tableName);
552       }
553     });
554   }
555 
556   /**
557    * This convenience method invokes the given {@link HConnectable#connect}
558    * implementation using a {@link HConnection} instance that lasts just for the
559    * duration of the invocation.
560    *
561    * @param <T> the return type of the connect method
562    * @param connectable the {@link HConnectable} instance
563    * @return the value returned by the connect method
564    * @throws IOException
565    */
566   @InterfaceAudience.Private
567   public static <T> T execute(HConnectable<T> connectable) throws IOException {
568     if (connectable == null || connectable.conf == null) {
569       return null;
570     }
571     Configuration conf = connectable.conf;
572     HConnection connection = HConnectionManager.getConnection(conf);
573     boolean connectSucceeded = false;
574     try {
575       T returnValue = connectable.connect(connection);
576       connectSucceeded = true;
577       return returnValue;
578     } finally {
579       try {
580         connection.close();
581       } catch (Exception e) {
582         ExceptionUtil.rethrowIfInterrupt(e);
583         if (connectSucceeded) {
584           throw new IOException("The connection to " + connection
585               + " could not be closed.", e);
586         }
587       }
588     }
589   }
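
  // A minimal sketch of a caller of execute(); the table name is illustrative:
  //
  //   boolean enabled = HConnectionManager.execute(new HConnectable<Boolean>(conf) {
  //     @Override
  //     public Boolean connect(HConnection connection) throws IOException {
  //       return connection.isTableEnabled(TableName.valueOf("t1"));
  //     }
  //   });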
590 
591   /** Encapsulates connection to zookeeper and regionservers.*/
592   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
593       value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
594       justification="Access to the concurrent hash map is under a lock so should be fine.")
595   public static class HConnectionImplementation implements HConnection, Closeable {
596     static final Log LOG = LogFactory.getLog(HConnectionImplementation.class);
597     private final long pause;
598     private final int numTries;
599     final int rpcTimeout;
600     private NonceGenerator nonceGenerator = null;
601     private final boolean usePrefetch;
602     private final int prefetchRegionLimit;
603 
604     private volatile boolean closed;
605     private volatile boolean aborted;
606 
607     // package protected for the tests
608     ClusterStatusListener clusterStatusListener;
609 
610     private final Object userRegionLock = new Object();
611 
612     // We have a single lock for master & zk to prevent deadlocks. Having
613     //  one lock for ZK and one lock for master is not possible:
614     //  When creating a connection to master, we need a connection to ZK to get
615     //  its address. But another thread could have taken the ZK lock, and could
616     //  be waiting for the master lock => deadlock.
617     private final Object masterAndZKLock = new Object();
618 
619     private long keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
620     private final DelayedClosing delayedClosing =
621       DelayedClosing.createAndStart(this);
622 
623     // thread executor shared by all HTableInterface instances created
624     // by this connection
625     private volatile ExecutorService batchPool = null;
626     private volatile boolean cleanupPool = false;
627 
628     private final Configuration conf;
629 
630     // cache the configuration value for tables so that we can avoid calling
631     // the expensive Configuration to fetch the value multiple times.
632     private final TableConfiguration tableConfig;
633 
634     // Client rpc instance.
635     private RpcClient rpcClient;
636 
637     /**
638      * Map of table to table {@link HRegionLocation}s.
639      */
640     private final ConcurrentMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>
641         cachedRegionLocations =
642       new ConcurrentHashMap<TableName, ConcurrentSkipListMap<byte[], HRegionLocation>>();
643 
644     // The presence of a server in the map implies it's likely that there is an
645     // entry in cachedRegionLocations that maps to this server; but the absence
646     // of a server in this map guarantees that there is no entry in the cache that
647     // maps to the absent server.
648     // The access to this attribute must be protected by a lock on cachedRegionLocations
649     private final Set<ServerName> cachedServers = new ConcurrentSkipListSet<ServerName>();
650 
651     // region cache prefetch is enabled by default. this set contains all
652     // tables whose region cache prefetch are disabled.
653     private final Set<Integer> regionCachePrefetchDisabledTables =
654       new CopyOnWriteArraySet<Integer>();
655 
656     private int refCount;
657 
658     // indicates whether this connection's life cycle is managed (by us)
659     private boolean managed;
660 
661     private User user;
662 
663     private RpcRetryingCallerFactory rpcCallerFactory;
664 
665     private RpcControllerFactory rpcControllerFactory;
666 
667     /**
668      * Cluster registry of basic info such as clusterid and meta region location.
669      */
670     Registry registry;
671 
672     HConnectionImplementation(Configuration conf, boolean managed) throws IOException {
673       this(conf, managed, null, null);
674     }
675 
676     /**
677      * constructor
678      * @param conf Configuration object
679      * @param managed If true, does not do full shutdown on close; i.e. cleanup of connection
680      * to zk and shutdown of all services; we just close down the resources this connection was
681      * responsible for and decrement usage counters.  It is up to the caller to do the full
682      * cleanup.  It is set when we want to have connection sharing going on -- reuse of the zk
683      * connection, cached region locations, established regionserver connections, etc.  When
684      * connections are shared, we have reference counting going on and will only do full cleanup
685      * when there are no more users of an HConnectionImplementation instance.
686      */
687     HConnectionImplementation(Configuration conf, boolean managed,
688         ExecutorService pool, User user) throws IOException {
689       this(conf);
690       this.user = user;
691       this.batchPool = pool;
692       this.managed = managed;
693       this.registry = setupRegistry();
694       retrieveClusterId();
695 
696       this.rpcClient = new RpcClient(this.conf, this.clusterId);
697 
698       // Do we publish the status?
699       boolean shouldListen = conf.getBoolean(HConstants.STATUS_PUBLISHED,
700           HConstants.STATUS_PUBLISHED_DEFAULT);
701       Class<? extends ClusterStatusListener.Listener> listenerClass =
702           conf.getClass(ClusterStatusListener.STATUS_LISTENER_CLASS,
703               ClusterStatusListener.DEFAULT_STATUS_LISTENER_CLASS,
704               ClusterStatusListener.Listener.class);
705       if (shouldListen) {
706         if (listenerClass == null) {
707           LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
708               ClusterStatusListener.STATUS_LISTENER_CLASS + " is not set - not listening for status");
709         } else {
710           clusterStatusListener = new ClusterStatusListener(
711               new ClusterStatusListener.DeadServerHandler() {
712                 @Override
713                 public void newDead(ServerName sn) {
714                   clearCaches(sn);
715                   rpcClient.cancelConnections(sn.getHostname(), sn.getPort(),
716                       new SocketException(sn.getServerName() +
717                           " is dead: closing its connection."));
718                 }
719               }, conf, listenerClass);
720         }
721       }
722 
723       this.rpcCallerFactory = RpcRetryingCallerFactory.instantiate(conf);
724       this.rpcControllerFactory = RpcControllerFactory.instantiate(conf);
725     }
726 
727     /** Dummy nonce generator for disabled nonces. */
728     private static class NoNonceGenerator implements NonceGenerator {
729       @Override
730       public long getNonceGroup() {
731         return HConstants.NO_NONCE;
732       }
733       @Override
734       public long newNonce() {
735         return HConstants.NO_NONCE;
736       }
737     }
738 
739     /**
740      * For tests.
741      */
742     protected HConnectionImplementation(Configuration conf) {
743       this.conf = conf;
744       this.tableConfig = new TableConfiguration(conf);
745       this.closed = false;
746       this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
747           HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
748       this.numTries = tableConfig.getRetriesNumber();
749       this.rpcTimeout = conf.getInt(
750           HConstants.HBASE_RPC_TIMEOUT_KEY,
751           HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
752       if (conf.getBoolean(CLIENT_NONCES_ENABLED_KEY, true)) {
753         synchronized (HConnectionManager.nonceGeneratorCreateLock) {
754           if (HConnectionManager.nonceGenerator == null) {
755             HConnectionManager.nonceGenerator = new PerClientRandomNonceGenerator();
756           }
757           this.nonceGenerator = HConnectionManager.nonceGenerator;
758         }
759       } else {
760         this.nonceGenerator = new NoNonceGenerator();
761       }
762 
763       this.usePrefetch = conf.getBoolean(HConstants.HBASE_CLIENT_PREFETCH,
764           HConstants.DEFAULT_HBASE_CLIENT_PREFETCH);
765       this.prefetchRegionLimit = conf.getInt(
766           HConstants.HBASE_CLIENT_PREFETCH_LIMIT,
767           HConstants.DEFAULT_HBASE_CLIENT_PREFETCH_LIMIT);
768       this.rpcCallerFactory = RpcRetryingCallerFactory.instantiate(conf);
769       this.rpcControllerFactory = RpcControllerFactory.instantiate(conf);
770     }
771 
772     @Override
773     public HTableInterface getTable(String tableName) throws IOException {
774       return getTable(TableName.valueOf(tableName));
775     }
776 
777     @Override
778     public HTableInterface getTable(byte[] tableName) throws IOException {
779       return getTable(TableName.valueOf(tableName));
780     }
781 
782     @Override
783     public HTableInterface getTable(TableName tableName) throws IOException {
784       return getTable(tableName, getBatchPool());
785     }
786 
787     @Override
788     public HTableInterface getTable(String tableName, ExecutorService pool) throws IOException {
789       return getTable(TableName.valueOf(tableName), pool);
790     }
791 
792     @Override
793     public HTableInterface getTable(byte[] tableName, ExecutorService pool) throws IOException {
794       return getTable(TableName.valueOf(tableName), pool);
795     }
796 
797     @Override
798     public HTableInterface getTable(TableName tableName, ExecutorService pool) throws IOException {
799       if (managed) {
800         throw new IOException("The connection has to be unmanaged.");
801       }
802       return new HTable(tableName, this, tableConfig, rpcCallerFactory, rpcControllerFactory,
803         pool);
804     }
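
    // Sketch of handing this method a caller-owned pool; the caller remains
    // responsible for shutting the pool down (names are illustrative):
    //
    //   ExecutorService myPool = Executors.newFixedThreadPool(8);
    //   HTableInterface t = connection.getTable(TableName.valueOf("t1"), myPool);
    //   try { /* use t */ } finally { t.close(); }
    //   myPool.shutdown();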
805 
806     private ExecutorService getBatchPool() {
807       if (batchPool == null) {
808         // shared HTable thread executor not yet initialized
809         synchronized (this) {
810           if (batchPool == null) {
811             int maxThreads = conf.getInt("hbase.hconnection.threads.max", 256);
812             int coreThreads = conf.getInt("hbase.hconnection.threads.core", 256);
813             if (maxThreads == 0) {
814               maxThreads = Runtime.getRuntime().availableProcessors() * 8;
815             }
816             if (coreThreads == 0) {
817               coreThreads = Runtime.getRuntime().availableProcessors() * 8;
818             }
819             long keepAliveTime = conf.getLong("hbase.hconnection.threads.keepalivetime", 60);
820             LinkedBlockingQueue<Runnable> workQueue =
821               new LinkedBlockingQueue<Runnable>(maxThreads *
822                 conf.getInt(HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS,
823                   HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS));
824             ThreadPoolExecutor tpe = new ThreadPoolExecutor(
825                 coreThreads,
826                 maxThreads,
827                 keepAliveTime,
828                 TimeUnit.SECONDS,
829                 workQueue,
830                 Threads.newDaemonThreadFactory(toString() + "-shared-"));
831             tpe.allowCoreThreadTimeOut(true);
832             this.batchPool = tpe;
833           }
834           this.cleanupPool = true;
835         }
836       }
837       return this.batchPool;
838     }
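
    // The pool above is sized from configuration; a sketch of tuning it via
    // the Configuration used to create the connection (the same keys read in
    // getBatchPool()):
    //
    //   conf.setInt("hbase.hconnection.threads.max", 64);
    //   conf.setInt("hbase.hconnection.threads.core", 8);
    //   conf.setLong("hbase.hconnection.threads.keepalivetime", 30); // seconds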
839 
840     protected ExecutorService getCurrentBatchPool() {
841       return batchPool;
842     }
843 
844     private void shutdownBatchPool() {
845       if (this.cleanupPool && this.batchPool != null && !this.batchPool.isShutdown()) {
846         this.batchPool.shutdown();
847         try {
848           if (!this.batchPool.awaitTermination(10, TimeUnit.SECONDS)) {
849             this.batchPool.shutdownNow();
850           }
851         } catch (InterruptedException e) {
852           this.batchPool.shutdownNow();
853         }
854       }
855     }
856 
857     /**
858      * @return The cluster registry implementation to use.
859      * @throws IOException
860      */
861     private Registry setupRegistry() throws IOException {
862       String registryClass = this.conf.get("hbase.client.registry.impl",
863         ZooKeeperRegistry.class.getName());
864       Registry registry = null;
865       try {
866         registry = (Registry)Class.forName(registryClass).newInstance();
867       } catch (Throwable t) {
868         throw new IOException(t);
869       }
870       registry.init(this);
871       return registry;
872     }
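
    // Sketch of plugging in an alternative registry; MyRegistry is a
    // hypothetical class that implements Registry and has a no-arg constructor:
    //
    //   conf.set("hbase.client.registry.impl", MyRegistry.class.getName());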
873 
874     /**
875      * For tests only.
876      * @param rpcClient Client we should use instead.
877      * @return Previous rpcClient
878      */
879     RpcClient setRpcClient(final RpcClient rpcClient) {
880       RpcClient oldRpcClient = this.rpcClient;
881       this.rpcClient = rpcClient;
882       return oldRpcClient;
883     }
884 
885     /**
886      * An identifier that will remain the same for a given connection.
887      * @return a stable, short identifier string for this connection
888      */
889     public String toString() {
890       return "hconnection-0x" + Integer.toHexString(hashCode());
891     }
892 
893     protected String clusterId = null;
894 
895     void retrieveClusterId() {
896       if (clusterId != null) return;
897       this.clusterId = this.registry.getClusterId();
898       if (clusterId == null) {
899         clusterId = HConstants.CLUSTER_ID_DEFAULT;
900         LOG.debug("clusterid came back null, using default " + clusterId);
901       }
902     }
903 
904     @Override
905     public Configuration getConfiguration() {
906       return this.conf;
907     }
908 
909     private void checkIfBaseNodeAvailable(ZooKeeperWatcher zkw)
910       throws MasterNotRunningException {
911       String errorMsg;
912       try {
913         if (ZKUtil.checkExists(zkw, zkw.baseZNode) == -1) {
914           errorMsg = "The node " + zkw.baseZNode+" is not in ZooKeeper. "
915             + "It should have been written by the master. "
916             + "Check the value configured in 'zookeeper.znode.parent'. "
917             + "There could be a mismatch with the one configured in the master.";
918           LOG.error(errorMsg);
919           throw new MasterNotRunningException(errorMsg);
920         }
921       } catch (KeeperException e) {
922         errorMsg = "Can't get connection to ZooKeeper: " + e.getMessage();
923         LOG.error(errorMsg);
924         throw new MasterNotRunningException(errorMsg, e);
925       }
926     }
927 
928     /**
929      * @return true if the master is running, throws an exception otherwise
930      * @throws MasterNotRunningException - if the master is not running
931      * @throws ZooKeeperConnectionException
932      */
933     @Override
934     public boolean isMasterRunning()
935     throws MasterNotRunningException, ZooKeeperConnectionException {
936       // When getting the master connection, we check that it's running,
937       // so if there is no exception, it means we've been able to get a
938       // connection to a running master
939       MasterKeepAliveConnection m = getKeepAliveMasterService();
940       m.close();
941       return true;
942     }
943 
944     @Override
945     public HRegionLocation getRegionLocation(final TableName tableName,
946         final byte [] row, boolean reload)
947     throws IOException {
948       return reload? relocateRegion(tableName, row): locateRegion(tableName, row);
949     }
950 
951     @Override
952     public HRegionLocation getRegionLocation(final byte[] tableName,
953         final byte [] row, boolean reload)
954     throws IOException {
955       return getRegionLocation(TableName.valueOf(tableName), row, reload);
956     }
957 
958     @Override
959     public boolean isTableEnabled(TableName tableName) throws IOException {
960       return this.registry.isTableOnlineState(tableName, true);
961     }
962 
963     @Override
964     public boolean isTableEnabled(byte[] tableName) throws IOException {
965       return isTableEnabled(TableName.valueOf(tableName));
966     }
967 
968     @Override
969     public boolean isTableDisabled(TableName tableName) throws IOException {
970       return this.registry.isTableOnlineState(tableName, false);
971     }
972 
973     @Override
974     public boolean isTableDisabled(byte[] tableName) throws IOException {
975       return isTableDisabled(TableName.valueOf(tableName));
976     }
977 
978     @Override
979     public boolean isTableAvailable(final TableName tableName) throws IOException {
980       final AtomicBoolean available = new AtomicBoolean(true);
981       final AtomicInteger regionCount = new AtomicInteger(0);
982       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
983         @Override
984         public boolean processRow(Result row) throws IOException {
985           HRegionInfo info = MetaScanner.getHRegionInfo(row);
986           if (info != null && !info.isSplitParent()) {
987             if (tableName.equals(info.getTable())) {
988               ServerName server = HRegionInfo.getServerName(row);
989               if (server == null) {
990                 available.set(false);
991                 return false;
992               }
993               regionCount.incrementAndGet();
994             } else if (tableName.compareTo(info.getTable()) < 0) {
995               // Return if we are done with the current table
996               return false;
997             }
998           }
999           return true;
1000         }
1001       };
1002       MetaScanner.metaScan(conf, this, visitor, tableName);
1003       return available.get() && (regionCount.get() > 0);
1004     }
1005 
1006     @Override
1007     public boolean isTableAvailable(final byte[] tableName) throws IOException {
1008       return isTableAvailable(TableName.valueOf(tableName));
1009     }
1010 
1011     @Override
1012     public boolean isTableAvailable(final TableName tableName, final byte[][] splitKeys)
1013         throws IOException {
1014       final AtomicBoolean available = new AtomicBoolean(true);
1015       final AtomicInteger regionCount = new AtomicInteger(0);
1016       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
1017         @Override
1018         public boolean processRow(Result row) throws IOException {
1019           HRegionInfo info = MetaScanner.getHRegionInfo(row);
1020           if (info != null && !info.isSplitParent()) {
1021             if (tableName.equals(info.getTable())) {
1022               ServerName server = HRegionInfo.getServerName(row);
1023               if (server == null) {
1024                 available.set(false);
1025                 return false;
1026               }
1027               if (!Bytes.equals(info.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
1028                 for (byte[] splitKey : splitKeys) {
1029                   // Just check if the splitkey is available
1030                   if (Bytes.equals(info.getStartKey(), splitKey)) {
1031                     regionCount.incrementAndGet();
1032                     break;
1033                   }
1034                 }
1035               } else {
1036                 // Always empty start row should be counted
1037                 regionCount.incrementAndGet();
1038               }
1039             } else if (tableName.compareTo(info.getTable()) < 0) {
1040               // Return if we are done with the current table
1041               return false;
1042             }
1043           }
1044           return true;
1045         }
1046       };
1047       MetaScanner.metaScan(conf, this, visitor, tableName);
1048       // +1 needs to be added so that the empty start row is also taken into account
1049       return available.get() && (regionCount.get() == splitKeys.length + 1);
1050     }
1051 
1052     @Override
1053     public boolean isTableAvailable(final byte[] tableName, final byte[][] splitKeys)
1054         throws IOException {
1055       return isTableAvailable(TableName.valueOf(tableName), splitKeys);
1056     }
1057 
1058     @Override
1059     public HRegionLocation locateRegion(final byte[] regionName) throws IOException {
1060       return locateRegion(HRegionInfo.getTable(regionName),
1061           HRegionInfo.getStartKey(regionName), false, true);
1062     }
1063 
1064     @Override
1065     public boolean isDeadServer(ServerName sn) {
1066       if (clusterStatusListener == null) {
1067         return false;
1068       } else {
1069         return clusterStatusListener.isDeadServer(sn);
1070       }
1071     }
1072 
1073     @Override
1074     public List<HRegionLocation> locateRegions(final TableName tableName)
1075     throws IOException {
1076       return locateRegions(tableName, false, true);
1077     }
1078 
1079     @Override
1080     public List<HRegionLocation> locateRegions(final byte[] tableName)
1081     throws IOException {
1082       return locateRegions(TableName.valueOf(tableName));
1083     }
1084 
1085     @Override
1086     public List<HRegionLocation> locateRegions(final TableName tableName,
1087         final boolean useCache, final boolean offlined) throws IOException {
1088       NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(conf, this,
1089           tableName, offlined);
1090       final List<HRegionLocation> locations = new ArrayList<HRegionLocation>();
1091       for (HRegionInfo regionInfo : regions.keySet()) {
1092         locations.add(locateRegion(tableName, regionInfo.getStartKey(), useCache, true));
1093       }
1094       return locations;
1095     }
1096 
1097     @Override
1098     public List<HRegionLocation> locateRegions(final byte[] tableName,
1099        final boolean useCache, final boolean offlined) throws IOException {
1100       return locateRegions(TableName.valueOf(tableName), useCache, offlined);
1101     }
1102 
1103     @Override
1104     public HRegionLocation locateRegion(final TableName tableName,
1105         final byte [] row)
1106     throws IOException{
1107       return locateRegion(tableName, row, true, true);
1108     }
1109 
1110     @Override
1111     public HRegionLocation locateRegion(final byte[] tableName,
1112         final byte [] row)
1113     throws IOException{
1114       return locateRegion(TableName.valueOf(tableName), row);
1115     }
1116 
1117     @Override
1118     public HRegionLocation relocateRegion(final TableName tableName,
1119         final byte [] row) throws IOException{
1120       // Since this is an explicit request not to use any caching, finding
1121       // disabled tables should not be desirable.  This will ensure that an exception is
1122       // thrown the first time a disabled table is interacted with.
1123       if (isTableDisabled(tableName)) {
1124         throw new TableNotEnabledException(tableName.getNameAsString() + " is disabled.");
1125       }
1126 
1127       return locateRegion(tableName, row, false, true);
1128     }
1129 
1130     @Override
1131     public HRegionLocation relocateRegion(final byte[] tableName,
1132         final byte [] row) throws IOException {
1133       return relocateRegion(TableName.valueOf(tableName), row);
1134     }
1135 
1136 
1137     private HRegionLocation locateRegion(final TableName tableName,
1138       final byte [] row, boolean useCache, boolean retry)
1139     throws IOException {
1140       if (this.closed) throw new IOException(toString() + " closed");
1141       if (tableName == null || tableName.getName().length == 0) {
1142         throw new IllegalArgumentException(
1143             "table name cannot be null or zero length");
1144       }
1145 
1146       if (tableName.equals(TableName.META_TABLE_NAME)) {
1147         return this.registry.getMetaRegionLocation();
1148       } else {
1149         // Region not in the cache - have to go to the meta RS
1150         return locateRegionInMeta(TableName.META_TABLE_NAME, tableName, row,
1151           useCache, userRegionLock, retry);
1152       }
1153     }
1154 
1155     /*
1156      * Search hbase:meta for the HRegionLocation info that contains the table and
1157      * row we're seeking. It will prefetch a certain number of regions' info and
1158      * save it to the global region cache.
1159      */
1160     private void prefetchRegionCache(final TableName tableName,
1161         final byte[] row) {
1162       // Implement a new visitor for MetaScanner, and use it to walk through
1163       // the hbase:meta
1164       MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
1165         public boolean processRow(Result result) throws IOException {
1166           try {
1167             HRegionInfo regionInfo = MetaScanner.getHRegionInfo(result);
1168             if (regionInfo == null) {
1169               return true;
1170             }
1171 
1172             // possible we got a region of a different table...
1173             if (!regionInfo.getTable().equals(tableName)) {
1174               return false; // stop scanning
1175             }
1176             if (regionInfo.isOffline()) {
1177               // don't cache offline regions
1178               return true;
1179             }
1180 
1181             ServerName serverName = HRegionInfo.getServerName(result);
1182             if (serverName == null) {
1183               return true; // don't cache it
1184             }
1185             // instantiate the location
1186             long seqNum = HRegionInfo.getSeqNumDuringOpen(result);
1187             HRegionLocation loc = new HRegionLocation(regionInfo, serverName, seqNum);
1188             // cache this meta entry
1189             cacheLocation(tableName, null, loc);
1190             return true;
1191           } catch (RuntimeException e) {
1192             throw new IOException(e);
1193           }
1194         }
1195       };
1196       try {
1197         // pre-fetch a certain number of regions' info into the region cache.
1198         MetaScanner.metaScan(conf, this, visitor, tableName, row,
1199             this.prefetchRegionLimit, TableName.META_TABLE_NAME);
1200       } catch (IOException e) {
1201         if (ExceptionUtil.isInterrupt(e)) {
1202           Thread.currentThread().interrupt();
1203         }
1204       }
1205     }
1206 
1207     /*
1208      * Search the hbase:meta table for the HRegionLocation
1209      * info that contains the table and row we're seeking.
1210      */
1211     private HRegionLocation locateRegionInMeta(final TableName parentTable,
1212       final TableName tableName, final byte [] row, boolean useCache,
1213       Object regionLockObject, boolean retry)
1214     throws IOException {
1215       HRegionLocation location;
1216       // If we are supposed to be using the cache, look in the cache to see if
1217       // we already have the region.
1218       if (useCache) {
1219         location = getCachedLocation(tableName, row);
1220         if (location != null) {
1221           return location;
1222         }
1223       }
1224       int localNumRetries = retry ? numTries : 1;
1225       // build the key of the meta region we should be looking for.
1226       // the extra 9's on the end are necessary to allow "exact" matches
1227       // without knowing the precise region names.
1228       byte [] metaKey = HRegionInfo.createRegionName(tableName, row,
1229         HConstants.NINES, false);
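           // For example, for table "t1" and row "row5" the key built above is
           // "t1,row5,99999999999999" (the NINES suffix). It sorts after the meta
           // row of the region containing "row5" and before the meta row of the
           // next region, so the getRowOrBefore lookup below lands on the right
           // region, whatever its real start key and region id are.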
1230       for (int tries = 0; true; tries++) {
1231         if (tries >= localNumRetries) {
1232           throw new NoServerForRegionException("Unable to find region for "
1233             + Bytes.toStringBinary(row) + " after " + localNumRetries + " tries.");
1234         }
1235 
1236         HRegionLocation metaLocation = null;
1237         try {
1238           // locate the meta region
1239           metaLocation = locateRegion(parentTable, metaKey, true, false);
1240           // If null still, go around again.
1241           if (metaLocation == null) continue;
1242           ClientService.BlockingInterface service = getClient(metaLocation.getServerName());
1243 
1244           Result regionInfoRow;
1245           // This block guards against two threads trying to load the meta
1246           // region at the same time. The first will load the meta region and
1247           // the second will use the value that the first one found.
1248           if (useCache) {
1249             if (TableName.META_TABLE_NAME.equals(parentTable) && usePrefetch &&
1250                 getRegionCachePrefetch(tableName)) {
1251               synchronized (regionLockObject) {
1252                 // Check the cache again for a hit in case some other thread made the
1253                 // same query while we were waiting on the lock.
1254                 location = getCachedLocation(tableName, row);
1255                 if (location != null) {
1256                   return location;
1257                 }
1258                 // If the parent table is META, we may want to pre-fetch some
1259                 // region info into the global region cache for this table.
1260                 prefetchRegionCache(tableName, row);
1261               }
1262             }
1263             location = getCachedLocation(tableName, row);
1264             if (location != null) {
1265               return location;
1266             }
1267           } else {
1268             // If we are not supposed to be using the cache, delete any existing cached location
1269             // so it won't interfere.
1270             forceDeleteCachedLocation(tableName, row);
1271           }
1272 
1273           // Query the meta region for the location of the meta region
1274           regionInfoRow =
1275               ProtobufUtil.getRowOrBefore(service, metaLocation.getRegionInfo().getRegionName(),
1276                 metaKey, HConstants.CATALOG_FAMILY);
1277 
1278           if (regionInfoRow == null) {
1279             throw new TableNotFoundException(tableName);
1280           }
1281 
1282           // convert the row result into the HRegionLocation we need!
1283           HRegionInfo regionInfo = MetaScanner.getHRegionInfo(regionInfoRow);
1284           if (regionInfo == null) {
1285             throw new IOException("HRegionInfo was null or empty in " +
1286               parentTable + ", row=" + regionInfoRow);
1287           }
1288 
1289           // possible we got a region of a different table...
1290           if (!regionInfo.getTable().equals(tableName)) {
1291             throw new TableNotFoundException(
1292                   "Table '" + tableName + "' was not found, got: " +
1293                   regionInfo.getTable() + ".");
1294           }
1295           if (regionInfo.isSplit()) {
1296             throw new RegionOfflineException("the only available region for" +
1297               " the required row is a split parent," +
1298               " the daughters should be online soon: " +
1299               regionInfo.getRegionNameAsString());
1300           }
1301           if (regionInfo.isOffline()) {
1302             throw new RegionOfflineException("the region is offline, could" +
1303               " be caused by a disable table call: " +
1304               regionInfo.getRegionNameAsString());
1305           }
1306 
1307           ServerName serverName = HRegionInfo.getServerName(regionInfoRow);
1308           if (serverName == null) {
1309             throw new NoServerForRegionException("No server address listed " +
1310               "in " + parentTable + " for region " +
1311               regionInfo.getRegionNameAsString() + " containing row " +
1312               Bytes.toStringBinary(row));
1313           }
1314 
1315           if (isDeadServer(serverName)){
1316             throw new RegionServerStoppedException("hbase:meta says the region "+
1317                 regionInfo.getRegionNameAsString()+" is managed by the server " + serverName +
1318                 ", but it is dead.");
1319           }
1320 
1321           // Instantiate the location
1322           location = new HRegionLocation(regionInfo, serverName,
1323             HRegionInfo.getSeqNumDuringOpen(regionInfoRow));
1324           cacheLocation(tableName, null, location);
1325           return location;
1326         } catch (TableNotFoundException e) {
1327           // if we got this error, probably means the table just plain doesn't
1328           // exist. rethrow the error immediately. this should always be coming
1329           // from the HTable constructor.
1330           throw e;
1331         } catch (IOException e) {
1332           ExceptionUtil.rethrowIfInterrupt(e);
1333 
1334           if (e instanceof RemoteException) {
1335             e = ((RemoteException)e).unwrapRemoteException();
1336           }
1337           if (tries < numTries - 1) {
1338             if (LOG.isDebugEnabled()) {
1339               LOG.debug("locateRegionInMeta parentTable=" +
1340                 parentTable + ", metaLocation=" +
1341                 ((metaLocation == null)? "null": "{" + metaLocation + "}") +
1342                 ", attempt=" + tries + " of " +
1343                 this.numTries + " failed; retrying after sleep of " +
1344                 ConnectionUtils.getPauseTime(this.pause, tries) + " because: " + e.getMessage());
1345             }
1346           } else {
1347             throw e;
1348           }
1349           // Only relocate the parent region if necessary
1350           if(!(e instanceof RegionOfflineException ||
1351               e instanceof NoServerForRegionException)) {
1352             relocateRegion(parentTable, metaKey);
1353           }
1354         }
1355         try {
1356           Thread.sleep(ConnectionUtils.getPauseTime(this.pause, tries));
1357         } catch (InterruptedException e) {
1358           throw new InterruptedIOException("Giving up trying to location region in " +
1359             "meta: thread is interrupted.");
1360         }
1361       }
1362     }
1363 
1364     /*
1365      * Search the cache for a location that fits our table and row key.
1366      * Return null if no suitable region is located.
1367      *
1368      * @param tableName
1369      * @param row
1370      * @return Null or region location found in cache.
1371      */
1372     HRegionLocation getCachedLocation(final TableName tableName,
1373         final byte [] row) {
1374       ConcurrentSkipListMap<byte[], HRegionLocation> tableLocations =
1375         getTableLocations(tableName);
1376 
1377       Entry<byte[], HRegionLocation> e = tableLocations.floorEntry(row);
1378       if (e == null) {
1379         return null;
1380       }
1381       HRegionLocation possibleRegion = e.getValue();
1382 
1383       // make sure that the end key is greater than the row we're looking
1384       // for, otherwise the row actually belongs in the next region, not
1385       // this one. the exception case is when the endkey is
1386       // HConstants.EMPTY_END_ROW, signifying that the region we're
1387       // checking is actually the last region in the table.
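           // Example: with cached regions whose start keys are "", "bbb" and "mmm",
           // a lookup for row "ccc" floors to the "bbb" region; it is only a hit if
           // that region's end key (here "mmm") sorts strictly after "ccc".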
1388       byte[] endKey = possibleRegion.getRegionInfo().getEndKey();
1389       if (Bytes.equals(endKey, HConstants.EMPTY_END_ROW) ||
1390           tableName.getRowComparator().compareRows(
1391               endKey, 0, endKey.length, row, 0, row.length) > 0) {
1392         return possibleRegion;
1393       }
1394 
1395       // Passed all the way through, so we got nothing - complete cache miss
1396       return null;
1397     }
1398 
1399     /**
1400      * Delete a cached location, no matter what it is. Called when we were told not to use the cache.
1401      * @param tableName table name
1402      * @param row row key
1403      */
1404     void forceDeleteCachedLocation(final TableName tableName, final byte [] row) {
1405       HRegionLocation rl = null;
1406       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1407       // start to examine the cache. we can only do cache actions
1408       // if there's something in the cache for this table.
1409       rl = getCachedLocation(tableName, row);
1410       if (rl != null) {
1411         tableLocations.remove(rl.getRegionInfo().getStartKey());
1412       }
1413       if ((rl != null) && LOG.isDebugEnabled()) {
1414         LOG.debug("Removed " + rl.getHostname() + ":" + rl.getPort()
1415           + " as a location of " + rl.getRegionInfo().getRegionNameAsString() +
1416           " for tableName=" + tableName + " from cache");
1417       }
1418     }
1419 
1420     /*
1421      * Delete all cached entries of a table that maps to a specific location.
1422      */
1423     @Override
1424     public void clearCaches(final ServerName serverName) {
1425       if (!this.cachedServers.contains(serverName)) {
1426         return;
1427       }
1428 
1429       boolean deletedSomething = false;
1430       synchronized (this.cachedServers) {
1431         // We block here, because if there is an error on a server, it's likely that multiple
1432         //  threads will get the error simultaneously. If there are hundreds of thousands of
1433         //  region locations to check, it's better to do this only once. A better pattern would
1434         //  be to check if the server is dead when we get the region location.
1435         if (!this.cachedServers.contains(serverName)) {
1436           return;
1437         }
1438         for (Map<byte[], HRegionLocation> tableLocations : cachedRegionLocations.values()) {
1439           for (Entry<byte[], HRegionLocation> e : tableLocations.entrySet()) {
1440             HRegionLocation value = e.getValue();
1441             if (value != null
1442                 && serverName.equals(value.getServerName())) {
1443               tableLocations.remove(e.getKey());
1444               deletedSomething = true;
1445             }
1446           }
1447         }
1448         this.cachedServers.remove(serverName);
1449       }
1450       if (deletedSomething && LOG.isDebugEnabled()) {
1451         LOG.debug("Removed all cached region locations that map to " + serverName);
1452       }
1453     }
1454 
1455     /*
1456      * @param tableName
1457      * @return Map of cached locations for passed <code>tableName</code>
1458      */
1459     private ConcurrentSkipListMap<byte[], HRegionLocation> getTableLocations(
1460         final TableName tableName) {
1461       // find the map of cached locations for this table
1462       ConcurrentSkipListMap<byte[], HRegionLocation> result;
1463       result = this.cachedRegionLocations.get(tableName);
1464       // if tableLocations for this table isn't built yet, make one
1465       if (result == null) {
1466         result = new ConcurrentSkipListMap<byte[], HRegionLocation>(Bytes.BYTES_COMPARATOR);
1467         ConcurrentSkipListMap<byte[], HRegionLocation> old =
1468             this.cachedRegionLocations.putIfAbsent(tableName, result);
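             // putIfAbsent returns the previous mapping when another thread won
             // the race; return that winner so all callers share one map per table.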
1469         if (old != null) {
1470           return old;
1471         }
1472       }
1473       return result;
1474     }
1475 
1476     @Override
1477     public void clearRegionCache() {
1478       this.cachedRegionLocations.clear();
1479       this.cachedServers.clear();
1480     }
1481 
1482     @Override
1483     public void clearRegionCache(final TableName tableName) {
1484       this.cachedRegionLocations.remove(tableName);
1485     }
1486 
1487     @Override
1488     public void clearRegionCache(final byte[] tableName) {
1489       clearRegionCache(TableName.valueOf(tableName));
1490     }
1491 
1492     /**
1493      * Put a newly discovered HRegionLocation into the cache.
1494      * @param tableName The table name.
1495      * @param source the source of the new location, if it's not coming from meta
1496      * @param location the new location
1497      */
1498     private void cacheLocation(final TableName tableName, final HRegionLocation source,
1499         final HRegionLocation location) {
1500       boolean isFromMeta = (source == null);
1501       byte [] startKey = location.getRegionInfo().getStartKey();
1502       ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
1503       HRegionLocation oldLocation = tableLocations.putIfAbsent(startKey, location);
1504       boolean isNewCacheEntry = (oldLocation == null);
1505       if (isNewCacheEntry) {
1506         cachedServers.add(location.getServerName());
1507         return;
1508       }
1509       boolean updateCache;
1510       // If the server in cache sends us a redirect, assume it's always valid.
1511       if (oldLocation.equals(source)) {
1512         updateCache = true;
1513       } else {
1514         long newLocationSeqNum = location.getSeqNum();
1515         // Meta record is stale - some (probably the same) server has closed the region
1516         // with later seqNum and told us about the new location.
1517         boolean isStaleMetaRecord = isFromMeta && (oldLocation.getSeqNum() > newLocationSeqNum);
1518         // Same as above for redirect. However, in this case, if the number is equal to previous
1519         // record, the most common case is that first the region was closed with seqNum, and then
1520         // opened with the same seqNum; hence we will ignore the redirect.
1521         // There are so many corner cases with various combinations of opens and closes that
1522         // an additional counter on top of seqNum would be necessary to handle them all.
1523         boolean isStaleRedirect = !isFromMeta && (oldLocation.getSeqNum() >= newLocationSeqNum);
1524         boolean isStaleUpdate = (isStaleMetaRecord || isStaleRedirect);
1525         updateCache = (!isStaleUpdate);
1526       }
1527       if (updateCache) {
1528         tableLocations.replace(startKey, oldLocation, location);
1529         cachedServers.add(location.getServerName());
1530       }
1531     }
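
         // A worked example of the rules above (the seqNums are illustrative): a
         // cached entry with seqNum 7 beats a meta record reporting seqNum 5
         // (stale meta record); a redirect carrying seqNum 7 against a cached 7
         // is ignored (close then reopen with the same seqNum); a redirect coming
         // from the cached server itself is always accepted, whatever the seqNums.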
1532 
1533     // Map keyed by service name + regionserver to service stub implementation
1534     private final ConcurrentHashMap<String, Object> stubs =
1535       new ConcurrentHashMap<String, Object>();
1536     // Map of locks used when creating service stubs per regionserver.
1537     private final ConcurrentHashMap<String, String> connectionLock =
1538       new ConcurrentHashMap<String, String>();
1539 
1540     /**
1541      * State of the MasterService connection/setup.
1542      */
1543     static class MasterServiceState {
1544       HConnection connection;
1545       MasterService.BlockingInterface stub;
1546       int userCount;
1547       long keepAliveUntil = Long.MAX_VALUE;
1548 
1549       MasterServiceState (final HConnection connection) {
1550         super();
1551         this.connection = connection;
1552       }
1553 
1554       @Override
1555       public String toString() {
1556         return "MasterService";
1557       }
1558 
1559       Object getStub() {
1560         return this.stub;
1561       }
1562 
1563       void clearStub() {
1564         this.stub = null;
1565       }
1566 
1567       boolean isMasterRunning() throws ServiceException {
1568         IsMasterRunningResponse response =
1569           this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1570         return response != null ? response.getIsMasterRunning() : false;
1571       }
1572     }
1573 
1574     /**
1575      * Makes a client-side stub for master services. Sub-class to specialize.
1576      * Depends on the hosting class, so it is not static. Exists so we avoid duplicating code
1577      * when setting up the MasterMonitorService and MasterAdminService.
1578      */
1579     abstract class StubMaker {
1580       /**
1581        * Returns the name of the service stub being created.
1582        */
1583       protected abstract String getServiceName();
1584 
1585       /**
1586        * Make the stub and cache it internally so it can be used later for the isMasterRunning call.
1587        * @param channel
1588        */
1589       protected abstract Object makeStub(final BlockingRpcChannel channel);
1590 
1591       /**
1592        * Once setup, check it works by doing isMasterRunning check.
1593        * @throws ServiceException
1594        */
1595       protected abstract void isMasterRunning() throws ServiceException;
1596 
1597       /**
1598        * Create a stub. Try once only.  It is not typed because there is no common type across
1599        * protobuf services or their interfaces.  Let the caller do appropriate casting.
1600        * @return A stub for master services.
1601        * @throws IOException
1602        * @throws KeeperException
1603        * @throws ServiceException
1604        */
1605       private Object makeStubNoRetries() throws IOException, KeeperException, ServiceException {
1606         ZooKeeperKeepAliveConnection zkw;
1607         try {
1608           zkw = getKeepAliveZooKeeperWatcher();
1609         } catch (IOException e) {
1610           ExceptionUtil.rethrowIfInterrupt(e);
1611           throw new ZooKeeperConnectionException("Can't connect to ZooKeeper", e);
1612         }
1613         try {
1614           checkIfBaseNodeAvailable(zkw);
1615           ServerName sn = MasterAddressTracker.getMasterAddress(zkw);
1616           if (sn == null) {
1617             String msg = "ZooKeeper available but no active master location found";
1618             LOG.info(msg);
1619             throw new MasterNotRunningException(msg);
1620           }
1621           if (isDeadServer(sn)) {
1622             throw new MasterNotRunningException(sn + " is dead.");
1623           }
1624           // Use the security info interface name as our stub key
1625           String key = getStubKey(getServiceName(), sn.getHostAndPort());
1626           connectionLock.putIfAbsent(key, key);
1627           Object stub = null;
1628           synchronized (connectionLock.get(key)) {
1629             stub = stubs.get(key);
1630             if (stub == null) {
1631               BlockingRpcChannel channel = rpcClient.createBlockingRpcChannel(sn,
1632                 user, rpcTimeout);
1633               stub = makeStub(channel);
1634               isMasterRunning();
1635               stubs.put(key, stub);
1636             }
1637           }
1638           return stub;
1639         } finally {
1640           zkw.close();
1641         }
1642       }
1643 
1644       /**
1645        * Create a stub against the master.  Retry if necessary.
1646        * @return A stub to do <code>intf</code> against the master
1647        * @throws MasterNotRunningException
1648        */
1649       @edu.umd.cs.findbugs.annotations.SuppressWarnings (value="SWL_SLEEP_WITH_LOCK_HELD")
1650       Object makeStub() throws MasterNotRunningException {
1651         // The lock must be at the beginning to prevent multiple master creations
1652         //  (and leaks) in a multithread context
1653         synchronized (masterAndZKLock) {
1654           Exception exceptionCaught = null;
1655           Object stub = null;
1656           int tries = 0;
1657           while (!closed && stub == null) {
1658             tries++;
1659             try {
1660               stub = makeStubNoRetries();
1661             } catch (IOException e) {
1662               exceptionCaught = e;
1663             } catch (KeeperException e) {
1664               exceptionCaught = e;
1665             } catch (ServiceException e) {
1666               exceptionCaught = e;
1667             }
1668 
1669             if (exceptionCaught != null)
1670               // It failed. If it's not the last try, we're going to wait a little
1671               if (tries < numTries && !ExceptionUtil.isInterrupt(exceptionCaught)) {
1672                 // tries at this point is 1 or more; decrement to start from 0.
1673                 long pauseTime = ConnectionUtils.getPauseTime(pause, tries - 1);
1674                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1675                     " failed; retrying after sleep of " + pauseTime + ", exception=" +
1676                   exceptionCaught);
1677 
1678                 try {
1679                   Thread.sleep(pauseTime);
1680                 } catch (InterruptedException e) {
1681                   throw new MasterNotRunningException(
1682                       "Thread was interrupted while trying to connect to master.", e);
1683                 }
1684               } else {
1685                 // Enough tries, we stop now
1686                 LOG.info("getMaster attempt " + tries + " of " + numTries +
1687                     " failed; no more retrying.", exceptionCaught);
1688                 throw new MasterNotRunningException(exceptionCaught);
1689               }
1690           }
1691 
1692           if (stub == null) {
1693             // implies this.closed true
1694             throw new MasterNotRunningException("Connection was closed while trying to get master");
1695           }
1696           return stub;
1697         }
1698       }
1699     }
1700 
1701     /**
1702      * Class to make a MasterServiceStubMaker stub.
1703      */
1704     class MasterServiceStubMaker extends StubMaker {
1705       private MasterService.BlockingInterface stub;
1706       @Override
1707       protected String getServiceName() {
1708         return MasterService.getDescriptor().getName();
1709       }
1710 
1711       @Override
1712       @edu.umd.cs.findbugs.annotations.SuppressWarnings("SWL_SLEEP_WITH_LOCK_HELD")
1713       MasterService.BlockingInterface makeStub() throws MasterNotRunningException {
1714         return (MasterService.BlockingInterface)super.makeStub();
1715       }
1716 
1717       @Override
1718       protected Object makeStub(BlockingRpcChannel channel) {
1719         this.stub = MasterService.newBlockingStub(channel);
1720         return this.stub;
1721       }
1722 
1723       @Override
1724       protected void isMasterRunning() throws ServiceException {
1725         this.stub.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
1726       }
1727     }
1728 
1729     @Override
1730     public AdminService.BlockingInterface getAdmin(final ServerName serverName)
1731         throws IOException {
1732       return getAdmin(serverName, false);
1733     }
1734 
1735     @Override
1736     // Nothing is done w/ the 'master' parameter.  It is ignored.
1737     public AdminService.BlockingInterface getAdmin(final ServerName serverName,
1738       final boolean master)
1739     throws IOException {
1740       if (isDeadServer(serverName)) {
1741         throw new RegionServerStoppedException(serverName + " is dead.");
1742       }
1743       String key = getStubKey(AdminService.BlockingInterface.class.getName(),
1744         serverName.getHostAndPort());
1745       this.connectionLock.putIfAbsent(key, key);
1746       AdminService.BlockingInterface stub = null;
1747       synchronized (this.connectionLock.get(key)) {
1748         stub = (AdminService.BlockingInterface)this.stubs.get(key);
1749         if (stub == null) {
1750           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(serverName,
1751             user, this.rpcTimeout);
1752           stub = AdminService.newBlockingStub(channel);
1753           this.stubs.put(key, stub);
1754         }
1755       }
1756       return stub;
1757     }
1758 
1759     @Override
1760     public ClientService.BlockingInterface getClient(final ServerName sn)
1761     throws IOException {
1762       if (isDeadServer(sn)) {
1763         throw new RegionServerStoppedException(sn + " is dead.");
1764       }
1765       String key = getStubKey(ClientService.BlockingInterface.class.getName(), sn.getHostAndPort());
1766       this.connectionLock.putIfAbsent(key, key);
1767       ClientService.BlockingInterface stub = null;
1768       synchronized (this.connectionLock.get(key)) {
1769         stub = (ClientService.BlockingInterface)this.stubs.get(key);
1770         if (stub == null) {
1771           BlockingRpcChannel channel = this.rpcClient.createBlockingRpcChannel(sn,
1772             user, this.rpcTimeout);
1773           stub = ClientService.newBlockingStub(channel);
1774           // In old days, after getting stub/proxy, we'd make a call.  We are not doing that here.
1775           // Just fail on first actual call rather than in here on setup.
1776           this.stubs.put(key, stub);
1777         }
1778       }
1779       return stub;
1780     }
1781 
1782     static String getStubKey(final String serviceName, final String rsHostnamePort) {
1783       return serviceName + "@" + rsHostnamePort;
1784     }
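
         // e.g. getStubKey(ClientService.BlockingInterface.class.getName(),
         // "rs1.example.com:60020") yields "<class name>@rs1.example.com:60020";
         // the hostname and port here are illustrative.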
1785 
1786     private ZooKeeperKeepAliveConnection keepAliveZookeeper;
1787     private AtomicInteger keepAliveZookeeperUserCount = new AtomicInteger(0);
1788     private boolean canCloseZKW = true;
1789 
1790     // keepAlive time, in ms. No reason to make it configurable.
1791     private static final long keepAlive = 5 * 60 * 1000;
1792 
1793     /**
1794      * Retrieve a shared ZooKeeperWatcher. You must close it once you have finished with it.
1795      * @return The shared instance. Never returns null.
1796      */
1797     ZooKeeperKeepAliveConnection getKeepAliveZooKeeperWatcher()
1798       throws IOException {
1799       synchronized (masterAndZKLock) {
1800         if (keepAliveZookeeper == null) {
1801           if (this.closed) {
1802             throw new IOException(toString() + " closed");
1803           }
1804           // We don't check that our link to ZooKeeper is still valid,
1805           // but there is a retry mechanism in the ZooKeeperWatcher itself.
1806           keepAliveZookeeper = new ZooKeeperKeepAliveConnection(conf, this.toString(), this);
1807         }
1808         keepAliveZookeeperUserCount.incrementAndGet();
1809         keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1810         return keepAliveZookeeper;
1811       }
1812     }
1813 
1814     void releaseZooKeeperWatcher(final ZooKeeperWatcher zkw) {
1815       if (zkw == null) {
1816         return;
1817       }
1818       synchronized (masterAndZKLock) {
1819         if (keepAliveZookeeperUserCount.decrementAndGet() <= 0) {
1820           keepZooKeeperWatcherAliveUntil = System.currentTimeMillis() + keepAlive;
1821         }
1822       }
1823     }
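
         // The pair above is meant to be used bracket-style, as in
         // makeStubNoRetries() (a sketch):
         //   ZooKeeperKeepAliveConnection zkw = getKeepAliveZooKeeperWatcher();
         //   try {
         //     ... use zkw ...
         //   } finally {
         //     zkw.close();   // drops the user count; the watcher itself is only
         //   }                // really closed later, by DelayedClosing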
1824 
1825     /**
1826      * Creates a Chore thread to check the connections to master & zookeeper
1827      *  and close them when they reach their closing time (
1828      *  {@link MasterServiceState#keepAliveUntil} and
1829      *  {@link #keepZooKeeperWatcherAliveUntil}). Keep alive time is
1830      *  managed by the release functions and the variable {@link #keepAlive}
1831      */
1832     private static class DelayedClosing extends Chore implements Stoppable {
1833       private HConnectionImplementation hci;
1834       Stoppable stoppable;
1835 
1836       private DelayedClosing(
1837         HConnectionImplementation hci, Stoppable stoppable){
1838         super(
1839           "ZooKeeperWatcher and Master delayed closing for connection "+hci,
1840           60*1000, // We check every minute
1841           stoppable);
1842         this.hci = hci;
1843         this.stoppable = stoppable;
1844       }
1845 
1846       static DelayedClosing createAndStart(HConnectionImplementation hci){
1847         Stoppable stoppable = new Stoppable() {
1848               private volatile boolean isStopped = false;
1849               @Override public void stop(String why) { isStopped = true;}
1850               @Override public boolean isStopped() {return isStopped;}
1851             };
1852 
1853         return new DelayedClosing(hci, stoppable);
1854       }
1855 
1856       protected void closeMasterProtocol(MasterServiceState protocolState) {
1857         if (System.currentTimeMillis() > protocolState.keepAliveUntil) {
1858           hci.closeMasterService(protocolState);
1859           protocolState.keepAliveUntil = Long.MAX_VALUE;
1860         }
1861       }
1862 
1863       @Override
1864       protected void chore() {
1865         synchronized (hci.masterAndZKLock) {
1866           if (hci.canCloseZKW) {
1867             if (System.currentTimeMillis() >
1868               hci.keepZooKeeperWatcherAliveUntil) {
1869 
1870               hci.closeZooKeeperWatcher();
1871               hci.keepZooKeeperWatcherAliveUntil = Long.MAX_VALUE;
1872             }
1873           }
1874           closeMasterProtocol(hci.masterServiceState);
1876         }
1877       }
1878 
1879       @Override
1880       public void stop(String why) {
1881         stoppable.stop(why);
1882       }
1883 
1884       @Override
1885       public boolean isStopped() {
1886         return stoppable.isStopped();
1887       }
1888     }
1889 
1890     private void closeZooKeeperWatcher() {
1891       synchronized (masterAndZKLock) {
1892         if (keepAliveZookeeper != null) {
1893           LOG.info("Closing zookeeper sessionid=0x" +
1894             Long.toHexString(
1895               keepAliveZookeeper.getRecoverableZooKeeper().getSessionId()));
1896           keepAliveZookeeper.internalClose();
1897           keepAliveZookeeper = null;
1898         }
1899         keepAliveZookeeperUserCount.set(0);
1900       }
1901     }
1902 
1903     final MasterServiceState masterServiceState = new MasterServiceState(this);
1904 
1905     @Override
1906     public MasterService.BlockingInterface getMaster() throws MasterNotRunningException {
1907       return getKeepAliveMasterService();
1908     }
1909 
1910     private void resetMasterServiceState(final MasterServiceState mss) {
1911       mss.userCount++;
1912       mss.keepAliveUntil = Long.MAX_VALUE;
1913     }
1914 
1915     @Override
1916     public MasterKeepAliveConnection getKeepAliveMasterService()
1917     throws MasterNotRunningException {
1918       synchronized (masterAndZKLock) {
1919         if (!isKeepAliveMasterConnectedAndRunning(this.masterServiceState)) {
1920           MasterServiceStubMaker stubMaker = new MasterServiceStubMaker();
1921           this.masterServiceState.stub = stubMaker.makeStub();
1922         }
1923         resetMasterServiceState(this.masterServiceState);
1924       }
1925       // Ugly delegation just so we can add in a Close method.
1926       final MasterService.BlockingInterface stub = this.masterServiceState.stub;
1927       return new MasterKeepAliveConnection() {
1928         MasterServiceState mss = masterServiceState;
1929         @Override
1930         public AddColumnResponse addColumn(RpcController controller, AddColumnRequest request)
1931         throws ServiceException {
1932           return stub.addColumn(controller, request);
1933         }
1934 
1935         @Override
1936         public DeleteColumnResponse deleteColumn(RpcController controller,
1937             DeleteColumnRequest request)
1938         throws ServiceException {
1939           return stub.deleteColumn(controller, request);
1940         }
1941 
1942         @Override
1943         public ModifyColumnResponse modifyColumn(RpcController controller,
1944             ModifyColumnRequest request)
1945         throws ServiceException {
1946           return stub.modifyColumn(controller, request);
1947         }
1948 
1949         @Override
1950         public MoveRegionResponse moveRegion(RpcController controller,
1951             MoveRegionRequest request) throws ServiceException {
1952           return stub.moveRegion(controller, request);
1953         }
1954 
1955         @Override
1956         public DispatchMergingRegionsResponse dispatchMergingRegions(
1957             RpcController controller, DispatchMergingRegionsRequest request)
1958             throws ServiceException {
1959           return stub.dispatchMergingRegions(controller, request);
1960         }
1961 
1962         @Override
1963         public AssignRegionResponse assignRegion(RpcController controller,
1964             AssignRegionRequest request) throws ServiceException {
1965           return stub.assignRegion(controller, request);
1966         }
1967 
1968         @Override
1969         public UnassignRegionResponse unassignRegion(RpcController controller,
1970             UnassignRegionRequest request) throws ServiceException {
1971           return stub.unassignRegion(controller, request);
1972         }
1973 
1974         @Override
1975         public OfflineRegionResponse offlineRegion(RpcController controller,
1976             OfflineRegionRequest request) throws ServiceException {
1977           return stub.offlineRegion(controller, request);
1978         }
1979 
1980         @Override
1981         public DeleteTableResponse deleteTable(RpcController controller,
1982             DeleteTableRequest request) throws ServiceException {
1983           return stub.deleteTable(controller, request);
1984         }
1985 
1986         @Override
1987         public EnableTableResponse enableTable(RpcController controller,
1988             EnableTableRequest request) throws ServiceException {
1989           return stub.enableTable(controller, request);
1990         }
1991 
1992         @Override
1993         public DisableTableResponse disableTable(RpcController controller,
1994             DisableTableRequest request) throws ServiceException {
1995           return stub.disableTable(controller, request);
1996         }
1997 
1998         @Override
1999         public ModifyTableResponse modifyTable(RpcController controller,
2000             ModifyTableRequest request) throws ServiceException {
2001           return stub.modifyTable(controller, request);
2002         }
2003 
2004         @Override
2005         public CreateTableResponse createTable(RpcController controller,
2006             CreateTableRequest request) throws ServiceException {
2007           return stub.createTable(controller, request);
2008         }
2009 
2010         @Override
2011         public ShutdownResponse shutdown(RpcController controller,
2012             ShutdownRequest request) throws ServiceException {
2013           return stub.shutdown(controller, request);
2014         }
2015 
2016         @Override
2017         public StopMasterResponse stopMaster(RpcController controller,
2018             StopMasterRequest request) throws ServiceException {
2019           return stub.stopMaster(controller, request);
2020         }
2021 
2022         @Override
2023         public BalanceResponse balance(RpcController controller,
2024             BalanceRequest request) throws ServiceException {
2025           return stub.balance(controller, request);
2026         }
2027 
2028         @Override
2029         public SetBalancerRunningResponse setBalancerRunning(
2030             RpcController controller, SetBalancerRunningRequest request)
2031             throws ServiceException {
2032           return stub.setBalancerRunning(controller, request);
2033         }
2034 
2035         @Override
2036         public RunCatalogScanResponse runCatalogScan(RpcController controller,
2037             RunCatalogScanRequest request) throws ServiceException {
2038           return stub.runCatalogScan(controller, request);
2039         }
2040 
2041         @Override
2042         public EnableCatalogJanitorResponse enableCatalogJanitor(
2043             RpcController controller, EnableCatalogJanitorRequest request)
2044             throws ServiceException {
2045           return stub.enableCatalogJanitor(controller, request);
2046         }
2047 
2048         @Override
2049         public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(
2050             RpcController controller, IsCatalogJanitorEnabledRequest request)
2051             throws ServiceException {
2052           return stub.isCatalogJanitorEnabled(controller, request);
2053         }
2054 
2055         @Override
2056         public CoprocessorServiceResponse execMasterService(
2057             RpcController controller, CoprocessorServiceRequest request)
2058             throws ServiceException {
2059           return stub.execMasterService(controller, request);
2060         }
2061 
2062         @Override
2063         public SnapshotResponse snapshot(RpcController controller,
2064             SnapshotRequest request) throws ServiceException {
2065           return stub.snapshot(controller, request);
2066         }
2067 
2068         @Override
2069         public GetCompletedSnapshotsResponse getCompletedSnapshots(
2070             RpcController controller, GetCompletedSnapshotsRequest request)
2071             throws ServiceException {
2072           return stub.getCompletedSnapshots(controller, request);
2073         }
2074 
2075         @Override
2076         public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2077             DeleteSnapshotRequest request) throws ServiceException {
2078           return stub.deleteSnapshot(controller, request);
2079         }
2080 
2081         @Override
2082         public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2083             IsSnapshotDoneRequest request) throws ServiceException {
2084           return stub.isSnapshotDone(controller, request);
2085         }
2086 
2087         @Override
2088         public RestoreSnapshotResponse restoreSnapshot(
2089             RpcController controller, RestoreSnapshotRequest request)
2090             throws ServiceException {
2091           return stub.restoreSnapshot(controller, request);
2092         }
2093 
2094         @Override
2095         public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(
2096             RpcController controller, IsRestoreSnapshotDoneRequest request)
2097             throws ServiceException {
2098           return stub.isRestoreSnapshotDone(controller, request);
2099         }
2100 
2101         @Override
2102         public ExecProcedureResponse execProcedure(
2103             RpcController controller, ExecProcedureRequest request)
2104             throws ServiceException {
2105           return stub.execProcedure(controller, request);
2106         }
2107 
2108         @Override
2109         public IsProcedureDoneResponse isProcedureDone(RpcController controller,
2110             IsProcedureDoneRequest request) throws ServiceException {
2111           return stub.isProcedureDone(controller, request);
2112         }
2113 
2114         @Override
2115         public IsMasterRunningResponse isMasterRunning(
2116             RpcController controller, IsMasterRunningRequest request)
2117             throws ServiceException {
2118           return stub.isMasterRunning(controller, request);
2119         }
2120 
2121         @Override
2122         public ModifyNamespaceResponse modifyNamespace(RpcController controller,
2123             ModifyNamespaceRequest request)
2124         throws ServiceException {
2125           return stub.modifyNamespace(controller, request);
2126         }
2127 
2128         @Override
2129         public CreateNamespaceResponse createNamespace(RpcController controller, CreateNamespaceRequest request) throws ServiceException {
2130           return stub.createNamespace(controller, request);
2131         }
2132 
2133         @Override
2134         public DeleteNamespaceResponse deleteNamespace(RpcController controller, DeleteNamespaceRequest request) throws ServiceException {
2135           return stub.deleteNamespace(controller, request);
2136         }
2137 
2138         @Override
2139         public GetNamespaceDescriptorResponse getNamespaceDescriptor(RpcController controller, GetNamespaceDescriptorRequest request) throws ServiceException {
2140           return stub.getNamespaceDescriptor(controller, request);
2141         }
2142 
2143         @Override
2144         public ListNamespaceDescriptorsResponse listNamespaceDescriptors(RpcController controller, ListNamespaceDescriptorsRequest request) throws ServiceException {
2145           return stub.listNamespaceDescriptors(controller, request);
2146         }
2147 
2148         @Override
2149         public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(RpcController controller, ListTableDescriptorsByNamespaceRequest request) throws ServiceException {
2150           return stub.listTableDescriptorsByNamespace(controller, request);
2151         }
2152 
2153         @Override
2154         public ListTableNamesByNamespaceResponse listTableNamesByNamespace(RpcController controller,
2155               ListTableNamesByNamespaceRequest request) throws ServiceException {
2156           return stub.listTableNamesByNamespace(controller, request);
2157         }
2158 
2159         @Override
2160         public void close() {
2161           release(this.mss);
2162         }
2163 
2164         @Override
2165         public GetSchemaAlterStatusResponse getSchemaAlterStatus(
2166             RpcController controller, GetSchemaAlterStatusRequest request)
2167             throws ServiceException {
2168           return stub.getSchemaAlterStatus(controller, request);
2169         }
2170 
2171         @Override
2172         public GetTableDescriptorsResponse getTableDescriptors(
2173             RpcController controller, GetTableDescriptorsRequest request)
2174             throws ServiceException {
2175           return stub.getTableDescriptors(controller, request);
2176         }
2177 
2178         @Override
2179         public GetTableNamesResponse getTableNames(
2180             RpcController controller, GetTableNamesRequest request)
2181             throws ServiceException {
2182           return stub.getTableNames(controller, request);
2183         }
2184 
2185         @Override
2186         public GetClusterStatusResponse getClusterStatus(
2187             RpcController controller, GetClusterStatusRequest request)
2188             throws ServiceException {
2189           return stub.getClusterStatus(controller, request);
2190         }
2191 
2192         @Override
2193         public TruncateTableResponse truncateTable(RpcController controller,
2194             TruncateTableRequest request) throws ServiceException {
2195           return stub.truncateTable(controller, request);
2196         }
2197       };
2198     }
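
         // Callers are expected to bracket use of the keep-alive master service
         // (a sketch):
         //   MasterKeepAliveConnection master = getKeepAliveMasterService();
         //   try {
         //     master.isMasterRunning(null, RequestConverter.buildIsMasterRunningRequest());
         //   } finally {
         //     master.close();   // releases the shared stub via releaseMaster(...)
         //   }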
2199 
2200 
2201     private static void release(MasterServiceState mss) {
2202       if (mss != null && mss.connection != null) {
2203         ((HConnectionImplementation)mss.connection).releaseMaster(mss);
2204       }
2205     }
2206 
2207     private boolean isKeepAliveMasterConnectedAndRunning(MasterServiceState mss) {
2208       if (mss.getStub() == null){
2209         return false;
2210       }
2211       try {
2212         return mss.isMasterRunning();
2213       } catch (UndeclaredThrowableException e) {
2214         // It's somewhat messy, but we can receive exceptions such as
2215         //  java.net.ConnectException even though they're not declared, so we catch them here.
2216         LOG.info("Master connection is not running anymore", e.getUndeclaredThrowable());
2217         return false;
2218       } catch (ServiceException se) {
2219         LOG.warn("Checking master connection", se);
2220         return false;
2221       }
2222     }
2223 
2224     void releaseMaster(MasterServiceState mss) {
2225       if (mss.getStub() == null) return;
2226       synchronized (masterAndZKLock) {
2227         --mss.userCount;
2228         if (mss.userCount <= 0) {
2229           mss.keepAliveUntil = System.currentTimeMillis() + keepAlive;
2230         }
2231       }
2232     }
2233 
2234     private void closeMasterService(MasterServiceState mss) {
2235       if (mss.getStub() != null) {
2236         LOG.info("Closing master protocol: " + mss);
2237         mss.clearStub();
2238       }
2239       mss.userCount = 0;
2240     }
2241 
2242     /**
2243      * Immediate close of the shared master. Can be by the delayed close or when closing the
2244      * connection itself.
2245      */
2246     private void closeMaster() {
2247       synchronized (masterAndZKLock) {
2248         closeMasterService(masterServiceState);
2249       }
2250     }
2251 
2252     void updateCachedLocation(HRegionInfo hri, HRegionLocation source,
2253                               ServerName serverName, long seqNum) {
2254       HRegionLocation newHrl = new HRegionLocation(hri, serverName, seqNum);
2255       cacheLocation(hri.getTable(), source, newHrl);
2256     }
2257 
2258    /**
2259     * Deletes the cached location of the region if necessary, based on some error from source.
2260     * @param hri The region in question.
2261     * @param source The source of the error that prompts us to invalidate cache.
2262     */
2263    void deleteCachedLocation(HRegionInfo hri, HRegionLocation source) {
2264      ConcurrentMap<byte[], HRegionLocation> tableLocations = getTableLocations(hri.getTable());
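          // Two-argument remove: the entry is evicted only if it still maps to
          // the exact location the error came from; a fresher location that
          // another thread cached meanwhile is left alone.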
2265      tableLocations.remove(hri.getStartKey(), source);
2266    }
2267 
2268     @Override
2269     public void deleteCachedRegionLocation(final HRegionLocation location) {
2270       if (location == null) {
2271         return;
2272       }
2273 
2274       HRegionLocation removedLocation;
2275       TableName tableName = location.getRegionInfo().getTable();
2276       Map<byte[], HRegionLocation> tableLocations = getTableLocations(tableName);
2277       removedLocation = tableLocations.remove(location.getRegionInfo().getStartKey());
2278       if (LOG.isDebugEnabled() && removedLocation != null) {
2279         LOG.debug("Removed " +
2280             location.getRegionInfo().getRegionNameAsString() +
2281             " for tableName=" + tableName +
2282             " from cache");
2283       }
2284     }
2285 
2286     /**
2287      * Update the location with the new value (if the exception is a RegionMovedException)
2288      * or delete it from the cache. Does nothing if we can be sure from the exception that
2289      * the location is still accurate, or if the cache has already been updated.
2290      * @param exception an object (to simplify user code) on which we will try to find a nested
2291      *                  or wrapped or both RegionMovedException
2292      * @param source server that is the source of the location update.
2293      */
2294     @Override
2295     public void updateCachedLocations(final TableName tableName, byte[] rowkey,
2296       final Object exception, final HRegionLocation source) {
2297       if (rowkey == null || tableName == null) {
2298         LOG.warn("Coding error, see method javadoc. row=" + (rowkey == null ? "null" : rowkey) +
2299             ", tableName=" + (tableName == null ? "null" : tableName));
2300         return;
2301       }
2302 
2303       if (source == null || source.getServerName() == null){
2304         // This should not happen, but let's guard against it anyway.
2305         return;
2306       }
2307 
2308       // Is it something we have already updated?
2309       final HRegionLocation oldLocation = getCachedLocation(tableName, rowkey);
2310       if (oldLocation == null || !source.getServerName().equals(oldLocation.getServerName())) {
2311         // There is no such location in the cache (it's been removed already) or
2312         // the cache has already been refreshed with a different location.  => nothing to do
2313         return;
2314       }
2315 
2316       HRegionInfo regionInfo = oldLocation.getRegionInfo();
2317       Throwable cause = findException(exception);
2318       if (cause != null) {
2319         if (cause instanceof RegionTooBusyException || cause instanceof RegionOpeningException) {
2320           // We know that the region is still on this region server
2321           return;
2322         }
2323 
2324         if (cause instanceof RegionMovedException) {
2325           RegionMovedException rme = (RegionMovedException) cause;
2326           if (LOG.isTraceEnabled()) {
2327             LOG.trace("Region " + regionInfo.getRegionNameAsString() + " moved to " +
2328                 rme.getHostname() + ":" + rme.getPort() +
2329                 " according to " + source.getHostnamePort());
2330           }
2331           // We know that the region is not anymore on this region server, but we know
2332           //  the new location.
2333           updateCachedLocation(
2334               regionInfo, source, rme.getServerName(), rme.getLocationSeqNum());
2335           return;
2336         }
2337       }
2338 
2339       // If we're here, it means that we cannot be sure about the location, so we remove it from
2340       //  the cache.
2341       deleteCachedLocation(regionInfo, source);
2342     }
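
         // For illustration, a caller that just saw an RPC fail against a cached
         // location would typically do (a sketch; names are illustrative):
         //   } catch (IOException ioe) {
         //     updateCachedLocations(tableName, row, ioe, location);
         //     // a nested RegionMovedException updates the entry in place;
         //     // any unexplained error drops the stale entry instead.
         //   }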
2343 
2344     @Override
2345     public void updateCachedLocations(final byte[] tableName, byte[] rowkey,
2346       final Object exception, final HRegionLocation source) {
2347       updateCachedLocations(TableName.valueOf(tableName), rowkey, exception, source);
2348     }
2349 
2350     @Override
2351     @Deprecated
2352     public void processBatch(List<? extends Row> list,
2353         final TableName tableName,
2354         ExecutorService pool,
2355         Object[] results) throws IOException, InterruptedException {
2356       // This belongs in HTable!!! Not in here.  St.Ack
2357 
2358       // results must be the same size as list
2359       if (results.length != list.size()) {
2360         throw new IllegalArgumentException(
2361           "argument results must be the same size as argument list");
2362       }
2363       processBatchCallback(list, tableName, pool, results, null);
2364     }
2365 
2366     @Override
2367     @Deprecated
2368     public void processBatch(List<? extends Row> list,
2369         final byte[] tableName,
2370         ExecutorService pool,
2371         Object[] results) throws IOException, InterruptedException {
2372       processBatch(list, TableName.valueOf(tableName), pool, results);
2373     }
2374 
2375     /**
2376      * Send the queries in parallel on the different region servers. Retries on failures.
2377      * If the method returns normally, there was no error and the 'results' array
2378      * contains no exceptions. On error, an exception is thrown and the 'results'
2379      * array contains a mix of results and exceptions.
2380      * @deprecated since 0.96 - Use {@link HTable#processBatchCallback} instead
2381      */
2382     @Override
2383     @Deprecated
2384     public <R> void processBatchCallback(
2385       List<? extends Row> list,
2386       TableName tableName,
2387       ExecutorService pool,
2388       Object[] results,
2389       Batch.Callback<R> callback)
2390       throws IOException, InterruptedException {
2391 
2392       // To fulfill the original contract, we have a special callback. This callback
2393       //  will set the results in the Object array.
2394       ObjectResultFiller<R> cb = new ObjectResultFiller<R>(results, callback);
2395       AsyncProcess<?> asyncProcess = createAsyncProcess(tableName, pool, cb, conf);
2396 
2397       // We're doing a submit all. This way, the originalIndex will match the initial list.
2398       asyncProcess.submitAll(list);
2399       asyncProcess.waitUntilDone();
2400 
2401       if (asyncProcess.hasError()) {
2402         throw asyncProcess.getErrors();
2403       }
2404     }
2405 
2406     @Override
2407     @Deprecated
2408     public <R> void processBatchCallback(
2409       List<? extends Row> list,
2410       byte[] tableName,
2411       ExecutorService pool,
2412       Object[] results,
2413       Batch.Callback<R> callback)
2414       throws IOException, InterruptedException {
2415       processBatchCallback(list, TableName.valueOf(tableName), pool, results, callback);
2416     }
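
         // For illustration, the deprecated batch entry points above are driven
         // like this (a sketch; puts and pool are assumed to exist):
         //   Object[] results = new Object[puts.size()];
         //   connection.processBatch(puts, TableName.valueOf("t1"), pool, results);
         //   // each results[i] holds the operation's result, or the Throwable
         //   // that made it fail.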
2417 
2418     // For tests.
2419     protected <R> AsyncProcess createAsyncProcess(TableName tableName, ExecutorService pool,
2420            AsyncProcess.AsyncProcessCallback<R> callback, Configuration conf) {
2421       return new AsyncProcess<R>(this, tableName, pool, callback, conf,
2422           RpcRetryingCallerFactory.instantiate(conf), RpcControllerFactory.instantiate(conf));
2423     }
2424 
2425 
2426     /**
2427      * Fill the result array for the interfaces using it.
2428      */
2429     private static class ObjectResultFiller<Res>
2430         implements AsyncProcess.AsyncProcessCallback<Res> {
2431 
2432       private final Object[] results;
2433       private Batch.Callback<Res> callback;
2434 
2435       ObjectResultFiller(Object[] results, Batch.Callback<Res> callback) {
2436         this.results = results;
2437         this.callback = callback;
2438       }
2439 
2440       @Override
2441       public void success(int pos, byte[] region, Row row, Res result) {
2442         assert pos < results.length;
2443         results[pos] = result;
2444         if (callback != null) {
2445           callback.update(region, row.getRow(), result);
2446         }
2447       }
2448 
2449       @Override
2450       public boolean failure(int pos, byte[] region, Row row, Throwable t) {
2451         assert pos < results.length;
2452         results[pos] = t;
2453         // Batch.Callback<Res> was not called on failure in 0.94. We keep this behavior.
2454         return true; // we want to have this failure in the failures list.
2455       }
2456 
2457       @Override
2458       public boolean retriableFailure(int originalIndex, Row row, byte[] region,
2459                                       Throwable exception) {
2460         return true; // we retry
2461       }
2462     }
2463 
2464 
2465     /*
2466      * Return the number of cached region for a table. It will only be called
2467      * from a unit test.
2468      */
2469     int getNumberOfCachedRegionLocations(final TableName tableName) {
2470       Map<byte[], HRegionLocation> tableLocs = this.cachedRegionLocations.get(tableName);
2471       if (tableLocs == null) {
2472         return 0;
2473       }
2474       return tableLocs.values().size();
2475     }
2476 
2477     /**
2478      * Check the region cache to see whether a region is cached yet or not.
2479      * Called by unit tests.
2480      * @param tableName tableName
2481      * @param row row
2482      * @return Region cached or not.
2483      */
2484     boolean isRegionCached(TableName tableName, final byte[] row) {
2485       HRegionLocation location = getCachedLocation(tableName, row);
2486       return location != null;
2487     }

    @Override
    public void setRegionCachePrefetch(final TableName tableName,
        final boolean enable) {
      if (!enable) {
        regionCachePrefetchDisabledTables.add(Bytes.mapKey(tableName.getName()));
      } else {
        regionCachePrefetchDisabledTables.remove(Bytes.mapKey(tableName.getName()));
      }
    }

    @Override
    public void setRegionCachePrefetch(final byte[] tableName,
        final boolean enable) {
      setRegionCachePrefetch(TableName.valueOf(tableName), enable);
    }

    @Override
    public boolean getRegionCachePrefetch(TableName tableName) {
      return usePrefetch &&
          !regionCachePrefetchDisabledTables.contains(Bytes.mapKey(tableName.getName()));
    }

    @Override
    public boolean getRegionCachePrefetch(byte[] tableName) {
      return getRegionCachePrefetch(TableName.valueOf(tableName));
    }
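
    // Usage sketch (hypothetical caller code): disabling region-cache prefetch for
    // one table so that only explicitly requested region locations get cached.
    //
    //   HConnection conn = HConnectionManager.createConnection(conf);
    //   conn.setRegionCachePrefetch(TableName.valueOf("big_table"), false);
    //   assert !conn.getRegionCachePrefetch(TableName.valueOf("big_table"));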

    @Override
    public void abort(final String msg, Throwable t) {
      if (t instanceof KeeperException.SessionExpiredException
        && keepAliveZookeeper != null) {
        synchronized (masterAndZKLock) {
          if (keepAliveZookeeper != null) {
            LOG.warn("This client just lost its session with ZooKeeper," +
              " closing it." +
              " It will be recreated next time someone needs it", t);
            closeZooKeeperWatcher();
          }
        }
      } else {
        if (t != null) {
          LOG.fatal(msg, t);
        } else {
          LOG.fatal(msg);
        }
        this.aborted = true;
        close();
        this.closed = true;
      }
    }

    @Override
    public boolean isClosed() {
      return this.closed;
    }

    @Override
    public boolean isAborted() {
      return this.aborted;
    }

    @Override
    public int getCurrentNrHRS() throws IOException {
      return this.registry.getCurrentNrHRS();
    }

    /**
     * Increment this client's reference count.
     */
    void incCount() {
      ++refCount;
    }

    /**
     * Decrement this client's reference count.
     */
    void decCount() {
      if (refCount > 0) {
        --refCount;
      }
    }

    /**
     * Check whether this client has any remaining references.
     *
     * @return true if this client has no reference; false otherwise
     */
    boolean isZeroReference() {
      return refCount == 0;
    }
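
    // Lifecycle sketch (hypothetical, mirroring how HConnectionManager shares
    // managed connections): every consumer bumps the count, and the connection
    // is really torn down only once the count drops back to zero.
    //
    //   connection.incCount();           // an HTable starts using this connection
    //   try {
    //     // ... work against the cluster ...
    //   } finally {
    //     connection.decCount();         // the HTable is done
    //     if (connection.isZeroReference()) {
    //       connection.internalClose();  // nobody left; release resources
    //     }
    //   }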

    void internalClose() {
      if (this.closed) {
        return;
      }
      delayedClosing.stop("Closing connection");
      closeMaster();
      shutdownBatchPool();
      this.closed = true;
      closeZooKeeperWatcher();
      this.stubs.clear();
      if (clusterStatusListener != null) {
        clusterStatusListener.close();
      }
      if (rpcClient != null) {
        rpcClient.stop();
      }
    }

    @Override
    public void close() {
      if (managed) {
        if (aborted) {
          HConnectionManager.deleteStaleConnection(this);
        } else {
          HConnectionManager.deleteConnection(this, false);
        }
      } else {
        internalClose();
      }
    }
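
    // The managed/unmanaged split in close(), sketched from the caller's side
    // (hypothetical code):
    //
    //   HConnection shared = HConnectionManager.getConnection(conf);   // managed
    //   shared.close();  // only drops a reference in the connection pool
    //
    //   HConnection owned = HConnectionManager.createConnection(conf); // unmanaged
    //   owned.close();   // runs internalClose(): ZK, RPC and pools go away now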

    /**
     * Close the connection for good, regardless of what the current value of
     * {@link #refCount} is. Ideally, {@link #refCount} should be zero at this
     * point, which would be the case if all of its consumers close the
     * connection. However, on the off chance that someone is unable to close
     * the connection, perhaps because it bailed out prematurely, the method
     * below will ensure that this {@link HConnection} instance is cleaned up.
     * Caveat: The JVM may take an unknown amount of time to call finalize on an
     * unreachable object, so our hope is that every consumer cleans up after
     * itself, like any good citizen.
     */
    @Override
    protected void finalize() throws Throwable {
      super.finalize();
      // Pretend as if we are about to release the last remaining reference
      refCount = 1;
      close();
    }

    @Override
    public HTableDescriptor[] listTables() throws IOException {
      MasterKeepAliveConnection master = getKeepAliveMasterService();
      try {
        GetTableDescriptorsRequest req =
          RequestConverter.buildGetTableDescriptorsRequest((List<TableName>)null);
        return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
      } catch (ServiceException se) {
        throw ProtobufUtil.getRemoteException(se);
      } finally {
        master.close();
      }
    }

    @Override
    public String[] getTableNames() throws IOException {
      TableName[] tableNames = listTableNames();
      String[] result = new String[tableNames.length];
      for (int i = 0; i < tableNames.length; i++) {
        result[i] = tableNames[i].getNameAsString();
      }
      return result;
    }

    @Override
    public TableName[] listTableNames() throws IOException {
      MasterKeepAliveConnection master = getKeepAliveMasterService();
      try {
        return ProtobufUtil.getTableNameArray(master.getTableNames(null,
            GetTableNamesRequest.newBuilder().build())
          .getTableNamesList());
      } catch (ServiceException se) {
        throw ProtobufUtil.getRemoteException(se);
      } finally {
        master.close();
      }
    }
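
    // Usage sketch (hypothetical): enumerating tables through the connection.
    //
    //   HConnection conn = HConnectionManager.createConnection(conf);
    //   for (TableName tn : conn.listTableNames()) {
    //     System.out.println(tn.getNameAsString());
    //   }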

    @Override
    public HTableDescriptor[] getHTableDescriptorsByTableName(
        List<TableName> tableNames) throws IOException {
      if (tableNames == null || tableNames.isEmpty()) return new HTableDescriptor[0];
      MasterKeepAliveConnection master = getKeepAliveMasterService();
      try {
        GetTableDescriptorsRequest req =
          RequestConverter.buildGetTableDescriptorsRequest(tableNames);
        return ProtobufUtil.getHTableDescriptorArray(master.getTableDescriptors(null, req));
      } catch (ServiceException se) {
        throw ProtobufUtil.getRemoteException(se);
      } finally {
        master.close();
      }
    }

    @Override
    public HTableDescriptor[] getHTableDescriptors(
        List<String> names) throws IOException {
      List<TableName> tableNames = new ArrayList<TableName>(names.size());
      for (String name : names) {
        tableNames.add(TableName.valueOf(name));
      }

      return getHTableDescriptorsByTableName(tableNames);
    }

    @Override
    public NonceGenerator getNonceGenerator() {
      return this.nonceGenerator;
    }

    /**
     * Connects to the master to get the table descriptor.
     * @param tableName table name
     * @return the table descriptor of the given table
     * @throws IOException if the connection to master fails or if the table
     *  is not found.
     */
    @Override
    public HTableDescriptor getHTableDescriptor(final TableName tableName)
    throws IOException {
      if (tableName == null) return null;
      MasterKeepAliveConnection master = getKeepAliveMasterService();
      GetTableDescriptorsResponse htds;
      try {
        GetTableDescriptorsRequest req =
          RequestConverter.buildGetTableDescriptorsRequest(tableName);
        htds = master.getTableDescriptors(null, req);
      } catch (ServiceException se) {
        throw ProtobufUtil.getRemoteException(se);
      } finally {
        master.close();
      }
      if (!htds.getTableSchemaList().isEmpty()) {
        return HTableDescriptor.convert(htds.getTableSchemaList().get(0));
      }
      throw new TableNotFoundException(tableName.getNameAsString());
    }

    @Override
    public HTableDescriptor getHTableDescriptor(final byte[] tableName)
    throws IOException {
      return getHTableDescriptor(TableName.valueOf(tableName));
    }
  }

  /**
   * The record of errors for servers.
   */
  static class ServerErrorTracker {
    // We need a concurrent map here, as we could have multiple threads updating it in parallel.
    private final ConcurrentMap<HRegionLocation, ServerErrors> errorsByServer =
        new ConcurrentHashMap<HRegionLocation, ServerErrors>();
    private final long canRetryUntil;
    private final int maxRetries;
    private final String startTrackingTime;

    public ServerErrorTracker(long timeout, int maxRetries) {
      this.maxRetries = maxRetries;
      this.canRetryUntil = EnvironmentEdgeManager.currentTimeMillis() + timeout;
      this.startTrackingTime = new Date().toString();
    }

    /**
     * We stop retrying only when we have exhausted BOTH the number of retries and the
     * time allocated.
     */
    boolean canRetryMore(int numRetry) {
      // When only a single try is configured, the time budget is ignored.
      return numRetry < maxRetries || (maxRetries > 1 &&
          EnvironmentEdgeManager.currentTimeMillis() < this.canRetryUntil);
    }

    /**
     * Calculates the back-off time for a retrying request to a particular server.
     *
     * @param server    The server in question.
     * @param basePause The base pause (the configured hbase.client.pause).
     * @return The time to wait before sending the next request.
     */
    long calculateBackoffTime(HRegionLocation server, long basePause) {
      long result;
      ServerErrors errorStats = errorsByServer.get(server);
      if (errorStats != null) {
        result = ConnectionUtils.getPauseTime(basePause, errorStats.retries.get());
      } else {
        result = 0; // yes, if the server is not in our list we don't wait before retrying
      }
      return result;
    }
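
    // Illustrative progression, assuming the standard HConstants.RETRY_BACKOFF
    // multipliers {1, 2, 3, 5, ...} and ignoring the small random jitter that
    // ConnectionUtils.getPauseTime adds. With basePause = 100ms:
    //
    //   tracker.reportServerError(server);          // first error (see below)
    //   tracker.calculateBackoffTime(server, 100);  // ~100ms (100 * 1)
    //   tracker.reportServerError(server);          // counter -> 1
    //   tracker.calculateBackoffTime(server, 100);  // ~200ms (100 * 2)
    //   tracker.reportServerError(server);          // counter -> 2
    //   tracker.calculateBackoffTime(server, 100);  // ~300ms (100 * 3)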

    /**
     * Reports that there was an error on the server, so the tracker can do whatever
     * bean-counting is necessary.
     *
     * @param server The server in question.
     */
    void reportServerError(HRegionLocation server) {
      ServerErrors errors = errorsByServer.get(server);
      if (errors != null) {
        errors.addError();
      } else {
        errors = errorsByServer.putIfAbsent(server, new ServerErrors());
        if (errors != null) {
          // Another thread won the race to insert; count the error on its entry.
          errors.addError();
        }
        // If putIfAbsent returned null, our fresh entry went in with a zero counter,
        // so the first error still yields the base pause on the next backoff.
      }
    }

    String getStartTrackingTime() {
      return startTrackingTime;
    }

    /**
     * The record of errors for a server.
     */
    private static class ServerErrors {
      public final AtomicInteger retries = new AtomicInteger(0);

      public void addError() {
        retries.incrementAndGet();
      }
    }
  }
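
  // Integration sketch (hypothetical caller, not part of this file): driving the
  // tracker from a retry loop. "operation" and "server" are placeholders, and the
  // enclosing method is assumed to declare IOException and InterruptedException.
  //
  //   ServerErrorTracker tracker = new ServerErrorTracker(60000, 10);
  //   int tries = 0;
  //   while (true) {
  //     try {
  //       operation.call();
  //       break;
  //     } catch (IOException e) {
  //       tracker.reportServerError(server);
  //       if (!tracker.canRetryMore(++tries)) {
  //         throw e;
  //       }
  //       Thread.sleep(tracker.calculateBackoffTime(server,
  //           HConstants.DEFAULT_HBASE_CLIENT_PAUSE));
  //     }
  //   }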

  /**
   * Look for a known exception inside the remote exception:
   * - hadoop.ipc wrapped exceptions
   * - nested exceptions
   *
   * Looks for: RegionMovedException / RegionOpeningException / RegionTooBusyException
   * @return null if we didn't find the exception, the exception otherwise.
   */
  public static Throwable findException(Object exception) {
    if (!(exception instanceof Throwable)) {
      // also covers null: "null instanceof Throwable" is false
      return null;
    }
    Throwable cur = (Throwable) exception;
    while (cur != null) {
      if (cur instanceof RegionMovedException || cur instanceof RegionOpeningException
          || cur instanceof RegionTooBusyException) {
        return cur;
      }
      if (cur instanceof RemoteException) {
        RemoteException re = (RemoteException) cur;
        cur = re.unwrapRemoteException(
            RegionOpeningException.class, RegionMovedException.class,
            RegionTooBusyException.class);
        if (cur == null) {
          cur = re.unwrapRemoteException();
        }
        // unwrapRemoteException can return the exception given as a parameter when it cannot
        // unwrap it. In this case, there is no need to look further.
        // noinspection ObjectEquality
        if (cur == re) {
          return null;
        }
      } else {
        cur = cur.getCause();
      }
    }

    return null;
  }
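
  // Usage sketch (hypothetical): deciding how a failure should affect the region
  // cache. RegionMovedException carries the new location, so the caller can update
  // the cache instead of forcing a fresh meta lookup.
  //
  //   Throwable cause = findException(ioe);
  //   if (cause instanceof RegionMovedException) {
  //     // update the cached location from the exception and retry there
  //   } else if (cause instanceof RegionTooBusyException) {
  //     // back off without invalidating the cache; the region is just overloaded
  //   }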

  /**
   * Set the number of retries to use serverside when trying to communicate
   * with another server over {@link HConnection}.  Used when updating catalog
   * tables, etc.  Call this method before we create any Connections.
   * @param c The Configuration instance to set the retries into.
   * @param sn The server name, used only to tag the log message.
   * @param log Used to log what we set in here.
   */
  public static void setServerSideHConnectionRetries(final Configuration c, final String sn,
      final Log log) {
    int hcRetries = c.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
      HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
    // Go big.  Multiply by 10.  If we can't get to meta after this many retries
    // then something is seriously wrong.
    int serversideMultiplier = c.getInt("hbase.client.serverside.retries.multiplier", 10);
    int retries = hcRetries * serversideMultiplier;
    c.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
    log.debug(sn + " HConnection server-to-server retries=" + retries);
  }
}
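
// Usage sketch (hypothetical server-side startup code): bump the retry count
// before the first connection is created. With the default multiplier of 10, a
// configured client retry count of N becomes 10 * N for server-to-server RPC.
//
//   Configuration conf = HBaseConfiguration.create();
//   HConnectionManager.setServerSideHConnectionRetries(conf, serverName, LOG);
//   HConnection connection = HConnectionManager.createConnection(conf);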