/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

import javax.management.ObjectName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.ClusterId;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.HealthCheckChore;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.NamespaceNotFoundException;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotDisabledException;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.UnknownRegionException;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaReader;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.exceptions.MergeRegionException;
import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.ExecutorType;
import org.apache.hadoop.hbase.ipc.FifoRpcScheduler;
import org.apache.hadoop.hbase.ipc.RequestContext;
import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
import org.apache.hadoop.hbase.ipc.RpcServerInterface;
import org.apache.hadoop.hbase.ipc.ServerRpcController;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.balancer.BalancerChore;
import org.apache.hadoop.hbase.master.balancer.ClusterStatusChore;
import org.apache.hadoop.hbase.master.balancer.LoadBalancerFactory;
import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
import org.apache.hadoop.hbase.master.cleaner.LogCleaner;
import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
import org.apache.hadoop.hbase.master.handler.DeleteTableHandler;
import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
import org.apache.hadoop.hbase.master.handler.DispatchMergingRegionHandler;
import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
import org.apache.hadoop.hbase.master.handler.ModifyTableHandler;
import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.procedure.MasterProcedureManager;
import org.apache.hadoop.hbase.procedure.MasterProcedureManagerHost;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier.RegionSpecifierType;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.ProcedureDescription;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorResponse;
import org.apache.hadoop.hbase.replication.regionserver.Replication;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.trace.SpanReceiverHost;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CompressionTest;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hbase.util.InfoServer;
import org.apache.hadoop.hbase.util.JvmPauseMonitor;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Sleeper;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.VersionInfo;
import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker;
import org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker;
import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.net.DNS;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.protobuf.Descriptors;
import com.google.protobuf.Message;
import com.google.protobuf.RpcCallback;
import com.google.protobuf.RpcController;
import com.google.protobuf.Service;
import com.google.protobuf.ServiceException;

/**
 * HMaster is the "master server" for HBase. An HBase cluster has one active
 * master.  If many masters are started, all compete.  Whichever wins goes on to
 * run the cluster.  All others park themselves in their constructor until
 * master or cluster shutdown or until the active master loses its lease in
 * zookeeper.  Thereafter, all running masters jostle to take over the master role.
 *
 * <p>The Master can be asked to shut down the cluster. See {@link #shutdown()}.  In
 * this case it will tell all regionservers to go down and then wait on them
 * all reporting in that they are down.  This master will then shut itself down.
 *
 * <p>You can also shut down just this master.  Call {@link #stopMaster()}.
 *
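 * <p>A minimal usage sketch (illustrative only; in a real deployment the
 * master is launched through the {@code HMasterCommandLine} entry point
 * rather than constructed directly):
 * <pre>
 * Configuration conf = HBaseConfiguration.create();
 * HMaster master = new HMaster(conf);  // constructor connects to ZooKeeper
 * master.start();                      // remaining initialization runs in run()
 * // ... later, to stop just this master:
 * master.stopMaster();
 * </pre>
 *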
 * @see Watcher
 */
@InterfaceAudience.Private
@SuppressWarnings("deprecation")
public class HMaster extends HasThread implements MasterProtos.MasterService.BlockingInterface,
RegionServerStatusProtos.RegionServerStatusService.BlockingInterface,
MasterServices, Server {
  private static final Log LOG = LogFactory.getLog(HMaster.class.getName());

  // MASTER is the name of the webapp and the attribute name used when stuffing
  // this instance into the web context.
  public static final String MASTER = "master";

  // The configuration for the Master
  private final Configuration conf;
  // server for the web ui
  private InfoServer infoServer;

  // Our zk client.
  private ZooKeeperWatcher zooKeeper;
  // Manager and zk listener for master election
  private ActiveMasterManager activeMasterManager;
  // Region server tracker
  RegionServerTracker regionServerTracker;
  // Draining region server tracker
  private DrainingServerTracker drainingServerTracker;
  // Tracker for load balancer state
  private LoadBalancerTracker loadBalancerTracker;
  // master address tracker
  private MasterAddressTracker masterAddressTracker;

  // RPC server for the HMaster
  private final RpcServerInterface rpcServer;
  private JvmPauseMonitor pauseMonitor;
  // Set after we've called HBaseServer#openServer and are ready to receive RPCs.
  // Set back to false after we stop rpcServer.  Used by tests.
  private volatile boolean rpcServerOpen = false;

  /** Namespace stuff */
  private TableNamespaceManager tableNamespaceManager;
  private NamespaceJanitor namespaceJanitorChore;

  /**
   * This server's address.
   */
  private final InetSocketAddress isa;

  // Metrics for the HMaster
  private final MetricsMaster metricsMaster;
  // file system manager for the master FS operations
  private MasterFileSystem fileSystemManager;

  // server manager to deal with region server info
  ServerManager serverManager;

  // manager of assignment nodes in zookeeper
  AssignmentManager assignmentManager;
  // manager of catalog regions
  private CatalogTracker catalogTracker;
  // Cluster status zk tracker and local setter
  private ClusterStatusTracker clusterStatusTracker;

  // buffer for "fatal error" notices from region servers
  // in the cluster. This is only used for assisting
  // operations/debugging.
  private MemoryBoundedLogMessageBuffer rsFatals;

  // This flag is for stopping this Master instance.  It's set when we are
  // stopping or aborting.
  private volatile boolean stopped = false;
  // Set on abort -- usually failure of our zk session.
  private volatile boolean abort = false;
  // flag set after we become the active master (used for testing)
  private volatile boolean isActiveMaster = false;

  // flag set after we complete initialization once active,
  // it is not private since it's used in unit tests
  volatile boolean initialized = false;

  // flag set after we complete assignMeta.
  private volatile boolean serverShutdownHandlerEnabled = false;

  // Instance of the hbase executor service.
  ExecutorService executorService;

  private LoadBalancer balancer;
  private Thread balancerChore;
  private Thread clusterStatusChore;
  private ClusterStatusPublisher clusterStatusPublisherChore = null;

  private CatalogJanitor catalogJanitorChore;
  private LogCleaner logCleaner;
  private HFileCleaner hfileCleaner;

  private MasterCoprocessorHost cpHost;
  private final ServerName serverName;

  private TableDescriptors tableDescriptors;

  // Table level lock manager for schema changes
  private TableLockManager tableLockManager;

  // Time stamps for when a hmaster was started and when it became active
  private long masterStartTime;
  private long masterActiveTime;

  /** time interval for emitting metrics values */
  private final int msgInterval;
  /**
   * MX Bean for MasterInfo
   */
  private ObjectName mxBean = null;

  // should we check the compression codec type at master side, default true, HBASE-6370
  private final boolean masterCheckCompression;

  private SpanReceiverHost spanReceiverHost;

  private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();

  // monitor for snapshot of hbase tables
  private SnapshotManager snapshotManager;
  // monitor for distributed procedures
  private MasterProcedureManagerHost mpmHost;

  /** The health check chore. */
  private HealthCheckChore healthCheckChore;

  /**
   * Whether we are in distributedLogReplay mode. When true, SplitLogWorker directly replays
   * WAL edits to newly assigned region servers instead of creating recovered.edits files.
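   * <p>Illustrative configuration (the constant resolves to
   * {@code "hbase.master.distributed.log.replay"}):
   * <pre>
   * conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true);
   * </pre>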
   */
  private final boolean distributedLogReplay;

  /** flag used in test cases in order to simulate RS failures during master initialization */
  private volatile boolean initializationBeforeMetaAssignment = false;

  /** The following is used in master recovery scenario to re-register listeners */
  private List<ZooKeeperListener> registeredZKListenersBeforeRecovery;

  /**
   * Initializes the HMaster. The steps are as follows:
   * <p>
   * <ol>
   * <li>Initialize HMaster RPC and address
   * <li>Connect to ZooKeeper.
   * </ol>
   * <p>
   * Remaining steps of initialization occur in {@link #run()} so that they
   * run in their own thread rather than within the context of the constructor.
   * @throws InterruptedException
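   * <p>A hedged configuration sketch for the keys read by this constructor
   * (values are examples, not defaults):
   * <pre>
   * Configuration conf = HBaseConfiguration.create();
   * conf.setInt(HConstants.MASTER_PORT, 60000);        // "hbase.master.port"
   * conf.set("hbase.master.dns.interface", "default");
   * conf.set("hbase.master.ipc.address", "0.0.0.0");   // optional bind address
   * HMaster master = new HMaster(conf);
   * </pre>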
   */
  public HMaster(final Configuration conf)
  throws IOException, KeeperException, InterruptedException {
    this.conf = new Configuration(conf);
    // Disable the block cache on the master
    this.conf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
    FSUtils.setupShortCircuitRead(conf);
    // Server to handle client requests.
    String hostname = Strings.domainNamePointerToHostName(DNS.getDefaultHost(
      conf.get("hbase.master.dns.interface", "default"),
      conf.get("hbase.master.dns.nameserver", "default")));
    int port = conf.getInt(HConstants.MASTER_PORT, HConstants.DEFAULT_MASTER_PORT);
    // Test that the hostname is resolvable
    InetSocketAddress initialIsa = new InetSocketAddress(hostname, port);
    if (initialIsa.getAddress() == null) {
      throw new IllegalArgumentException("Failed resolve of hostname " + initialIsa);
    }
    // Verify that the bind address is resolvable, if set
    String bindAddress = conf.get("hbase.master.ipc.address");
    if (bindAddress != null) {
      initialIsa = new InetSocketAddress(bindAddress, port);
      if (initialIsa.getAddress() == null) {
        throw new IllegalArgumentException("Failed resolve of bind address " + initialIsa);
      }
    }
    String name = "master/" + initialIsa.toString();
    // Set how many times to retry talking to another server over HConnection.
    HConnectionManager.setServerSideHConnectionRetries(this.conf, name, LOG);
    int numHandlers = conf.getInt(HConstants.MASTER_HANDLER_COUNT,
      conf.getInt(HConstants.REGION_SERVER_HANDLER_COUNT, HConstants.DEFAULT_MASTER_HANLDER_COUNT));
    this.rpcServer = new RpcServer(this, name, getServices(),
      initialIsa, // BindAddress is IP we got for this server.
      conf,
      new FifoRpcScheduler(conf, numHandlers));
    // Set our address.
    this.isa = this.rpcServer.getListenerAddress();
    // We don't want to pass isa's hostname here since it could be 0.0.0.0
    this.serverName = ServerName.valueOf(hostname, this.isa.getPort(), System.currentTimeMillis());
    this.rsFatals = new MemoryBoundedLogMessageBuffer(
      conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));

    // login the zookeeper client principal (if using security)
    ZKUtil.loginClient(this.conf, "hbase.zookeeper.client.keytab.file",
      "hbase.zookeeper.client.kerberos.principal", this.isa.getHostName());

    // initialize server principal (if using secure Hadoop)
    UserProvider provider = UserProvider.instantiate(conf);
    provider.login("hbase.master.keytab.file",
      "hbase.master.kerberos.principal", this.isa.getHostName());

    LOG.info("hbase.rootdir=" + FSUtils.getRootDir(this.conf) +
        ", hbase.cluster.distributed=" + this.conf.getBoolean("hbase.cluster.distributed", false));

    // set the thread name now we have an address
    setName(MASTER + ":" + this.serverName.toShortString());

    Replication.decorateMasterConfiguration(this.conf);

    // Hack! Maps DFSClient => Master for logs.  HDFS made this
    // config param for task trackers, but we can piggyback off of it.
    if (this.conf.get("mapred.task.id") == null) {
      this.conf.set("mapred.task.id", "hb_m_" + this.serverName.toString());
    }

    this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true);
    this.rpcServer.startThreads();
    this.pauseMonitor = new JvmPauseMonitor(conf);
    this.pauseMonitor.start();

    // metrics interval: using the same property as region server.
    this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);

    // should we check the compression codec type at master side, default true, HBASE-6370
    this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);

    this.metricsMaster = new MetricsMaster(new MetricsMasterWrapperImpl(this));

    // Health checker thread.
    int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
      HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
    if (isHealthCheckerConfigured()) {
      healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
    }

    // Do we publish the status?
    boolean shouldPublish = conf.getBoolean(HConstants.STATUS_PUBLISHED,
        HConstants.STATUS_PUBLISHED_DEFAULT);
    Class<? extends ClusterStatusPublisher.Publisher> publisherClass =
        conf.getClass(ClusterStatusPublisher.STATUS_PUBLISHER_CLASS,
            ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS,
            ClusterStatusPublisher.Publisher.class);

    if (shouldPublish) {
      if (publisherClass == null) {
        LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
            ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS +
            " is not set - not publishing status");
      } else {
        clusterStatusPublisherChore = new ClusterStatusPublisher(this, conf, publisherClass);
        Threads.setDaemonThreadRunning(clusterStatusPublisherChore.getThread());
      }
    }

    distributedLogReplay = this.conf.getBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY,
      HConstants.DEFAULT_DISTRIBUTED_LOG_REPLAY_CONFIG);
  }

  /**
   * @return list of blocking services and their security info classes that this server supports
   */
  private List<BlockingServiceAndInterface> getServices() {
    List<BlockingServiceAndInterface> bssi = new ArrayList<BlockingServiceAndInterface>(3);
    bssi.add(new BlockingServiceAndInterface(
        MasterProtos.MasterService.newReflectiveBlockingService(this),
        MasterProtos.MasterService.BlockingInterface.class));
    bssi.add(new BlockingServiceAndInterface(
        RegionServerStatusProtos.RegionServerStatusService.newReflectiveBlockingService(this),
        RegionServerStatusProtos.RegionServerStatusService.BlockingInterface.class));
    return bssi;
  }

  /**
   * Stall startup if we are designated a backup master; i.e. we want someone
   * else to become the master before proceeding.
   * @param c configuration
   * @param amm
   * @throws InterruptedException
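   * <p>Illustrative: a process is designated a backup master by setting the
   * flag below before startup ({@link HConstants#MASTER_TYPE_BACKUP} resolves
   * to {@code "hbase.master.backup"}):
   * <pre>
   * Configuration c = HBaseConfiguration.create();
   * c.setBoolean(HConstants.MASTER_TYPE_BACKUP, true);
   * </pre>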
   */
  private static void stallIfBackupMaster(final Configuration c,
      final ActiveMasterManager amm)
  throws InterruptedException {
    // If we're a backup master, stall until a primary writes its address
    if (!c.getBoolean(HConstants.MASTER_TYPE_BACKUP,
      HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
      return;
    }
    LOG.debug("HMaster started in backup mode.  " +
      "Stalling until master znode is written.");
    // This will only be a minute or so while the cluster starts up,
    // so don't worry about setting watches on the parent znode
    while (!amm.isActiveMaster()) {
      LOG.debug("Waiting for master address ZNode to be written " +
        "(Also watching cluster state node)");
      Thread.sleep(
        c.getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT));
    }
  }

  MetricsMaster getMetrics() {
    return metricsMaster;
  }

  /**
   * Main processing loop for the HMaster.
   * <ol>
   * <li>Block until becoming active master
   * <li>Finish initialization via finishInitialization(MonitoredTask)
   * <li>Enter loop until we are stopped
   * <li>Stop services and perform cleanup once stopped
   * </ol>
   */
  @Override
  public void run() {
    MonitoredTask startupStatus =
      TaskMonitor.get().createStatus("Master startup");
    startupStatus.setDescription("Master startup");
    masterStartTime = System.currentTimeMillis();
    try {
      this.masterAddressTracker = new MasterAddressTracker(getZooKeeperWatcher(), this);
      this.masterAddressTracker.start();

      // Put up info server.
      int port = this.conf.getInt("hbase.master.info.port", 60010);
      if (port >= 0) {
        String a = this.conf.get("hbase.master.info.bindAddress", "0.0.0.0");
        this.infoServer = new InfoServer(MASTER, a, port, false, this.conf);
        this.infoServer.addServlet("status", "/master-status", MasterStatusServlet.class);
        this.infoServer.addServlet("dump", "/dump", MasterDumpServlet.class);
        this.infoServer.setAttribute(MASTER, this);
        this.infoServer.start();
      }

      this.registeredZKListenersBeforeRecovery = this.zooKeeper.getListeners();
      /*
       * Block on becoming the active master.
       *
       * We race with other masters to write our address into ZooKeeper.  If we
       * succeed, we are the primary/active master and finish initialization.
       *
       * If we do not succeed, there is another active master and we should
       * now wait until it dies to try and become the next active master.  If we
       * do not succeed on our first attempt, this is no longer a cluster startup.
       */
      becomeActiveMaster(startupStatus);

      // We are either the active master or we were asked to shutdown
      if (!this.stopped) {
        finishInitialization(startupStatus, false);
        loop();
      }
    } catch (Throwable t) {
      // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
      if (t instanceof NoClassDefFoundError &&
          t.getMessage().contains("org/apache/hadoop/hdfs/protocol/FSConstants$SafeModeAction")) {
        // improved error message for this special case
        abort("HBase is having a problem with its Hadoop jars.  You may need to "
            + "recompile HBase against Hadoop version "
            + org.apache.hadoop.util.VersionInfo.getVersion()
            + " or change your hadoop jars to start properly", t);
      } else {
        abort("Unhandled exception. Starting shutdown.", t);
      }
    } finally {
      startupStatus.cleanup();

      stopChores();
      // Wait for all the remaining region servers to report in IFF we were
      // running a cluster shutdown AND we were NOT aborting.
      if (!this.abort && this.serverManager != null &&
          this.serverManager.isClusterShutdown()) {
        this.serverManager.letRegionServersShutdown();
      }
      stopServiceThreads();
      // Stop services started for both backup and active masters
      if (this.activeMasterManager != null) this.activeMasterManager.stop();
      if (this.catalogTracker != null) this.catalogTracker.stop();
      if (this.serverManager != null) this.serverManager.stop();
      if (this.assignmentManager != null) this.assignmentManager.stop();
      if (this.fileSystemManager != null) this.fileSystemManager.stop();
      if (this.mpmHost != null) this.mpmHost.stop("server shutting down.");
      this.zooKeeper.close();
    }
    LOG.info("HMaster main thread exiting");
  }

  /**
   * Try becoming active master.
   * @param startupStatus
   * @return True if we could successfully become the active master.
   * @throws InterruptedException
   */
  private boolean becomeActiveMaster(MonitoredTask startupStatus)
  throws InterruptedException {
    // TODO: This is wrong!!!! Should have new servername if we restart ourselves,
    // if we come back to life.
    this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName,
        this);
    this.zooKeeper.registerListener(activeMasterManager);
    stallIfBackupMaster(this.conf, this.activeMasterManager);

    // The ClusterStatusTracker is setup before the other
    // ZKBasedSystemTrackers because it's needed by the activeMasterManager
    // to check if the cluster should be shutdown.
    this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
    this.clusterStatusTracker.start();
    return this.activeMasterManager.blockUntilBecomingActiveMaster(startupStatus);
  }

  /**
   * Initialize all ZK based system trackers.
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  void initializeZKBasedSystemTrackers() throws IOException,
      InterruptedException, KeeperException {
    this.catalogTracker = createCatalogTracker(this.zooKeeper, this.conf, this);
    this.catalogTracker.start();

    this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
    this.loadBalancerTracker = new LoadBalancerTracker(zooKeeper, this);
    this.loadBalancerTracker.start();
    this.assignmentManager = new AssignmentManager(this, serverManager,
      this.catalogTracker, this.balancer, this.executorService, this.metricsMaster,
      this.tableLockManager);
    zooKeeper.registerListenerFirst(assignmentManager);

    this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
        this.serverManager);
    this.regionServerTracker.start();

    this.drainingServerTracker = new DrainingServerTracker(zooKeeper, this,
      this.serverManager);
    this.drainingServerTracker.start();

    // Set the cluster as up.  If new RSs, they'll be waiting on this before
    // going ahead with their startup.
    boolean wasUp = this.clusterStatusTracker.isClusterUp();
    if (!wasUp) this.clusterStatusTracker.setClusterUp();

    LOG.info("Server active/primary master=" + this.serverName +
        ", sessionid=0x" +
        Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()) +
        ", setting cluster-up flag (Was=" + wasUp + ")");

    // create/initialize the snapshot manager and other procedure managers
    this.snapshotManager = new SnapshotManager();
    this.mpmHost = new MasterProcedureManagerHost();
    this.mpmHost.register(this.snapshotManager);
    this.mpmHost.loadProcedures(conf);
    this.mpmHost.initialize(this, this.metricsMaster);
  }

  /**
   * Create CatalogTracker.
   * In its own method so tests can intercept and mock it.
   * @param zk If zk is null, we'll create an instance (and shut it down
   * when {@link #stop(String)} is called) else we'll use what is passed.
   * @param conf
   * @param abortable If fatal exception we'll call abort on this.  May be null.
   * If it is we'll use the Connection associated with the passed
   * {@link Configuration} as our {@link Abortable}.
   * ({@link Object#wait(long)} when passed a <code>0</code> waits for ever).
   * @throws IOException
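   * <p>Test sketch (hypothetical names; assumes Mockito on the classpath):
   * <pre>
   * HMaster spy = Mockito.spy(master);
   * Mockito.doReturn(mockedTracker).when(spy)
   *     .createCatalogTracker(zk, conf, abortable);
   * </pre>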
   */
  CatalogTracker createCatalogTracker(final ZooKeeperWatcher zk,
      final Configuration conf, Abortable abortable)
  throws IOException {
    return new CatalogTracker(zk, conf, abortable);
  }

  // Check if we should stop every 100ms
  private Sleeper stopSleeper = new Sleeper(100, this);

  private void loop() {
    long lastMsgTs = 0L;
    long now = 0L;
    while (!this.stopped) {
      now = System.currentTimeMillis();
      if ((now - lastMsgTs) >= this.msgInterval) {
        doMetrics();
        lastMsgTs = System.currentTimeMillis();
      }
      stopSleeper.sleep();
    }
  }

  /**
   * Emit the HMaster metrics, such as region in transition metrics.
   * Wrapped in a try block just to be sure metrics doesn't abort HMaster.
   */
  private void doMetrics() {
    try {
      this.assignmentManager.updateRegionsInTransitionMetrics();
    } catch (Throwable e) {
      LOG.error("Couldn't update metrics: " + e.getMessage());
    }
  }

  /**
   * Finish initialization of HMaster after becoming the primary master.
   *
   * <ol>
   * <li>Initialize master components - file system manager, server manager,
   *     assignment manager, region server tracker, catalog tracker, etc</li>
   * <li>Start necessary service threads - rpc server, info server,
   *     executor services, etc</li>
   * <li>Set cluster as UP in ZooKeeper</li>
   * <li>Wait for RegionServers to check-in</li>
   * <li>Split logs and perform data recovery, if necessary</li>
   * <li>Ensure assignment of meta regions</li>
   * <li>Handle either fresh cluster start or master failover</li>
   * </ol>
   *
   * @param masterRecovery
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  private void finishInitialization(MonitoredTask status, boolean masterRecovery)
  throws IOException, InterruptedException, KeeperException {

    isActiveMaster = true;

    /*
     * We are active master now... go initialize components we need to run.
     * Note, there may be dross in zk from previous runs; it'll get addressed
     * below after we determine if cluster startup or failover.
     */

    status.setStatus("Initializing Master file system");

    this.masterActiveTime = System.currentTimeMillis();
    // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
    this.fileSystemManager = new MasterFileSystem(this, this, masterRecovery);

    this.tableDescriptors =
      new FSTableDescriptors(this.fileSystemManager.getFileSystem(),
      this.fileSystemManager.getRootDir());

    // publish cluster ID
    status.setStatus("Publishing Cluster ID in ZooKeeper");
    ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());

    if (!masterRecovery) {
      this.executorService = new ExecutorService(getServerName().toShortString());
      this.serverManager = createServerManager(this, this);
    }

    // Initialize table lock manager, and ensure that all write locks held previously
    // are invalidated
    this.tableLockManager = TableLockManager.createTableLockManager(conf, zooKeeper, serverName);
    if (!masterRecovery) {
      this.tableLockManager.reapWriteLocks();
    }

    status.setStatus("Initializing ZK system trackers");
    initializeZKBasedSystemTrackers();

    if (!masterRecovery) {
      // initialize master side coprocessors before we start handling requests
      status.setStatus("Initializing master coprocessors");
      this.cpHost = new MasterCoprocessorHost(this, this.conf);

      spanReceiverHost = SpanReceiverHost.getInstance(getConfiguration());

      // start up all service threads.
      status.setStatus("Initializing master service threads");
      startServiceThreads();
    }

    // Wait for region servers to report in.
    this.serverManager.waitForRegionServers(status);
    // Check zk for region servers that are up but didn't register
    for (ServerName sn: this.regionServerTracker.getOnlineServers()) {
      // The isServerOnline check is opportunistic, correctness is handled inside
      if (!this.serverManager.isServerOnline(sn)
          && serverManager.checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
        LOG.info("Registered server found up in zk but who has not yet reported in: " + sn);
      }
    }

    if (!masterRecovery) {
      this.assignmentManager.startTimeOutMonitor();
    }

    // get a list for previously failed RS which need log splitting work
    // we recover hbase:meta region servers inside master initialization and
    // handle other failed servers in SSH in order to start up master node ASAP
    Set<ServerName> previouslyFailedServers = this.fileSystemManager
        .getFailedServersFromLogFolders();

    // remove stale recovering regions from previous run
    this.fileSystemManager.removeStaleRecoveringRegionsFromZK(previouslyFailedServers);

    // log splitting for hbase:meta server
    ServerName oldMetaServerLocation = this.catalogTracker.getMetaLocation();
    if (oldMetaServerLocation != null && previouslyFailedServers.contains(oldMetaServerLocation)) {
      splitMetaLogBeforeAssignment(oldMetaServerLocation);
      // Note: we can't remove oldMetaServerLocation from the previouslyFailedServers list
      // because it may also host user regions
    }
    Set<ServerName> previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
    // need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
    // instead of previouslyFailedMetaRSs alone to address the following two situations:
    // 1) the chained failure situation (recovery failed multiple times in a row).
    // 2) master gets killed right before it could delete the recovering hbase:meta from ZK
    // while the same server still has non-meta wals to be replayed, so that
    // removeStaleRecoveringRegionsFromZK can't delete the stale hbase:meta region
    // Passing more servers into splitMetaLog is all right. If a server doesn't have hbase:meta wal,
    // there is no op for the server.
    previouslyFailedMetaRSs.addAll(previouslyFailedServers);

    this.initializationBeforeMetaAssignment = true;

    // initialize load balancer
    this.balancer.setClusterStatus(getClusterStatus());
    this.balancer.setMasterServices(this);
    this.balancer.initialize();

    // Make sure meta assigned before proceeding.
    status.setStatus("Assigning Meta Region");
    assignMeta(status, previouslyFailedMetaRSs);
    // check if master is shutting down, because the above assignMeta could return even if
    // hbase:meta isn't assigned when master is shutting down
    if (this.stopped) return;

    status.setStatus("Submitting log splitting work for previously failed region servers");
    // Master has recovered hbase:meta region server and we put
    // other failed region servers in a queue to be handled later by SSH
    for (ServerName tmpServer : previouslyFailedServers) {
      this.serverManager.processDeadServer(tmpServer, true);
    }

    // Update meta with new PB serialization if required, i.e. migrate all HRI to PB serialization
    // in meta. This must happen before we assign all user regions or else the assignment will
    // fail.
    org.apache.hadoop.hbase.catalog.MetaMigrationConvertingToPB
      .updateMetaIfNecessary(this);

    // Fix up assignment manager status
    status.setStatus("Starting assignment manager");
    this.assignmentManager.joinCluster();

    // set cluster status again after user regions are assigned
    this.balancer.setClusterStatus(getClusterStatus());

    if (!masterRecovery) {
      // Start balancer and meta catalog janitor after meta and regions have
      // been assigned.
      status.setStatus("Starting balancer and catalog janitor");
      this.clusterStatusChore = getAndStartClusterStatusChore(this);
      this.balancerChore = getAndStartBalancerChore(this);
      this.catalogJanitorChore = new CatalogJanitor(this, this);
      startCatalogJanitorChore();
    }

    status.setStatus("Starting namespace manager");
    initNamespace();

    if (this.cpHost != null) {
      try {
        this.cpHost.preMasterInitialization();
      } catch (IOException e) {
        LOG.error("Coprocessor preMasterInitialization() hook failed", e);
      }
    }

    status.markComplete("Initialization successful");
    LOG.info("Master has completed initialization");
    initialized = true;
    // clear the dead servers with same host name and port of online server because we are not
    // removing dead server with same hostname and port of rs which is trying to check in before
    // master initialization. See HBASE-5916.
    this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();

    if (!masterRecovery) {
      if (this.cpHost != null) {
        // don't let cp initialization errors kill the master
        try {
          this.cpHost.postStartMaster();
        } catch (IOException ioe) {
          LOG.error("Coprocessor postStartMaster() hook failed", ioe);
        }
      }
    }
  }

  /**
   * Useful for testing, and for master restart scenarios.
   */
  protected void startCatalogJanitorChore() {
    Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
  }

  /**
   * Useful for testing, and for master restart scenarios.
   */
  protected void startNamespaceJanitorChore() {
    Threads.setDaemonThreadRunning(namespaceJanitorChore.getThread());
  }

  /**
   * Create a {@link ServerManager} instance.
   * @param master
   * @param services
   * @return An instance of {@link ServerManager}
   * @throws org.apache.hadoop.hbase.ZooKeeperConnectionException
   * @throws IOException
   */
  ServerManager createServerManager(final Server master,
      final MasterServices services)
  throws IOException {
    // We put this out here in a method so we can do a Mockito.spy and stub it out
    // w/ a mocked up ServerManager.
    return new ServerManager(master, services);
  }

  /**
   * Check <code>hbase:meta</code> is assigned. If not, assign it.
   * @param status MonitoredTask
   * @param previouslyFailedMetaRSs
   * @throws InterruptedException
   * @throws IOException
   * @throws KeeperException
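   * <p>The verification timeout used below can be tuned (illustrative value;
   * the default is 1000 ms):
   * <pre>
   * conf.setLong("hbase.catalog.verification.timeout", 5000);
   * </pre>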
   */
  void assignMeta(MonitoredTask status, Set<ServerName> previouslyFailedMetaRSs)
      throws InterruptedException, IOException, KeeperException {
    // Work on meta region
    int assigned = 0;
    long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000);
    status.setStatus("Assigning hbase:meta region");

    RegionStates regionStates = assignmentManager.getRegionStates();
    regionStates.createRegionState(HRegionInfo.FIRST_META_REGIONINFO);
    boolean rit = this.assignmentManager
      .processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO);
    boolean metaRegionLocation = this.catalogTracker.verifyMetaRegionLocation(timeout);
    ServerName currentMetaServer = this.catalogTracker.getMetaLocation();
    if (!metaRegionLocation) {
      // Meta location is not verified. It should be in transition, or offline.
      // We will wait for it to be assigned by enableServerShutdownHandler below.
      assigned++;
      if (!rit) {
        // Assign meta since not already in transition
        if (currentMetaServer != null) {
          // If the meta server is not known to be dead or online,
          // just split the meta log, and don't expire it since this
          // could be a full cluster restart. Otherwise, we will think
          // this is a failover and lose previous region locations.
          // If it is really a failover case, AM will find out in rebuilding
          // user regions. Otherwise, we are good since all logs are split
          // or known to be replayed before user regions are assigned.
          if (serverManager.isServerOnline(currentMetaServer)) {
            LOG.info("Forcing expire of " + currentMetaServer);
            serverManager.expireServer(currentMetaServer);
          }
          splitMetaLogBeforeAssignment(currentMetaServer);
          previouslyFailedMetaRSs.add(currentMetaServer);
        }
        assignmentManager.assignMeta();
      }
    } else {
      // Region already assigned. We didn't assign it. Add to in-memory state.
      regionStates.updateRegionState(
        HRegionInfo.FIRST_META_REGIONINFO, State.OPEN, currentMetaServer);
      this.assignmentManager.regionOnline(
        HRegionInfo.FIRST_META_REGIONINFO, currentMetaServer);
    }

    enableMeta(TableName.META_TABLE_NAME);

    if (this.distributedLogReplay && (!previouslyFailedMetaRSs.isEmpty())) {
      // distributed log replay mode requires that the new hbase:meta RS be assigned first
      status.setStatus("replaying log for Meta Region");
      this.fileSystemManager.splitMetaLog(previouslyFailedMetaRSs);
    }

    // Make sure a hbase:meta location is set. We need to enable SSH here since
    // if the meta region server has died at this time, we need it to be re-assigned
    // by SSH so that system tables can be assigned.
    // No need to wait for meta (assigned == 0) when meta was just verified.
    enableServerShutdownHandler(assigned != 0);

    LOG.info("hbase:meta assigned=" + assigned + ", rit=" + rit +
      ", location=" + catalogTracker.getMetaLocation());
    status.setStatus("META assigned.");
  }

  void initNamespace() throws IOException {
    // create namespace manager
    tableNamespaceManager = new TableNamespaceManager(this);
    tableNamespaceManager.start();
  }

  private void splitMetaLogBeforeAssignment(ServerName currentMetaServer) throws IOException {
    if (this.distributedLogReplay) {
      // In log replay mode, we mark hbase:meta region as recovering in ZK
      Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
      regions.add(HRegionInfo.FIRST_META_REGIONINFO);
      this.fileSystemManager.prepareLogReplay(currentMetaServer, regions);
    } else {
      // In recovered.edits mode: create recovered edits file for hbase:meta server
      this.fileSystemManager.splitMetaLog(currentMetaServer);
    }
  }

  private void enableServerShutdownHandler(
      final boolean waitForMeta) throws IOException, InterruptedException {
    // If ServerShutdownHandler is disabled, we enable it and expire those dead
    // but not expired servers. This is required so that if meta is being assigned to
    // a server which dies after assignMeta starts the assignment,
    // SSH can re-assign it. Otherwise, we will be
    // stuck here waiting forever if waitForMeta is specified.
    if (!serverShutdownHandlerEnabled) {
      serverShutdownHandlerEnabled = true;
      this.serverManager.processQueuedDeadServers();
    }

    if (waitForMeta) {
      this.catalogTracker.waitForMeta();
      // The above check waits for general meta availability but this does not
      // guarantee that the transition has completed
      this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
    }
  }

  private void enableMeta(TableName metaTableName) {
    if (!this.assignmentManager.getZKTable().isEnabledTable(metaTableName)) {
      this.assignmentManager.setEnabledTable(metaTableName);
    }
  }

  /**
   * Returns the set of region server names recorded under the hbase:meta
   * recovering-region ZK node.
   * @return Set of meta server names which were recorded in ZK
   * @throws KeeperException
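   * <p>For example (hypothetical layout, assuming the default znode parent
   * {@code /hbase}), the children read here would live under
   * {@code /hbase/recovering-regions/1588230740/host,port,startcode},
   * where {@code 1588230740} is the encoded name of the hbase:meta region.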
   */
  private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
    Set<ServerName> result = new HashSet<ServerName>();
    String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.recoveringRegionsZNode,
      HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
    List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
    if (regionFailedServers == null) return result;

    for (String failedServer : regionFailedServers) {
      ServerName server = ServerName.parseServerName(failedServer);
      result.add(server);
    }
    return result;
  }

  @Override
  public TableDescriptors getTableDescriptors() {
    return this.tableDescriptors;
  }

  /** @return InfoServer object. Maybe null. */
  public InfoServer getInfoServer() {
    return this.infoServer;
  }

  @Override
  public Configuration getConfiguration() {
    return this.conf;
  }

  @Override
  public ServerManager getServerManager() {
    return this.serverManager;
  }

  @Override
  public ExecutorService getExecutorService() {
    return this.executorService;
  }

  @Override
  public MasterFileSystem getMasterFileSystem() {
    return this.fileSystemManager;
  }

  /**
   * Get the ZK wrapper object - needed by master_jsp.java
   * @return the zookeeper wrapper
   */
  public ZooKeeperWatcher getZooKeeperWatcher() {
    return this.zooKeeper;
  }

  public ActiveMasterManager getActiveMasterManager() {
    return this.activeMasterManager;
  }

  public MasterAddressTracker getMasterAddressTracker() {
    return this.masterAddressTracker;
  }

1168   /*
1169    * Start up all services. If any of these threads gets an unhandled exception
1170    * then they just die with a logged message.  This should be fine because
1171    * in general, we do not expect the master to get such unhandled exceptions
1172    * as OOMEs; it should be lightly loaded. See what HRegionServer does if you
1173    * need to install an unhandled exception handler.
1174    */
1175   void startServiceThreads() throws IOException {
1176     // Start the executor service pools
1177     this.executorService.startExecutorService(ExecutorType.MASTER_OPEN_REGION,
1178       conf.getInt("hbase.master.executor.openregion.threads", 5));
1179     this.executorService.startExecutorService(ExecutorType.MASTER_CLOSE_REGION,
1180       conf.getInt("hbase.master.executor.closeregion.threads", 5));
1181     this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
1182       conf.getInt("hbase.master.executor.serverops.threads", 5));
1183     this.executorService.startExecutorService(ExecutorType.MASTER_META_SERVER_OPERATIONS,
1184       conf.getInt("hbase.master.executor.serverops.threads", 5));
1185     this.executorService.startExecutorService(ExecutorType.M_LOG_REPLAY_OPS,
1186       conf.getInt("hbase.master.executor.logreplayops.threads", 10));
1187 
1188     // We depend on there being only one instance of this executor running
1189     // at a time.  To allow concurrency, we would need fencing of enable/disable
1190     // of tables.
1191     // Any time you change this maxThreads to > 1, please see the comment at
1192     // AccessController#postCreateTableHandler
1193     this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS, 1);
1194 
1195     // Start the log cleaner thread
1196     String n = Thread.currentThread().getName();
1197     int cleanerInterval = conf.getInt("hbase.master.cleaner.interval", 60 * 1000);
1198     this.logCleaner =
1199       new LogCleaner(cleanerInterval,
1200         this, conf, getMasterFileSystem().getFileSystem(),
1201         getMasterFileSystem().getOldLogDir());
1202     Threads.setDaemonThreadRunning(logCleaner.getThread(), n + ".oldLogCleaner");
1203 
1204     // Start the HFile archive cleaner thread
1205     Path archiveDir = HFileArchiveUtil.getArchivePath(conf);
1206     this.hfileCleaner = new HFileCleaner(cleanerInterval, this, conf, getMasterFileSystem()
1207         .getFileSystem(), archiveDir);
1208     Threads.setDaemonThreadRunning(hfileCleaner.getThread(), n + ".archivedHFileCleaner");
1209 
1210     // Start the health checker
1211     if (this.healthCheckChore != null) {
1212       Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker");
1213     }
1214 
1215     // Start allowing requests to happen.
1216     this.rpcServer.openServer();
1217     this.rpcServerOpen = true;
1218     if (LOG.isTraceEnabled()) {
1219       LOG.trace("Started service threads");
1220     }
1221   }
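
  // Configuration sketch (illustrative values, not recommendations): every
  // pool size above is read straight from the conf, so the executors can be
  // resized from hbase-site.xml. The keys and defaults are the ones used in
  // the calls above (5 threads each, 10 for log replay, 60s cleaner interval):
  //
  //   <property>
  //     <name>hbase.master.executor.openregion.threads</name>
  //     <value>10</value>
  //   </property>
  //   <property>
  //     <name>hbase.master.cleaner.interval</name>
  //     <value>120000</value>   <!-- log/HFile cleaner period, in milliseconds -->
  //   </property>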
1222 
1223   /**
1224    * Use this when trying to figure out when it is OK to send in RPCs.  Used by tests.
1225    * @return True if we have successfully run {@link RpcServer#openServer()}
1226    */
1227   boolean isRpcServerOpen() {
1228     return this.rpcServerOpen;
1229   }
1230 
1231   private void stopServiceThreads() {
1232     if (LOG.isDebugEnabled()) {
1233       LOG.debug("Stopping service threads");
1234     }
1235     if (this.rpcServer != null) this.rpcServer.stop();
1236     this.rpcServerOpen = false;
1237     // Clean up and close up shop
1238     if (this.logCleaner!= null) this.logCleaner.interrupt();
1239     if (this.hfileCleaner != null) this.hfileCleaner.interrupt();
1240 
1241     if (this.infoServer != null) {
1242       LOG.info("Stopping infoServer");
1243       try {
1244         this.infoServer.stop();
1245       } catch (Exception ex) {
1246         LOG.error("Failed to stop infoServer", ex);
1247       }
1248     }
1249     if (this.executorService != null) this.executorService.shutdown();
1250     if (this.healthCheckChore != null) {
1251       this.healthCheckChore.interrupt();
1252     }
1253     if (this.pauseMonitor != null) {
1254       this.pauseMonitor.stop();
1255     }
1256   }
1257 
1258   private static Thread getAndStartClusterStatusChore(HMaster master) {
1259     if (master == null || master.balancer == null) {
1260       return null;
1261     }
1262     Chore chore = new ClusterStatusChore(master, master.balancer);
1263     return Threads.setDaemonThreadRunning(chore.getThread());
1264   }
1265 
1266   private static Thread getAndStartBalancerChore(final HMaster master) {
1267     // Start up the load balancer chore
1268     Chore chore = new BalancerChore(master);
1269     return Threads.setDaemonThreadRunning(chore.getThread());
1270   }
1271 
1272   private void stopChores() {
1273     if (this.balancerChore != null) {
1274       this.balancerChore.interrupt();
1275     }
1276     if (this.clusterStatusChore != null) {
1277       this.clusterStatusChore.interrupt();
1278     }
1279     if (this.catalogJanitorChore != null) {
1280       this.catalogJanitorChore.interrupt();
1281     }
1282     if (this.clusterStatusPublisherChore != null){
1283       clusterStatusPublisherChore.interrupt();
1284     }
1285     if (this.namespaceJanitorChore != null){
1286       namespaceJanitorChore.interrupt();
1287     }
1288   }
1289 
1290   @Override
1291   public RegionServerStartupResponse regionServerStartup(
1292       RpcController controller, RegionServerStartupRequest request) throws ServiceException {
1293     // Register with server manager
1294     try {
1295       InetAddress ia = getRemoteInetAddress(request.getPort(), request.getServerStartCode());
1296       ServerName rs = this.serverManager.regionServerStartup(ia, request.getPort(),
1297         request.getServerStartCode(), request.getServerCurrentTime());
1298 
1299       // Send back some config info
1300       RegionServerStartupResponse.Builder resp = createConfigurationSubset();
1301       NameStringPair.Builder entry = NameStringPair.newBuilder()
1302         .setName(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)
1303         .setValue(rs.getHostname());
1304       resp.addMapEntries(entry.build());
1305 
1306       return resp.build();
1307     } catch (IOException ioe) {
1308       throw new ServiceException(ioe);
1309     }
1310   }
1311 
1312   /**
1313    * @return the remote side's InetAddress
1314    * @throws UnknownHostException
1315    */
1316   InetAddress getRemoteInetAddress(final int port, final long serverStartCode)
1317   throws UnknownHostException {
1318     // Do it out here in its own little method so we can fake an address when
1319     // mocking up in tests.
1320     return RpcServer.getRemoteIp();
1321   }
1322 
1323   /**
1324    * @return Subset of configuration to pass to initializing regionservers, e.g.
1325    * the filesystem and root directory to use.
1326    */
1327   protected RegionServerStartupResponse.Builder createConfigurationSubset() {
1328     RegionServerStartupResponse.Builder resp = addConfig(
1329       RegionServerStartupResponse.newBuilder(), HConstants.HBASE_DIR);
1330     return addConfig(resp, "fs.default.name");
1331   }
1332 
1333   private RegionServerStartupResponse.Builder addConfig(
1334       final RegionServerStartupResponse.Builder resp, final String key) {
1335     NameStringPair.Builder entry = NameStringPair.newBuilder()
1336       .setName(key)
1337       .setValue(this.conf.get(key));
1338     resp.addMapEntries(entry.build());
1339     return resp;
1340   }
1341 
1342   @Override
1343   public GetLastFlushedSequenceIdResponse getLastFlushedSequenceId(RpcController controller,
1344       GetLastFlushedSequenceIdRequest request) throws ServiceException {
1345     byte[] regionName = request.getRegionName().toByteArray();
1346     long seqId = serverManager.getLastFlushedSequenceId(regionName);
1347     return ResponseConverter.buildGetLastFlushedSequenceIdResponse(seqId);
1348   }
1349 
1350   @Override
1351   public RegionServerReportResponse regionServerReport(
1352       RpcController controller, RegionServerReportRequest request) throws ServiceException {
1353     try {
1354       ClusterStatusProtos.ServerLoad sl = request.getLoad();
1355       ServerName serverName = ProtobufUtil.toServerName(request.getServer());
1356       ServerLoad oldLoad = serverManager.getLoad(serverName);
1357       this.serverManager.regionServerReport(serverName, new ServerLoad(sl));
1358       if (sl != null && this.metricsMaster != null) {
1359         // Up our metrics.
1360         this.metricsMaster.incrementRequests(sl.getTotalNumberOfRequests()
1361           - (oldLoad != null ? oldLoad.getTotalNumberOfRequests() : 0));
1362       }
1363     } catch (IOException ioe) {
1364       throw new ServiceException(ioe);
1365     }
1366 
1367     return RegionServerReportResponse.newBuilder().build();
1368   }
1369 
1370   @Override
1371   public ReportRSFatalErrorResponse reportRSFatalError(
1372       RpcController controller, ReportRSFatalErrorRequest request) throws ServiceException {
1373     String errorText = request.getErrorMessage();
1374     ServerName sn = ProtobufUtil.toServerName(request.getServer());
1375     String msg = "Region server " + sn +
1376       " reported a fatal error:\n" + errorText;
1377     LOG.error(msg);
1378     rsFatals.add(msg);
1379 
1380     return ReportRSFatalErrorResponse.newBuilder().build();
1381   }
1382 
1383   public boolean isMasterRunning() {
1384     return !isStopped();
1385   }
1386 
1387   @Override
1388   public IsMasterRunningResponse isMasterRunning(RpcController c, IsMasterRunningRequest req)
1389   throws ServiceException {
1390     return IsMasterRunningResponse.newBuilder().setIsMasterRunning(isMasterRunning()).build();
1391   }
1392 
1393   @Override
1394   public RunCatalogScanResponse runCatalogScan(RpcController c,
1395       RunCatalogScanRequest req) throws ServiceException {
1396     try {
1397       return ResponseConverter.buildRunCatalogScanResponse(catalogJanitorChore.scan());
1398     } catch (IOException ioe) {
1399       throw new ServiceException(ioe);
1400     }
1401   }
1402 
1403   @Override
1404   public EnableCatalogJanitorResponse enableCatalogJanitor(RpcController c,
1405       EnableCatalogJanitorRequest req) throws ServiceException {
1406     return EnableCatalogJanitorResponse.newBuilder().
1407         setPrevValue(catalogJanitorChore.setEnabled(req.getEnable())).build();
1408   }
1409 
1410   @Override
1411   public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(RpcController c,
1412       IsCatalogJanitorEnabledRequest req) throws ServiceException {
1413     boolean isEnabled = catalogJanitorChore != null ? catalogJanitorChore.getEnabled() : false;
1414     return IsCatalogJanitorEnabledResponse.newBuilder().setValue(isEnabled).build();
1415   }
1416 
1417   /**
1418    * @return Maximum time, in milliseconds, that the balancer should run for
1419    */
1420   private int getBalancerCutoffTime() {
1421     int balancerCutoffTime =
1422       getConfiguration().getInt("hbase.balancer.max.balancing", -1);
1423     if (balancerCutoffTime == -1) {
1424       // No time period set so create one
1425       int balancerPeriod =
1426         getConfiguration().getInt("hbase.balancer.period", 300000);
1427       balancerCutoffTime = balancerPeriod;
1428       // If nonsense period, set it to balancerPeriod
1429       if (balancerCutoffTime <= 0) balancerCutoffTime = balancerPeriod;
1430     }
1431     return balancerCutoffTime;
1432   }
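
  // Worked example of the fallback above, assuming default configuration:
  // with hbase.balancer.max.balancing unset (-1), the cutoff falls back to
  // hbase.balancer.period (default 300000 ms), so a single balance() run
  // stops issuing new RegionPlans roughly five minutes after it starts.
  // Setting hbase.balancer.max.balancing=60000 would instead cap each run
  // at one minute, independent of how often the BalancerChore fires.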
1433 
1434   public boolean balance() throws HBaseIOException {
1435     // if master not initialized, don't run balancer.
1436     if (!this.initialized) {
1437       LOG.debug("Master has not been initialized, don't run balancer.");
1438       return false;
1439     }
1440     // Do this call outside of synchronized block.
1441     int maximumBalanceTime = getBalancerCutoffTime();
1442     boolean balancerRan;
1443     synchronized (this.balancer) {
1444       // If balance not true, don't run balancer.
1445       if (!this.loadBalancerTracker.isBalancerOn()) return false;
1446       // Only allow one balance run at a time.
1447       if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {
1448         Map<String, RegionState> regionsInTransition =
1449           this.assignmentManager.getRegionStates().getRegionsInTransition();
1450         LOG.debug("Not running balancer because " + regionsInTransition.size() +
1451           " region(s) in transition: " + org.apache.commons.lang.StringUtils.
1452             abbreviate(regionsInTransition.toString(), 256));
1453         return false;
1454       }
1455       if (this.serverManager.areDeadServersInProgress()) {
1456         LOG.debug("Not running balancer because processing dead regionserver(s): " +
1457           this.serverManager.getDeadServers());
1458         return false;
1459       }
1460 
1461       if (this.cpHost != null) {
1462         try {
1463           if (this.cpHost.preBalance()) {
1464             LOG.debug("Coprocessor bypassing balancer request");
1465             return false;
1466           }
1467         } catch (IOException ioe) {
1468           LOG.error("Error invoking master coprocessor preBalance()", ioe);
1469           return false;
1470         }
1471       }
1472 
1473       Map<TableName, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =
1474         this.assignmentManager.getRegionStates().getAssignmentsByTable();
1475 
1476       List<RegionPlan> plans = new ArrayList<RegionPlan>();
1477       //Give the balancer the current cluster state.
1478       this.balancer.setClusterStatus(getClusterStatus());
1479       for (Map<ServerName, List<HRegionInfo>> assignments : assignmentsByTable.values()) {
1480         List<RegionPlan> partialPlans = this.balancer.balanceCluster(assignments);
1481         if (partialPlans != null) plans.addAll(partialPlans);
1482       }
1483       long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
1484       int rpCount = 0;  // number of RegionPlans balanced so far
1485       long totalRegPlanExecTime = 0;
1486       balancerRan = plans != null;
1487       if (plans != null && !plans.isEmpty()) {
1488         for (RegionPlan plan: plans) {
1489           LOG.info("balance " + plan);
1490           long balStartTime = System.currentTimeMillis();
1491           //TODO: bulk assign
1492           this.assignmentManager.balance(plan);
1493           totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;
1494           rpCount++;
1495           if (rpCount < plans.size() &&
1496               // if performing next balance exceeds cutoff time, exit the loop
1497               (System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {
1498             // TODO: After balance, there should not be a cutoff time (keeping it as a safety net for now)
1499             LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +
1500               maximumBalanceTime);
1501             break;
1502           }
1503         }
1504       }
1505       if (this.cpHost != null) {
1506         try {
1507           this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);
1508         } catch (IOException ioe) {
1509           // balancing already succeeded so don't change the result
1510           LOG.error("Error invoking master coprocessor postBalance()", ioe);
1511         }
1512       }
1513     }
1514     return balancerRan;
1515   }
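
  // Client-side usage sketch (illustrative; not part of this class): callers
  // normally reach the balance() above through HBaseAdmin, whose balancer()
  // call becomes the BalanceRequest handled by the RPC method just below.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   HBaseAdmin admin = new HBaseAdmin(conf);
  //   try {
  //     // false when the balancer is off, regions are in transition,
  //     // or dead servers are still being processed (see checks above)
  //     boolean ran = admin.balancer();
  //   } finally {
  //     admin.close();
  //   }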
1516 
1517   @Override
1518   public BalanceResponse balance(RpcController c, BalanceRequest request) throws ServiceException {
1519     try {
1520       return BalanceResponse.newBuilder().setBalancerRan(balance()).build();
1521     } catch (HBaseIOException ex) {
1522       throw new ServiceException(ex);
1523     }
1524   }
1525 
1526   enum BalanceSwitchMode {
1527     SYNC,
1528     ASYNC
1529   }
1530 
1531   /**
1532    * Sets the balancer on/off switch according to the given BalanceSwitchMode
1533    * @param b new balancer switch
1534    * @param mode BalanceSwitchMode
1535    * @return old balancer switch
1536    */
1537   public boolean switchBalancer(final boolean b, BalanceSwitchMode mode) throws IOException {
1538     boolean oldValue = this.loadBalancerTracker.isBalancerOn();
1539     boolean newValue = b;
1540     try {
1541       if (this.cpHost != null) {
1542         newValue = this.cpHost.preBalanceSwitch(newValue);
1543       }
1544       try {
1545         if (mode == BalanceSwitchMode.SYNC) {
1546           synchronized (this.balancer) {
1547             this.loadBalancerTracker.setBalancerOn(newValue);
1548           }
1549         } else {
1550           this.loadBalancerTracker.setBalancerOn(newValue);
1551         }
1552       } catch (KeeperException ke) {
1553         throw new IOException(ke);
1554       }
1555       LOG.info(getClientIdAuditPrefix() + " set balanceSwitch=" + newValue);
1556       if (this.cpHost != null) {
1557         this.cpHost.postBalanceSwitch(oldValue, newValue);
1558       }
1559     } catch (IOException ioe) {
1560       LOG.warn("Error flipping balance switch", ioe);
1561     }
1562     return oldValue;
1563   }
1564 
1565   /**
1566    * @return Client info for use as prefix on an audit log string; who did an action
1567    */
1568   String getClientIdAuditPrefix() {
1569     return "Client=" + RequestContext.getRequestUserName() + "/" +
1570       RequestContext.get().getRemoteAddress();
1571   }
1572 
1573   public boolean synchronousBalanceSwitch(final boolean b) throws IOException {
1574     return switchBalancer(b, BalanceSwitchMode.SYNC);
1575   }
1576 
1577   public boolean balanceSwitch(final boolean b) throws IOException {
1578     return switchBalancer(b, BalanceSwitchMode.ASYNC);
1579   }
1580 
1581   @Override
1582   public SetBalancerRunningResponse setBalancerRunning(
1583       RpcController controller, SetBalancerRunningRequest req) throws ServiceException {
1584     try {
1585       boolean prevValue = (req.getSynchronous())?
1586         synchronousBalanceSwitch(req.getOn()):balanceSwitch(req.getOn());
1587       return SetBalancerRunningResponse.newBuilder().setPrevBalanceValue(prevValue).build();
1588     } catch (IOException ioe) {
1589       throw new ServiceException(ioe);
1590     }
1591   }
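
  // Client-side sketch (illustrative): HBaseAdmin#setBalancerRunning(on,
  // synchronous) drives the RPC above; synchronous=true maps to
  // synchronousBalanceSwitch(), which takes the balancer lock before
  // flipping the flag in ZK.
  //
  //   boolean previous = admin.setBalancerRunning(false, true);  // turn the balancer off, synchronously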
1592 
1593   /**
1594    * Switch for the background CatalogJanitor thread.
1595    * Used for testing.  The thread will continue to run.  It will just be a noop
1596    * if disabled.
1597    * @param b If false, the catalog janitor won't do anything.
1598    */
1599   public void setCatalogJanitorEnabled(final boolean b) {
1600     this.catalogJanitorChore.setEnabled(b);
1601   }
1602 
1603   @Override
1604   public DispatchMergingRegionsResponse dispatchMergingRegions(
1605       RpcController controller, DispatchMergingRegionsRequest request)
1606       throws ServiceException {
1607     final byte[] encodedNameOfRegionA = request.getRegionA().getValue()
1608         .toByteArray();
1609     final byte[] encodedNameOfRegionB = request.getRegionB().getValue()
1610         .toByteArray();
1611     final boolean forcible = request.getForcible();
1612     if (request.getRegionA().getType() != RegionSpecifierType.ENCODED_REGION_NAME
1613         || request.getRegionB().getType() != RegionSpecifierType.ENCODED_REGION_NAME) {
1614       LOG.warn("mergeRegions specifier type: expected: "
1615           + RegionSpecifierType.ENCODED_REGION_NAME + " actual: region_a="
1616           + request.getRegionA().getType() + ", region_b="
1617           + request.getRegionB().getType());
1618     }
1619     RegionState regionStateA = assignmentManager.getRegionStates()
1620         .getRegionState(Bytes.toString(encodedNameOfRegionA));
1621     RegionState regionStateB = assignmentManager.getRegionStates()
1622         .getRegionState(Bytes.toString(encodedNameOfRegionB));
1623     if (regionStateA == null || regionStateB == null) {
1624       throw new ServiceException(new UnknownRegionException(
1625           Bytes.toStringBinary(regionStateA == null ? encodedNameOfRegionA
1626               : encodedNameOfRegionB)));
1627     }
1628 
1629     if (!regionStateA.isOpened() || !regionStateB.isOpened()) {
1630       throw new ServiceException(new MergeRegionException(
1631         "Unable to merge regions not online " + regionStateA + ", " + regionStateB));
1632     }
1633 
1634     HRegionInfo regionInfoA = regionStateA.getRegion();
1635     HRegionInfo regionInfoB = regionStateB.getRegion();
1636     if (regionInfoA.compareTo(regionInfoB) == 0) {
1637       throw new ServiceException(new MergeRegionException(
1638         "Unable to merge a region to itself " + regionInfoA + ", " + regionInfoB));
1639     }
1640 
1641     if (!forcible && !HRegionInfo.areAdjacent(regionInfoA, regionInfoB)) {
1642       throw new ServiceException(new MergeRegionException(
1643         "Unable to merge non-adjacent regions "
1644           + regionInfoA.getRegionNameAsString() + ", "
1645           + regionInfoB.getRegionNameAsString()
1646           + " where forcible = " + forcible));
1647     }
1648 
1649     try {
1650       dispatchMergingRegions(regionInfoA, regionInfoB, forcible);
1651     } catch (IOException ioe) {
1652       throw new ServiceException(ioe);
1653     }
1654 
1655     return DispatchMergingRegionsResponse.newBuilder().build();
1656   }
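
  // Client-side sketch (hedged: admin-API details vary by release): a merge
  // is normally requested through HBaseAdmin with the two regions' encoded
  // names, which arrives at the RPC method above.
  //
  //   admin.mergeRegions(Bytes.toBytes(encodedNameA), Bytes.toBytes(encodedNameB),
  //     false);  // forcible=false rejects non-adjacent regions, per the check above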
1657 
1658   @Override
1659   public void dispatchMergingRegions(final HRegionInfo region_a,
1660       final HRegionInfo region_b, final boolean forcible) throws IOException {
1661     checkInitialized();
1662     this.executorService.submit(new DispatchMergingRegionHandler(this,
1663         this.catalogJanitorChore, region_a, region_b, forcible));
1664   }
1665 
1666   @Override
1667   public MoveRegionResponse moveRegion(RpcController controller, MoveRegionRequest req)
1668   throws ServiceException {
1669     final byte [] encodedRegionName = req.getRegion().getValue().toByteArray();
1670     RegionSpecifierType type = req.getRegion().getType();
1671     final byte [] destServerName = (req.hasDestServerName())?
1672       Bytes.toBytes(ProtobufUtil.toServerName(req.getDestServerName()).getServerName()):null;
1673     MoveRegionResponse mrr = MoveRegionResponse.newBuilder().build();
1674 
1675     if (type != RegionSpecifierType.ENCODED_REGION_NAME) {
1676       LOG.warn("moveRegion specifier type: expected: " + RegionSpecifierType.ENCODED_REGION_NAME
1677         + " actual: " + type);
1678     }
1679 
1680     try {
1681       move(encodedRegionName, destServerName);
1682     } catch (HBaseIOException ioe) {
1683       throw new ServiceException(ioe);
1684     }
1685     return mrr;
1686   }
1687 
1688   void move(final byte[] encodedRegionName,
1689       final byte[] destServerName) throws HBaseIOException {
1690     RegionState regionState = assignmentManager.getRegionStates().
1691       getRegionState(Bytes.toString(encodedRegionName));
1692     if (regionState == null) {
1693       throw new UnknownRegionException(Bytes.toStringBinary(encodedRegionName));
1694     }
1695 
1696     HRegionInfo hri = regionState.getRegion();
1697     ServerName dest;
1698     if (destServerName == null || destServerName.length == 0) {
1699       LOG.info("Passed destination servername is null/empty so " +
1700         "choosing a server at random");
1701       final List<ServerName> destServers = this.serverManager.createDestinationServersList(
1702         regionState.getServerName());
1703       dest = balancer.randomAssignment(hri, destServers);
1704     } else {
1705       dest = ServerName.valueOf(Bytes.toString(destServerName));
1706       if (dest.equals(regionState.getServerName())) {
1707         LOG.debug("Skipping move of region " + hri.getRegionNameAsString()
1708           + " because region already assigned to the same server " + dest + ".");
1709         return;
1710       }
1711     }
1712 
1713     // Now we can do the move
1714     RegionPlan rp = new RegionPlan(hri, regionState.getServerName(), dest);
1715 
1716     try {
1717       checkInitialized();
1718       if (this.cpHost != null) {
1719         if (this.cpHost.preMove(hri, rp.getSource(), rp.getDestination())) {
1720           return;
1721         }
1722       }
1723       LOG.info(getClientIdAuditPrefix() + " move " + rp + ", running balancer");
1724       this.assignmentManager.balance(rp);
1725       if (this.cpHost != null) {
1726         this.cpHost.postMove(hri, rp.getSource(), rp.getDestination());
1727       }
1728     } catch (IOException ioe) {
1729       if (ioe instanceof HBaseIOException) {
1730         throw (HBaseIOException)ioe;
1731       }
1732       throw new HBaseIOException(ioe);
1733     }
1734   }
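
  // Client-side sketch (illustrative): HBaseAdmin#move feeds this method.
  // Passing a null destination takes the random-assignment branch above;
  // the server name below is made up.
  //
  //   admin.move(Bytes.toBytes(hri.getEncodedName()), null);  // let the balancer pick a server
  //   admin.move(Bytes.toBytes(hri.getEncodedName()),
  //     Bytes.toBytes("rs1.example.com,60020,1380000000000"));  // explicit destination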
1735 
1736   @Override
1737   public void createTable(HTableDescriptor hTableDescriptor,
1738     byte [][] splitKeys)
1739   throws IOException {
1740     if (!isMasterRunning()) {
1741       throw new MasterNotRunningException();
1742     }
1743 
1744     String namespace = hTableDescriptor.getTableName().getNamespaceAsString();
1745     getNamespaceDescriptor(namespace); // ensure namespace exists
1746 
1747     HRegionInfo[] newRegions = getHRegionInfos(hTableDescriptor, splitKeys);
1748     checkInitialized();
1749     checkCompression(hTableDescriptor);
1750     if (cpHost != null) {
1751       cpHost.preCreateTable(hTableDescriptor, newRegions);
1752     }
1753     LOG.info(getClientIdAuditPrefix() + " create " + hTableDescriptor);
1754     this.executorService.submit(new CreateTableHandler(this,
1755       this.fileSystemManager, hTableDescriptor, conf,
1756       newRegions, this).prepare());
1757     if (cpHost != null) {
1758       cpHost.postCreateTable(hTableDescriptor, newRegions);
1759     }
1760 
1761   }
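
  // Client-side sketch (illustrative table and family names): a pre-split
  // table created through HBaseAdmin ends up in the createTable() above with
  // the same split keys.
  //
  //   HTableDescriptor htd = new HTableDescriptor(TableName.valueOf("t1"));
  //   htd.addFamily(new HColumnDescriptor("f1"));
  //   byte[][] splits = { Bytes.toBytes("m") };  // two regions: [null,"m") and ["m",null)
  //   admin.createTable(htd, splits);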
1762 
1763   private void checkCompression(final HTableDescriptor htd)
1764   throws IOException {
1765     if (!this.masterCheckCompression) return;
1766     for (HColumnDescriptor hcd : htd.getColumnFamilies()) {
1767       checkCompression(hcd);
1768     }
1769   }
1770 
1771   private void checkCompression(final HColumnDescriptor hcd)
1772   throws IOException {
1773     if (!this.masterCheckCompression) return;
1774     CompressionTest.testCompression(hcd.getCompression());
1775     CompressionTest.testCompression(hcd.getCompactionCompression());
1776   }
1777 
1778   @Override
1779   public CreateTableResponse createTable(RpcController controller, CreateTableRequest req)
1780   throws ServiceException {
1781     HTableDescriptor hTableDescriptor = HTableDescriptor.convert(req.getTableSchema());
1782     byte [][] splitKeys = ProtobufUtil.getSplitKeysArray(req);
1783     try {
1784       createTable(hTableDescriptor,splitKeys);
1785     } catch (IOException ioe) {
1786       throw new ServiceException(ioe);
1787     }
1788     return CreateTableResponse.newBuilder().build();
1789   }
1790 
1791   private HRegionInfo[] getHRegionInfos(HTableDescriptor hTableDescriptor,
1792     byte[][] splitKeys) {
1793     HRegionInfo[] hRegionInfos = null;
1794     if (splitKeys == null || splitKeys.length == 0) {
1795       hRegionInfos = new HRegionInfo[]{
1796           new HRegionInfo(hTableDescriptor.getTableName(), null, null)};
1797     } else {
1798       int numRegions = splitKeys.length + 1;
1799       hRegionInfos = new HRegionInfo[numRegions];
1800       byte[] startKey = null;
1801       byte[] endKey = null;
1802       for (int i = 0; i < numRegions; i++) {
1803         endKey = (i == splitKeys.length) ? null : splitKeys[i];
1804         hRegionInfos[i] =
1805             new HRegionInfo(hTableDescriptor.getTableName(), startKey, endKey);
1806         startKey = endKey;
1807       }
1808     }
1809     return hRegionInfos;
1810   }
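
  // Worked example of the loop above: splitKeys = {"b", "m"} gives
  // numRegions = 3 with boundaries
  //
  //   region 0: [null, "b")   region 1: ["b", "m")   region 2: ["m", null)
  //
  // i.e. the first region has an open start key, the last an open end key,
  // and each split key ends one region and starts the next.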
1811 
1812   private static boolean isCatalogTable(final TableName tableName) {
1813     return tableName.equals(TableName.META_TABLE_NAME);
1814   }
1815 
1816   @Override
1817   public void deleteTable(final TableName tableName) throws IOException {
1818     checkInitialized();
1819     if (cpHost != null) {
1820       cpHost.preDeleteTable(tableName);
1821     }
1822     LOG.info(getClientIdAuditPrefix() + " delete " + tableName);
1823     this.executorService.submit(new DeleteTableHandler(tableName, this, this).prepare());
1824     if (cpHost != null) {
1825       cpHost.postDeleteTable(tableName);
1826     }
1827   }
1828 
1829   @Override
1830   public DeleteTableResponse deleteTable(RpcController controller, DeleteTableRequest request)
1831   throws ServiceException {
1832     try {
1833       deleteTable(ProtobufUtil.toTableName(request.getTableName()));
1834     } catch (IOException ioe) {
1835       throw new ServiceException(ioe);
1836     }
1837     return DeleteTableResponse.newBuilder().build();
1838   }
1839 
1840   /**
1841    * Get the number of regions of the table that have been updated by the alter.
1842    *
1843    * @return Pair indicating progress: Pair.getFirst() is the number of regions
1844    *         that are yet to be updated; Pair.getSecond() is the total number
1845    *         of regions of the table
1846    * @throws IOException
1847    */
1848   @Override
1849   public GetSchemaAlterStatusResponse getSchemaAlterStatus(
1850       RpcController controller, GetSchemaAlterStatusRequest req) throws ServiceException {
1851     // TODO: currently, we query using the table name on the client side. this
1852     // may overlap with other table operations or the table operation may
1853     // have completed before querying this API. We need to refactor to a
1854     // transaction system in the future to avoid these ambiguities.
1855     TableName tableName = ProtobufUtil.toTableName(req.getTableName());
1856 
1857     try {
1858       Pair<Integer,Integer> pair = this.assignmentManager.getReopenStatus(tableName);
1859       GetSchemaAlterStatusResponse.Builder ret = GetSchemaAlterStatusResponse.newBuilder();
1860       ret.setYetToUpdateRegions(pair.getFirst());
1861       ret.setTotalRegions(pair.getSecond());
1862       return ret.build();
1863     } catch (IOException ioe) {
1864       throw new ServiceException(ioe);
1865     }
1866   }
1867 
1868   @Override
1869   public void addColumn(final TableName tableName, final HColumnDescriptor column)
1870       throws IOException {
1871     checkInitialized();
1872     if (cpHost != null) {
1873       if (cpHost.preAddColumn(tableName, column)) {
1874         return;
1875       }
1876     }
1877     //TODO: we should process this (and some others) in an executor
1878     new TableAddFamilyHandler(tableName, column, this, this).prepare().process();
1879     if (cpHost != null) {
1880       cpHost.postAddColumn(tableName, column);
1881     }
1882   }
1883 
1884   @Override
1885   public AddColumnResponse addColumn(RpcController controller, AddColumnRequest req)
1886   throws ServiceException {
1887     try {
1888       addColumn(ProtobufUtil.toTableName(req.getTableName()),
1889         HColumnDescriptor.convert(req.getColumnFamilies()));
1890     } catch (IOException ioe) {
1891       throw new ServiceException(ioe);
1892     }
1893     return AddColumnResponse.newBuilder().build();
1894   }
1895 
1896   @Override
1897   public void modifyColumn(TableName tableName, HColumnDescriptor descriptor)
1898       throws IOException {
1899     checkInitialized();
1900     checkCompression(descriptor);
1901     if (cpHost != null) {
1902       if (cpHost.preModifyColumn(tableName, descriptor)) {
1903         return;
1904       }
1905     }
1906     LOG.info(getClientIdAuditPrefix() + " modify " + descriptor);
1907     new TableModifyFamilyHandler(tableName, descriptor, this, this)
1908       .prepare().process();
1909     if (cpHost != null) {
1910       cpHost.postModifyColumn(tableName, descriptor);
1911     }
1912   }
1913 
1914   @Override
1915   public ModifyColumnResponse modifyColumn(RpcController controller, ModifyColumnRequest req)
1916   throws ServiceException {
1917     try {
1918       modifyColumn(ProtobufUtil.toTableName(req.getTableName()),
1919         HColumnDescriptor.convert(req.getColumnFamilies()));
1920     } catch (IOException ioe) {
1921       throw new ServiceException(ioe);
1922     }
1923     return ModifyColumnResponse.newBuilder().build();
1924   }
1925 
1926   @Override
1927   public void deleteColumn(final TableName tableName, final byte[] columnName)
1928       throws IOException {
1929     checkInitialized();
1930     if (cpHost != null) {
1931       if (cpHost.preDeleteColumn(tableName, columnName)) {
1932         return;
1933       }
1934     }
1935     LOG.info(getClientIdAuditPrefix() + " delete " + Bytes.toString(columnName));
1936     new TableDeleteFamilyHandler(tableName, columnName, this, this).prepare().process();
1937     if (cpHost != null) {
1938       cpHost.postDeleteColumn(tableName, columnName);
1939     }
1940   }
1941 
1942   @Override
1943   public DeleteColumnResponse deleteColumn(RpcController controller, DeleteColumnRequest req)
1944   throws ServiceException {
1945     try {
1946       deleteColumn(ProtobufUtil.toTableName(req.getTableName()),
1947           req.getColumnName().toByteArray());
1948     } catch (IOException ioe) {
1949       throw new ServiceException(ioe);
1950     }
1951     return DeleteColumnResponse.newBuilder().build();
1952   }
1953 
1954   @Override
1955   public void enableTable(final TableName tableName) throws IOException {
1956     checkInitialized();
1957     if (cpHost != null) {
1958       cpHost.preEnableTable(tableName);
1959     }
1960     LOG.info(getClientIdAuditPrefix() + " enable " + tableName);
1961     this.executorService.submit(new EnableTableHandler(this, tableName,
1962       catalogTracker, assignmentManager, tableLockManager, false).prepare());
1963     if (cpHost != null) {
1964       cpHost.postEnableTable(tableName);
1965    }
1966   }
1967 
1968   @Override
1969   public EnableTableResponse enableTable(RpcController controller, EnableTableRequest request)
1970   throws ServiceException {
1971     try {
1972       enableTable(ProtobufUtil.toTableName(request.getTableName()));
1973     } catch (IOException ioe) {
1974       throw new ServiceException(ioe);
1975     }
1976     return EnableTableResponse.newBuilder().build();
1977   }
1978 
1979   @Override
1980   public void disableTable(final TableName tableName) throws IOException {
1981     checkInitialized();
1982     if (cpHost != null) {
1983       cpHost.preDisableTable(tableName);
1984     }
1985     LOG.info(getClientIdAuditPrefix() + " disable " + tableName);
1986     this.executorService.submit(new DisableTableHandler(this, tableName,
1987       catalogTracker, assignmentManager, tableLockManager, false).prepare());
1988     if (cpHost != null) {
1989       cpHost.postDisableTable(tableName);
1990     }
1991   }
1992 
1993   @Override
1994   public DisableTableResponse disableTable(RpcController controller, DisableTableRequest request)
1995   throws ServiceException {
1996     try {
1997       disableTable(ProtobufUtil.toTableName(request.getTableName()));
1998     } catch (IOException ioe) {
1999       throw new ServiceException(ioe);
2000     }
2001     return DisableTableResponse.newBuilder().build();
2002   }
2003 
2004   /**
2005    * Return the region and current deployment for the region containing
2006    * the given row. If the region cannot be found, returns null. If it
2007    * is found, but not currently deployed, the second element of the pair
2008    * may be null.
2009    */
2010   Pair<HRegionInfo, ServerName> getTableRegionForRow(
2011       final TableName tableName, final byte [] rowKey)
2012   throws IOException {
2013     final AtomicReference<Pair<HRegionInfo, ServerName>> result =
2014       new AtomicReference<Pair<HRegionInfo, ServerName>>(null);
2015 
2016     MetaScannerVisitor visitor =
2017       new MetaScannerVisitorBase() {
2018         @Override
2019         public boolean processRow(Result data) throws IOException {
2020           if (data == null || data.size() <= 0) {
2021             return true;
2022           }
2023           Pair<HRegionInfo, ServerName> pair = HRegionInfo.getHRegionInfoAndServerName(data);
2024           if (pair == null) {
2025             return false;
2026           }
2027           if (!pair.getFirst().getTable().equals(tableName)) {
2028             return false;
2029           }
2030           result.set(pair);
2031           return true;
2032         }
2033     };
2034 
2035     MetaScanner.metaScan(conf, visitor, tableName, rowKey, 1);
2036     return result.get();
2037   }
2038 
2039   @Override
2040   public void modifyTable(final TableName tableName, final HTableDescriptor descriptor)
2041       throws IOException {
2042     checkInitialized();
2043     checkCompression(descriptor);
2044     if (cpHost != null) {
2045       cpHost.preModifyTable(tableName, descriptor);
2046     }
2047     LOG.info(getClientIdAuditPrefix() + " modify " + tableName);
2048     new ModifyTableHandler(tableName, descriptor, this, this).prepare().process();
2049     if (cpHost != null) {
2050       cpHost.postModifyTable(tableName, descriptor);
2051     }
2052   }
2053 
2054   @Override
2055   public ModifyTableResponse modifyTable(RpcController controller, ModifyTableRequest req)
2056   throws ServiceException {
2057     try {
2058       modifyTable(ProtobufUtil.toTableName(req.getTableName()),
2059         HTableDescriptor.convert(req.getTableSchema()));
2060     } catch (IOException ioe) {
2061       throw new ServiceException(ioe);
2062     }
2063     return ModifyTableResponse.newBuilder().build();
2064   }
2065 
2066   @Override
2067   public void checkTableModifiable(final TableName tableName)
2068       throws IOException, TableNotFoundException, TableNotDisabledException {
2069     if (isCatalogTable(tableName)) {
2070       throw new IOException("Can't modify catalog tables");
2071     }
2072     if (!MetaReader.tableExists(getCatalogTracker(), tableName)) {
2073       throw new TableNotFoundException(tableName);
2074     }
2075     if (!getAssignmentManager().getZKTable().
2076         isDisabledTable(tableName)) {
2077       throw new TableNotDisabledException(tableName);
2078     }
2079   }
2080 
2081   @Override
2082   public GetClusterStatusResponse getClusterStatus(RpcController controller,
2083       GetClusterStatusRequest req)
2084   throws ServiceException {
2085     GetClusterStatusResponse.Builder response = GetClusterStatusResponse.newBuilder();
2086     response.setClusterStatus(getClusterStatus().convert());
2087     return response.build();
2088   }
2089 
2090   /**
2091    * @return cluster status
2092    */
2093   public ClusterStatus getClusterStatus() {
2094     // Build Set of backup masters from ZK nodes
2095     List<String> backupMasterStrings;
2096     try {
2097       backupMasterStrings = ZKUtil.listChildrenNoWatch(this.zooKeeper,
2098         this.zooKeeper.backupMasterAddressesZNode);
2099     } catch (KeeperException e) {
2100       LOG.warn(this.zooKeeper.prefix("Unable to list backup servers"), e);
2101       backupMasterStrings = new ArrayList<String>(0);
2102     }
2103     List<ServerName> backupMasters = new ArrayList<ServerName>(
2104                                           backupMasterStrings.size());
2105     for (String s: backupMasterStrings) {
2106       try {
2107         byte [] bytes =
2108             ZKUtil.getData(this.zooKeeper, ZKUtil.joinZNode(
2109                 this.zooKeeper.backupMasterAddressesZNode, s));
2110         if (bytes != null) {
2111           ServerName sn;
2112           try {
2113             sn = ServerName.parseFrom(bytes);
2114           } catch (DeserializationException e) {
2115             LOG.warn("Failed to parse, skipping registration of backup server", e);
2116             continue;
2117           }
2118           backupMasters.add(sn);
2119         }
2120       } catch (KeeperException e) {
2121         LOG.warn(this.zooKeeper.prefix("Unable to get information about " +
2122                  "backup servers"), e);
2123       }
2124     }
2125     Collections.sort(backupMasters, new Comparator<ServerName>() {
2126       @Override
2127       public int compare(ServerName s1, ServerName s2) {
2128         return s1.getServerName().compareTo(s2.getServerName());
2129       }});
2130 
2131     return new ClusterStatus(VersionInfo.getVersion(),
2132       this.fileSystemManager.getClusterId().toString(),
2133       this.serverManager.getOnlineServers(),
2134       this.serverManager.getDeadServers().copyServerNames(),
2135       this.serverName,
2136       backupMasters,
2137       this.assignmentManager.getRegionStates().getRegionsInTransition(),
2138       this.getCoprocessors(), this.loadBalancerTracker.isBalancerOn());
2139   }
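
  // Client-side sketch (illustrative): the snapshot assembled above is what
  // HBaseAdmin#getClusterStatus() hands back, e.g.
  //
  //   ClusterStatus status = admin.getClusterStatus();
  //   for (ServerName sn : status.getServers()) {   // live regionservers
  //     ServerLoad load = status.getLoad(sn);
  //     System.out.println(sn + " regions=" + load.getNumberOfRegions());
  //   }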
2140 
2141   public String getClusterId() {
2142     if (fileSystemManager == null) {
2143       return "";
2144     }
2145     ClusterId id = fileSystemManager.getClusterId();
2146     if (id == null) {
2147       return "";
2148     }
2149     return id.toString();
2150   }
2151 
2152   /**
2153    * The set of loaded coprocessors is stored in a static set. Since it's
2154    * statically allocated, it does not require that HMaster's cpHost be
2155    * initialized prior to accessing it.
2156    * @return a String representation of the set of names of the loaded
2157    * coprocessors.
2158    */
2159   public static String getLoadedCoprocessors() {
2160     return CoprocessorHost.getLoadedCoprocessors().toString();
2161   }
2162 
2163   /**
2164    * @return timestamp in millis when HMaster was started.
2165    */
2166   public long getMasterStartTime() {
2167     return masterStartTime;
2168   }
2169 
2170   /**
2171    * @return timestamp in millis when HMaster became the active master.
2172    */
2173   public long getMasterActiveTime() {
2174     return masterActiveTime;
2175   }
2176 
2177   public int getRegionServerInfoPort(final ServerName sn) {
2178     RegionServerInfo info = this.regionServerTracker.getRegionServerInfo(sn);
2179     if (info == null || info.getInfoPort() == 0) {
2180       return conf.getInt(HConstants.REGIONSERVER_INFO_PORT,
2181         HConstants.DEFAULT_REGIONSERVER_INFOPORT);
2182     }
2183     return info.getInfoPort();
2184   }
2185 
2186   /**
2187    * @return array of coprocessor SimpleNames.
2188    */
2189   public String[] getCoprocessors() {
2190     Set<String> masterCoprocessors =
2191         getCoprocessorHost().getCoprocessors();
2192     return masterCoprocessors.toArray(new String[masterCoprocessors.size()]);
2193   }
2194 
2195   @Override
2196   public void abort(final String msg, final Throwable t) {
2197     if (cpHost != null) {
2198       // HBASE-4014: dump a list of loaded coprocessors.
2199       LOG.fatal("Master server abort: loaded coprocessors are: " +
2200           getLoadedCoprocessors());
2201     }
2202 
2203     if (abortNow(msg, t)) {
2204       if (t != null) LOG.fatal(msg, t);
2205       else LOG.fatal(msg);
2206       this.abort = true;
2207       stop("Aborting");
2208     }
2209   }
2210 
2211   /**
2212    * We do the following in a different thread.  If it is not completed
2213    * in time, we will time it out and assume it is not easy to recover.
2214    *
2215    * 1. Create a new ZK session. (since our current one is expired)
2216    * 2. Try to become a primary master again
2217    * 3. Initialize all ZK based system trackers.
2218    * 4. Assign meta. (it is already assigned, but we need to update our
2219    * internal memory state to reflect it)
2220    * 5. Process any regions in transition (RIT) encountered during recovery.
2221    *
2222    * @return True if we could successfully recover from ZK session expiry.
2223    * @throws InterruptedException
2224    * @throws IOException
2225    * @throws KeeperException
2226    * @throws ExecutionException
2227    */
2228   private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
2229       IOException, KeeperException, ExecutionException {
2230 
2231     this.zooKeeper.unregisterAllListeners();
2232     // add back listeners which were registered before master initialization
2233     // because they won't be added back in below Master re-initialization code
2234     if (this.registeredZKListenersBeforeRecovery != null) {
2235       for (ZooKeeperListener curListener : this.registeredZKListenersBeforeRecovery) {
2236         this.zooKeeper.registerListener(curListener);
2237       }
2238     }
2239 
2240     this.zooKeeper.reconnectAfterExpiration();
2241 
2242     Callable<Boolean> callable = new Callable<Boolean> () {
2243       @Override
2244       public Boolean call() throws InterruptedException,
2245           IOException, KeeperException {
2246         MonitoredTask status =
2247           TaskMonitor.get().createStatus("Recovering expired ZK session");
2248         try {
2249           if (!becomeActiveMaster(status)) {
2250             return Boolean.FALSE;
2251           }
2252           serverShutdownHandlerEnabled = false;
2253           initialized = false;
2254           finishInitialization(status, true);
2255           return !stopped;
2256         } finally {
2257           status.cleanup();
2258         }
2259       }
2260     };
2261 
2262     long timeout =
2263       conf.getLong("hbase.master.zksession.recover.timeout", 300000);
2264     java.util.concurrent.ExecutorService executor =
2265       Executors.newSingleThreadExecutor();
2266     Future<Boolean> result = executor.submit(callable);
2267     executor.shutdown();
2268     if (executor.awaitTermination(timeout, TimeUnit.MILLISECONDS)
2269         && result.isDone()) {
2270       Boolean recovered = result.get();
2271       if (recovered != null) {
2272         return recovered.booleanValue();
2273       }
2274     }
2275     executor.shutdownNow();
2276     return false;
2277   }
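
  // Timing sketch for the recovery above: the whole callable (become active
  // master again, re-initialize, refresh meta state) must finish within
  // hbase.master.zksession.recover.timeout (default 300000 ms, i.e. 5
  // minutes), otherwise the executor is shut down hard and the master falls
  // through to a normal abort. A larger window can be configured, e.g.:
  //
  //   <property>
  //     <name>hbase.master.zksession.recover.timeout</name>
  //     <value>600000</value>
  //   </property>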
2278 
2279   /**
2280    * Check to see if the current trigger for abort is due to ZooKeeper session
2281    * expiry, and if so, whether we can recover from the expiry.
2282    *
2283    * @param msg Original abort message
2284    * @param t   The cause for current abort request
2285    * @return true if we should proceed with the abort operation, false otherwise.
2286    */
2287   private boolean abortNow(final String msg, final Throwable t) {
2288     if (!this.isActiveMaster || this.stopped) {
2289       return true;
2290     }
2291 
2292     boolean failFast = conf.getBoolean("fail.fast.expired.active.master", false);
2293     if (t instanceof KeeperException.SessionExpiredException
2294         && !failFast) {
2295       try {
2296         LOG.info("Primary Master trying to recover from ZooKeeper session " +
2297             "expiry.");
2298         return !tryRecoveringExpiredZKSession();
2299       } catch (Throwable newT) {
2300         LOG.error("Primary master encountered unexpected exception while " +
2301             "trying to recover from ZooKeeper session" +
2302             " expiry. Proceeding with server abort.", newT);
2303       }
2304     }
2305     return true;
2306   }
2307 
2308   @Override
2309   public ZooKeeperWatcher getZooKeeper() {
2310     return zooKeeper;
2311   }
2312 
2313   @Override
2314   public MasterCoprocessorHost getCoprocessorHost() {
2315     return cpHost;
2316   }
2317 
2318   @Override
2319   public ServerName getServerName() {
2320     return this.serverName;
2321   }
2322 
2323   @Override
2324   public CatalogTracker getCatalogTracker() {
2325     return catalogTracker;
2326   }
2327 
2328   @Override
2329   public AssignmentManager getAssignmentManager() {
2330     return this.assignmentManager;
2331   }
2332 
2333   @Override
2334   public TableLockManager getTableLockManager() {
2335     return this.tableLockManager;
2336   }
2337 
2338   public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
2339     return rsFatals;
2340   }
2341 
2342   public void shutdown() {
2343     if (spanReceiverHost != null) {
2344       spanReceiverHost.closeReceivers();
2345     }
2346     if (cpHost != null) {
2347       try {
2348         cpHost.preShutdown();
2349       } catch (IOException ioe) {
2350         LOG.error("Error calling master coprocessor preShutdown()", ioe);
2351       }
2352     }
2353     if (mxBean != null) {
2354       MBeanUtil.unregisterMBean(mxBean);
2355       mxBean = null;
2356     }
2357     if (this.assignmentManager != null) this.assignmentManager.shutdown();
2358     if (this.serverManager != null) this.serverManager.shutdownCluster();
2359     try {
2360       if (this.clusterStatusTracker != null){
2361         this.clusterStatusTracker.setClusterDown();
2362       }
2363     } catch (KeeperException e) {
2364       LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
2365     }
2366   }
2367 
2368   @Override
2369   public ShutdownResponse shutdown(RpcController controller, ShutdownRequest request)
2370   throws ServiceException {
2371     LOG.info(getClientIdAuditPrefix() + " shutdown");
2372     shutdown();
2373     return ShutdownResponse.newBuilder().build();
2374   }
2375 
2376   public void stopMaster() {
2377     if (cpHost != null) {
2378       try {
2379         cpHost.preStopMaster();
2380       } catch (IOException ioe) {
2381         LOG.error("Error calling master coprocessor preStopMaster()", ioe);
2382       }
2383     }
2384     stop("Stopped by " + Thread.currentThread().getName());
2385   }
2386 
2387   @Override
2388   public StopMasterResponse stopMaster(RpcController controller, StopMasterRequest request)
2389   throws ServiceException {
2390     LOG.info(getClientIdAuditPrefix() + " stop");
2391     stopMaster();
2392     return StopMasterResponse.newBuilder().build();
2393   }
2394 
2395   @Override
2396   public void stop(final String why) {
2397     LOG.info(why);
2398     this.stopped = true;
2399     // We wake up the stopSleeper to stop immediately
2400     stopSleeper.skipSleepCycle();
2401     // If we are a backup master, we need to interrupt wait
2402     if (this.activeMasterManager != null) {
2403       synchronized (this.activeMasterManager.clusterHasActiveMaster) {
2404         this.activeMasterManager.clusterHasActiveMaster.notifyAll();
2405       }
2406     }
2407     // If no region server is online then the master may get stuck waiting on hbase:meta to come online.
2408     // See HBASE-8422.
2409     if (this.catalogTracker != null && this.serverManager.getOnlineServers().isEmpty()) {
2410       this.catalogTracker.stop();
2411     }
2412   }
2413 
2414   @Override
2415   public boolean isStopped() {
2416     return this.stopped;
2417   }
2418 
2419   @Override
2420   public boolean isAborted() {
2421     return this.abort;
2422   }
2423 
2424   void checkInitialized() throws PleaseHoldException {
2425     if (!this.initialized) {
2426       throw new PleaseHoldException("Master is initializing");
2427     }
2428   }
2429 
2430   /**
2431    * Report whether this master is currently the active master or not.
2432    * If not active master, we are parked on ZK waiting to become active.
2433    *
2434    * This method is used for testing.
2435    *
2436    * @return true if active master, false if not.
2437    */
2438   public boolean isActiveMaster() {
2439     return isActiveMaster;
2440   }
2441 
2442   /**
2443    * Report whether this master has completed with its initialization and is
2444    * ready.  If ready, the master is also the active master.  A standby master
2445    * is never ready.
2446    *
2447    * This method is used for testing.
2448    *
2449    * @return true if master is ready to go, false if not.
2450    */
2451   @Override
2452   public boolean isInitialized() {
2453     return initialized;
2454   }
2455 
2456   /**
2457    * serverShutdownHandlerEnabled is set to false before assignMeta completes,
2458    * to prevent premature processing by the ServerShutdownHandler.
2459    * @return true if assignMeta has completed
2460    */
2461   @Override
2462   public boolean isServerShutdownHandlerEnabled() {
2463     return this.serverShutdownHandlerEnabled;
2464   }
2465 
2466   /**
2467    * Report whether this master has started initialization and is about to do meta region assignment.
2468    * @return true if the master is initializing and about to assign hbase:meta regions
2469    */
2470   public boolean isInitializationStartsMetaRegionAssignment() {
2471     return this.initializationBeforeMetaAssignment;
2472   }
2473 
2474   @Override
2475   public AssignRegionResponse assignRegion(RpcController controller, AssignRegionRequest req)
2476   throws ServiceException {
2477     try {
2478       final byte [] regionName = req.getRegion().getValue().toByteArray();
2479       RegionSpecifierType type = req.getRegion().getType();
2480       AssignRegionResponse arr = AssignRegionResponse.newBuilder().build();
2481 
2482       checkInitialized();
2483       if (type != RegionSpecifierType.REGION_NAME) {
2484         LOG.warn("assignRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2485           + " actual: " + type);
2486       }
2487       HRegionInfo regionInfo = assignmentManager.getRegionStates().getRegionInfo(regionName);
2488       if (regionInfo == null) throw new UnknownRegionException(Bytes.toString(regionName));
2489       if (cpHost != null) {
2490         if (cpHost.preAssign(regionInfo)) {
2491           return arr;
2492         }
2493       }
2494       LOG.info(getClientIdAuditPrefix() + " assign " + regionInfo.getRegionNameAsString());
2495       assignmentManager.assign(regionInfo, true, true);
2496       if (cpHost != null) {
2497         cpHost.postAssign(regionInfo);
2498       }
2499 
2500       return arr;
2501     } catch (IOException ioe) {
2502       throw new ServiceException(ioe);
2503     }
2504   }
2505 
2506   public void assignRegion(HRegionInfo hri) {
2507     assignmentManager.assign(hri, true);
2508   }
2509 
2510   @Override
2511   public UnassignRegionResponse unassignRegion(RpcController controller, UnassignRegionRequest req)
2512   throws ServiceException {
2513     try {
2514       final byte [] regionName = req.getRegion().getValue().toByteArray();
2515       RegionSpecifierType type = req.getRegion().getType();
2516       final boolean force = req.getForce();
2517       UnassignRegionResponse urr = UnassignRegionResponse.newBuilder().build();
2518 
2519       checkInitialized();
2520       if (type != RegionSpecifierType.REGION_NAME) {
2521         LOG.warn("unassignRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2522           + " actual: " + type);
2523       }
2524       Pair<HRegionInfo, ServerName> pair =
2525         MetaReader.getRegion(this.catalogTracker, regionName);
2526       if (pair == null) throw new UnknownRegionException(Bytes.toString(regionName));
2527       HRegionInfo hri = pair.getFirst();
2528       if (cpHost != null) {
2529         if (cpHost.preUnassign(hri, force)) {
2530           return urr;
2531         }
2532       }
2533       LOG.debug(getClientIdAuditPrefix() + " unassign " + hri.getRegionNameAsString()
2534           + " in current location if it is online and reassign.force=" + force);
2535       this.assignmentManager.unassign(hri, force);
2536       if (this.assignmentManager.getRegionStates().isRegionOffline(hri)) {
2537         LOG.debug("Region " + hri.getRegionNameAsString()
2538             + " is not online on any region server, reassigning it.");
2539         assignRegion(hri);
2540       }
2541       if (cpHost != null) {
2542         cpHost.postUnassign(hri, force);
2543       }
2544 
2545       return urr;
2546     } catch (IOException ioe) {
2547       throw new ServiceException(ioe);
2548     }
2549   }
2550 
2551   /**
2552    * Get list of TableDescriptors for requested tables.
2553    * @param controller Unused (set to null).
2554    * @param req GetTableDescriptorsRequest that contains:
2555    * - tableNames: requested tables; if empty, all userspace tables are returned
2556    * @return GetTableDescriptorsResponse
2557    * @throws ServiceException
2558    */
2559   @Override
2560   public GetTableDescriptorsResponse getTableDescriptors(
2561       RpcController controller, GetTableDescriptorsRequest req) throws ServiceException {
2562     List<HTableDescriptor> descriptors = new ArrayList<HTableDescriptor>();
2563     List<TableName> tableNameList = new ArrayList<TableName>();
2564     for (HBaseProtos.TableName tableNamePB : req.getTableNamesList()) {
2565       tableNameList.add(ProtobufUtil.toTableName(tableNamePB));
2566     }
2567     boolean bypass = false;
2568     if (this.cpHost != null) {
2569       try {
2570         bypass = this.cpHost.preGetTableDescriptors(tableNameList, descriptors);
2571       } catch (IOException ioe) {
2572         throw new ServiceException(ioe);
2573       }
2574     }
2575 
2576     if (!bypass) {
2577       if (req.getTableNamesCount() == 0) {
2578         // request for all TableDescriptors
2579         Map<String, HTableDescriptor> descriptorMap = null;
2580         try {
2581           descriptorMap = this.tableDescriptors.getAll();
2582         } catch (IOException e) {
2583           LOG.warn("Failed getting all descriptors", e);
2584         }
2585         if (descriptorMap != null) {
2586           for (HTableDescriptor desc : descriptorMap.values()) {
2587             if (!desc.getTableName().isSystemTable()) {
2588               descriptors.add(desc);
2589             }
2590           }
2591         }
2592       } else {
2593         for (TableName s: tableNameList) {
2594           try {
2595             HTableDescriptor desc = this.tableDescriptors.get(s);
2596             if (desc != null) {
2597               descriptors.add(desc);
2598             }
2599           } catch (IOException e) {
2600             LOG.warn("Failed getting descriptor for " + s, e);
2601           }
2602         }
2603       }
2604 
2605       if (this.cpHost != null) {
2606         try {
2607           this.cpHost.postGetTableDescriptors(descriptors);
2608         } catch (IOException ioe) {
2609           throw new ServiceException(ioe);
2610         }
2611       }
2612     }
2613 
2614     GetTableDescriptorsResponse.Builder builder = GetTableDescriptorsResponse.newBuilder();
2615     for (HTableDescriptor htd: descriptors) {
2616       builder.addTableSchema(htd.convert());
2617     }
2618     return builder.build();
2619   }
2620 
2621   /**
2622    * Get the list of userspace table names.
2623    * @param controller Unused (set to null).
2624    * @param req GetTableNamesRequest
2625    * @return GetTableNamesResponse
2626    * @throws ServiceException
2627    */
2628   @Override
2629   public GetTableNamesResponse getTableNames(
2630         RpcController controller, GetTableNamesRequest req) throws ServiceException {
2631     try {
2632       Collection<HTableDescriptor> descriptors = this.tableDescriptors.getAll().values();
2633       GetTableNamesResponse.Builder builder = GetTableNamesResponse.newBuilder();
2634       for (HTableDescriptor descriptor: descriptors) {
2635         if (descriptor.getTableName().isSystemTable()) {
2636           continue;
2637         }
2638         builder.addTableNames(ProtobufUtil.toProtoTableName(descriptor.getTableName()));
2639       }
2640       return builder.build();
2641     } catch (IOException e) {
2642       throw new ServiceException(e);
2643     }
2644   }
2645 
2646   /**
2647    * Compute the average load across all region servers.
2648    * Currently, this uses a very naive computation: it just counts the number
2649    * of regions each server hosts, ignoring stats about the number of requests.
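        * For example, with 300 regions hosted across 3 live region servers the
        * average load is 300 / 3 = 100.0, however the requests are distributed.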
2650    * @return the average load
2651    */
2652   public double getAverageLoad() {
2653     if (this.assignmentManager == null) {
2654       return 0;
2655     }
2656 
2657     RegionStates regionStates = this.assignmentManager.getRegionStates();
2658     if (regionStates == null) {
2659       return 0;
2660     }
2661     return regionStates.getAverageLoad();
2662   }
2663 
2664   /**
2665    * Offline the specified region from the master's in-memory state. Unlike
2666    * unassign, it will not attempt to reassign the region.
2667    *
2668    * This is a special method that should be used by experts or hbck.
2669    *
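        * <p>A hedged client-side sketch (assumed HBaseAdmin usage, not part of this
        * file); regionName is the full binary region name:
        * <pre>
        *   HBaseAdmin admin = new HBaseAdmin(conf);
        *   admin.offline(regionName);  // master forgets the region; nothing is reassigned
        * </pre>
        *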
2670    */
2671   @Override
2672   public OfflineRegionResponse offlineRegion(RpcController controller, OfflineRegionRequest request)
2673   throws ServiceException {
2674     final byte [] regionName = request.getRegion().getValue().toByteArray();
2675     RegionSpecifierType type = request.getRegion().getType();
2676     if (type != RegionSpecifierType.REGION_NAME) {
2677       LOG.warn("moveRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2678         + " actual: " + type);
2679     }
2680 
2681     try {
2682       Pair<HRegionInfo, ServerName> pair =
2683         MetaReader.getRegion(this.catalogTracker, regionName);
2684       if (pair == null) throw new UnknownRegionException(Bytes.toStringBinary(regionName));
2685       HRegionInfo hri = pair.getFirst();
2686       if (cpHost != null) {
2687         cpHost.preRegionOffline(hri);
2688       }
2689       LOG.info(getClientIdAuditPrefix() + " offline " + hri.getRegionNameAsString());
2690       this.assignmentManager.regionOffline(hri);
2691       if (cpHost != null) {
2692         cpHost.postRegionOffline(hri);
2693       }
2694     } catch (IOException ioe) {
2695       throw new ServiceException(ioe);
2696     }
2697     return OfflineRegionResponse.newBuilder().build();
2698   }
2699 
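       // Illustrative only, not part of the original source: registerService is
       // typically fed by loaded master coprocessor endpoints. A hedged sketch of
       // such an endpoint (MyProtos.MyService is a hypothetical generated stub):
       //
       //   public class MyMasterEndpoint extends MyProtos.MyService
       //       implements Coprocessor, CoprocessorService {
       //     @Override
       //     public Service getService() { return this; }  // handed to registerService
       //   }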
2700   @Override
2701   public boolean registerService(Service instance) {
2702     /*
2703      * No stacking of instances is allowed for a single service name
2704      */
2705     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
2706     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
2707       LOG.error("Coprocessor service " + serviceDesc.getFullName()
2708           + " already registered, rejecting request from " + instance
2709       );
2710       return false;
2711     }
2712 
2713     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
2714     if (LOG.isDebugEnabled()) {
2715       LOG.debug("Registered master coprocessor service: service=" + serviceDesc.getFullName());
2716     }
2717     return true;
2718   }
2719 
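       // Illustrative only: a hedged sketch of invoking a registered master service
       // from a client (MyProtos.MyService is hypothetical; the coprocessorService()
       // channel is the assumed transport):
       //
       //   CoprocessorRpcChannel channel = admin.coprocessorService();
       //   MyProtos.MyService.BlockingInterface stub =
       //       MyProtos.MyService.newBlockingStub(channel);
       //   MyProtos.MyResponse resp =
       //       stub.myMethod(null, MyProtos.MyRequest.getDefaultInstance());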
2720   @Override
2721   public ClientProtos.CoprocessorServiceResponse execMasterService(final RpcController controller,
2722       final ClientProtos.CoprocessorServiceRequest request) throws ServiceException {
2723     try {
2724       ServerRpcController execController = new ServerRpcController();
2725 
2726       ClientProtos.CoprocessorServiceCall call = request.getCall();
2727       String serviceName = call.getServiceName();
2728       String methodName = call.getMethodName();
2729       if (!coprocessorServiceHandlers.containsKey(serviceName)) {
2730         throw new UnknownProtocolException(null,
2731             "No registered master coprocessor service found for name "+serviceName);
2732       }
2733 
2734       Service service = coprocessorServiceHandlers.get(serviceName);
2735       Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
2736       Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
2737       if (methodDesc == null) {
2738         throw new UnknownProtocolException(service.getClass(),
2739             "Unknown method "+methodName+" called on master service "+serviceName);
2740       }
2741 
2742       //invoke the method
2743       Message execRequest = service.getRequestPrototype(methodDesc).newBuilderForType()
2744           .mergeFrom(call.getRequest()).build();
2745       final Message.Builder responseBuilder =
2746           service.getResponsePrototype(methodDesc).newBuilderForType();
2747       service.callMethod(methodDesc, execController, execRequest, new RpcCallback<Message>() {
2748         @Override
2749         public void run(Message message) {
2750           if (message != null) {
2751             responseBuilder.mergeFrom(message);
2752           }
2753         }
2754       });
2755       Message execResult = responseBuilder.build();
2756 
2757       if (execController.getFailedOn() != null) {
2758         throw execController.getFailedOn();
2759       }
2760       ClientProtos.CoprocessorServiceResponse.Builder builder =
2761           ClientProtos.CoprocessorServiceResponse.newBuilder();
2762       builder.setRegion(RequestConverter.buildRegionSpecifier(
2763           RegionSpecifierType.REGION_NAME, HConstants.EMPTY_BYTE_ARRAY));
2764       builder.setValue(
2765           builder.getValueBuilder().setName(execResult.getClass().getName())
2766               .setValue(execResult.toByteString()));
2767       return builder.build();
2768     } catch (IOException ie) {
2769       throw new ServiceException(ie);
2770     }
2771   }
2772 
2773   /**
2774    * Utility for constructing an instance of the passed HMaster class.
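        * <p>For example (sketch, assuming a default configuration):
        * <pre>
        *   Configuration conf = HBaseConfiguration.create();
        *   HMaster master = HMaster.constructMaster(HMaster.class, conf);
        * </pre>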
2775    * @param masterClass concrete HMaster subclass to instantiate
2776    * @param conf configuration to pass to the master's constructor
2777    * @return HMaster instance.
2778    */
2779   public static HMaster constructMaster(Class<? extends HMaster> masterClass,
2780       final Configuration conf)  {
2781     try {
2782       Constructor<? extends HMaster> c =
2783         masterClass.getConstructor(Configuration.class);
2784       return c.newInstance(conf);
2785     } catch (InvocationTargetException ite) {
2786       Throwable target = ite.getTargetException() != null ?
2787         ite.getTargetException() : ite;
2788       if (target.getCause() != null) target = target.getCause();
2789       throw new RuntimeException("Failed construction of Master: " +
2790         masterClass.toString(), target);
2791     } catch (Exception e) {
2792       throw new RuntimeException("Failed construction of Master: " +
2793         masterClass.toString() + ((e.getCause() != null) ?
2794           " " + e.getCause().getMessage() : ""), e);
2795     }
2796   }
2797 
2798   /**
2799    * @see org.apache.hadoop.hbase.master.HMasterCommandLine
2800    */
2801   public static void main(String [] args) {
2802     VersionInfo.logVersion();
2803     new HMasterCommandLine(HMaster.class).doMain(args);
2804   }
2805 
2806   public HFileCleaner getHFileCleaner() {
2807     return this.hfileCleaner;
2808   }
2809 
2810   /**
2811    * Exposed for TESTING!
2812    * @return the underlying snapshot manager
2813    */
2814   public SnapshotManager getSnapshotManagerForTesting() {
2815     return this.snapshotManager;
2816   }
2817 
2818   /**
2819    * Triggers an asynchronous attempt to take a snapshot.
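        * <p>A hedged client-side sketch (assumed HBaseAdmin usage; the admin call
        * itself polls until the snapshot completes or the timeout below expires):
        * <pre>
        *   HBaseAdmin admin = new HBaseAdmin(conf);
        *   admin.snapshot("mysnapshot", "mytable");
        * </pre>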
2820    * {@inheritDoc}
2821    */
2822   @Override
2823   public SnapshotResponse snapshot(RpcController controller, SnapshotRequest request)
2824       throws ServiceException {
2825     try {
2826       this.snapshotManager.checkSnapshotSupport();
2827     } catch (UnsupportedOperationException e) {
2828       throw new ServiceException(e);
2829     }
2830 
2831     LOG.info(getClientIdAuditPrefix() + " snapshot request for: " +
2832         ClientSnapshotDescriptionUtils.toString(request.getSnapshot()));
2833     // get the snapshot information
2834     SnapshotDescription snapshot = SnapshotDescriptionUtils.validate(request.getSnapshot(),
2835       this.conf);
2836     try {
2837       snapshotManager.takeSnapshot(snapshot);
2838     } catch (IOException e) {
2839       throw new ServiceException(e);
2840     }
2841 
2842     // send back the max amount of time the client should wait for the snapshot to complete
2843     long waitTime = SnapshotDescriptionUtils.getMaxMasterTimeout(conf, snapshot.getType(),
2844       SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME);
2845     return SnapshotResponse.newBuilder().setExpectedTimeout(waitTime).build();
2846   }
2847 
2848   /**
2849    * List the currently available/stored snapshots. Any in-progress snapshots are ignored.
2850    */
2851   @Override
2852   public GetCompletedSnapshotsResponse getCompletedSnapshots(RpcController controller,
2853       GetCompletedSnapshotsRequest request) throws ServiceException {
2854     try {
2855       GetCompletedSnapshotsResponse.Builder builder = GetCompletedSnapshotsResponse.newBuilder();
2856       List<SnapshotDescription> snapshots = snapshotManager.getCompletedSnapshots();
2857 
2858       // convert to protobuf
2859       for (SnapshotDescription snapshot : snapshots) {
2860         builder.addSnapshots(snapshot);
2861       }
2862       return builder.build();
2863     } catch (IOException e) {
2864       throw new ServiceException(e);
2865     }
2866   }
2867 
2868   /**
2869    * Execute Delete Snapshot operation.
2870    * @return DeleteSnapshotResponse (a protobuf wrapped void) if the snapshot existed and was
2871    *    deleted properly.
2872    * @throws ServiceException wrapping SnapshotDoesNotExistException if the specified snapshot
2873    *    does not exist.
2874    */
2875   @Override
2876   public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2877       DeleteSnapshotRequest request) throws ServiceException {
2878     try {
2879       this.snapshotManager.checkSnapshotSupport();
2880     } catch (UnsupportedOperationException e) {
2881       throw new ServiceException(e);
2882     }
2883 
2884     try {
2885       LOG.info(getClientIdAuditPrefix() + " delete " + request.getSnapshot());
2886       snapshotManager.deleteSnapshot(request.getSnapshot());
2887       return DeleteSnapshotResponse.newBuilder().build();
2888     } catch (IOException e) {
2889       throw new ServiceException(e);
2890     }
2891   }
2892 
2893   /**
2894    * Checks if the specified snapshot is done.
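        * <p>A hedged polling sketch from the client side (assumed API; the
        * SnapshotDescription protobuf carries just the snapshot name):
        * <pre>
        *   SnapshotDescription snap =
        *       SnapshotDescription.newBuilder().setName("mysnapshot").build();
        *   while (!admin.isSnapshotFinished(snap)) {
        *     Thread.sleep(100);
        *   }
        * </pre>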
2895    * @return true if the snapshot is in the file system and ready to use;
2896    *   false if the snapshot is still in the process of completing
2897    * @throws ServiceException wrapping UnknownSnapshotException if the snapshot is invalid, or
2898    *  a wrapped HBaseSnapshotException with the progress failure reason.
2899    */
2900   @Override
2901   public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2902       IsSnapshotDoneRequest request) throws ServiceException {
2903     LOG.debug("Checking to see if snapshot from request:" +
2904         ClientSnapshotDescriptionUtils.toString(request.getSnapshot()) + " is done");
2905     try {
2906       IsSnapshotDoneResponse.Builder builder = IsSnapshotDoneResponse.newBuilder();
2907       boolean done = snapshotManager.isSnapshotDone(request.getSnapshot());
2908       builder.setDone(done);
2909       return builder.build();
2910     } catch (IOException e) {
2911       throw new ServiceException(e);
2912     }
2913   }
2914 
2915   /**
2916    * Execute Restore/Clone snapshot operation.
2917    *
2918    * <p>If the specified table exists, a "Restore" is executed, replacing the table
2919    * schema and directory data with the content of the snapshot.
2920    * The table must be disabled, or an UnsupportedOperationException will be thrown.
2921    *
2922    * <p>If the table doesn't exist, a "Clone" is executed: a new table is created
2923    * using the schema at the time of the snapshot, and the content of the snapshot.
2924    *
2925    * <p>The restore/clone operation does not require copying HFiles. Since HFiles
2926    * are immutable, the table can point to and use the same files as the original one.
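        *
        * <p>A hedged end-to-end sketch of both paths (assumed HBaseAdmin usage):
        * <pre>
        *   admin.disableTable("mytable");        // restore path: table must be disabled
        *   admin.restoreSnapshot("mysnapshot");
        *   admin.enableTable("mytable");
        *   admin.cloneSnapshot("mysnapshot", "mytable_copy");  // clone path: new table
        * </pre>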
2927    */
2928   @Override
2929   public RestoreSnapshotResponse restoreSnapshot(RpcController controller,
2930       RestoreSnapshotRequest request) throws ServiceException {
2931     try {
2932       this.snapshotManager.checkSnapshotSupport();
2933     } catch (UnsupportedOperationException e) {
2934       throw new ServiceException(e);
2935     }
2936 
2937     // ensure namespace exists
2938     try {
2939       TableName dstTable = TableName.valueOf(request.getSnapshot().getTable());
2940       getNamespaceDescriptor(dstTable.getNamespaceAsString());
2941     } catch (IOException ioe) {
2942       throw new ServiceException(ioe);
2943     }
2944 
2945     try {
2946       SnapshotDescription reqSnapshot = request.getSnapshot();
2947       snapshotManager.restoreSnapshot(reqSnapshot);
2948       return RestoreSnapshotResponse.newBuilder().build();
2949     } catch (IOException e) {
2950       throw new ServiceException(e);
2951     }
2952   }
2953 
2954   /**
2955    * Returns the status of the requested snapshot restore/clone operation.
2956    * This method is not exposed to the user; it is used internally by HBaseAdmin
2957    * to verify if the restore is completed.
2958    *
2959    * No exceptions are thrown if the restore is not running; the result will be "done".
2960    *
2961    * @return <tt>true</tt> if the restore/clone operation is completed.
2962    * @throws ServiceException if the operation failed.
2963    */
2964   @Override
2965   public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(RpcController controller,
2966       IsRestoreSnapshotDoneRequest request) throws ServiceException {
2967     try {
2968       SnapshotDescription snapshot = request.getSnapshot();
2969       IsRestoreSnapshotDoneResponse.Builder builder = IsRestoreSnapshotDoneResponse.newBuilder();
2970       boolean done = snapshotManager.isRestoreDone(snapshot);
2971       builder.setDone(done);
2972       return builder.build();
2973     } catch (IOException e) {
2974       throw new ServiceException(e);
2975     }
2976   }
2977 
2978   /**
2979    * Triggers an asynchronous attempt to run a distributed procedure.
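        * <p>A minimal request sketch, grounded in the handling below ("my-proc" is a
        * hypothetical signature that must match a registered MasterProcedureManager):
        * <pre>
        *   ProcedureDescription desc =
        *       ProcedureDescription.newBuilder().setSignature("my-proc").build();
        *   ExecProcedureRequest req =
        *       ExecProcedureRequest.newBuilder().setProcedure(desc).build();
        *   ExecProcedureResponse resp = master.execProcedure(null, req);  // controller unused
        * </pre>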
2980    * {@inheritDoc}
2981    */
2982   @Override
2983   public ExecProcedureResponse execProcedure(RpcController controller,
2984       ExecProcedureRequest request) throws ServiceException {
2985     ProcedureDescription desc = request.getProcedure();
2986     MasterProcedureManager mpm = this.mpmHost.getProcedureManager(desc
2987         .getSignature());
2988     if (mpm == null) {
2989       throw new ServiceException("The procedure is not registered: "
2990           + desc.getSignature());
2991     }
2992 
2993     LOG.info(getClientIdAuditPrefix() + " procedure request for: "
2994         + desc.getSignature());
2995 
2996     try {
2997       mpm.execProcedure(desc);
2998     } catch (IOException e) {
2999       throw new ServiceException(e);
3000     }
3001 
3002     // send back the max amount of time the client should wait for the procedure
3003     // to complete
3004     long waitTime = SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME;
3005     return ExecProcedureResponse.newBuilder().setExpectedTimeout(waitTime)
3006         .build();
3007   }
3008 
3009   /**
3010    * Checks if the specified procedure is done.
3011    * @return true if the procedure is done;
3012    *   false if the procedure is still in the process of completing
3013    * @throws ServiceException if the procedure is invalid, or
3014    *  wrapping the failure reason of a failed procedure.
3015    */
3016   @Override
3017   public IsProcedureDoneResponse isProcedureDone(RpcController controller,
3018       IsProcedureDoneRequest request) throws ServiceException {
3019     ProcedureDescription desc = request.getProcedure();
3020     MasterProcedureManager mpm = this.mpmHost.getProcedureManager(desc
3021         .getSignature());
3022     if (mpm == null) {
3023       throw new ServiceException("The procedure is not registered: "
3024           + desc.getSignature());
3025     }
3026     LOG.debug("Checking to see if procedure from request:"
3027         + desc.getSignature() + " is done");
3028 
3029     try {
3030       IsProcedureDoneResponse.Builder builder = IsProcedureDoneResponse
3031           .newBuilder();
3032       boolean done = mpm.isProcedureDone(desc);
3033       builder.setDone(done);
3034       return builder.build();
3035     } catch (IOException e) {
3036       throw new ServiceException(e);
3037     }
3038   }
3039 
3040   @Override
3041   public ModifyNamespaceResponse modifyNamespace(RpcController controller,
3042       ModifyNamespaceRequest request) throws ServiceException {
3043     try {
3044       modifyNamespace(ProtobufUtil.toNamespaceDescriptor(request.getNamespaceDescriptor()));
3045       return ModifyNamespaceResponse.getDefaultInstance();
3046     } catch (IOException e) {
3047       throw new ServiceException(e);
3048     }
3049   }
3050 
3051   @Override
3052   public CreateNamespaceResponse createNamespace(RpcController controller,
3053      CreateNamespaceRequest request) throws ServiceException {
3054     try {
3055       createNamespace(ProtobufUtil.toNamespaceDescriptor(request.getNamespaceDescriptor()));
3056       return CreateNamespaceResponse.getDefaultInstance();
3057     } catch (IOException e) {
3058       throw new ServiceException(e);
3059     }
3060   }
3061 
3062   @Override
3063   public DeleteNamespaceResponse deleteNamespace(RpcController controller,
3064       DeleteNamespaceRequest request) throws ServiceException {
3065     try {
3066       deleteNamespace(request.getNamespaceName());
3067       return DeleteNamespaceResponse.getDefaultInstance();
3068     } catch (IOException e) {
3069       throw new ServiceException(e);
3070     }
3071   }
3072 
3073   @Override
3074   public GetNamespaceDescriptorResponse getNamespaceDescriptor(
3075       RpcController controller, GetNamespaceDescriptorRequest request)
3076       throws ServiceException {
3077     try {
3078       return GetNamespaceDescriptorResponse.newBuilder()
3079           .setNamespaceDescriptor(
3080               ProtobufUtil.toProtoNamespaceDescriptor(getNamespaceDescriptor(request.getNamespaceName())))
3081           .build();
3082     } catch (IOException e) {
3083       throw new ServiceException(e);
3084     }
3085   }
3086 
3087   @Override
3088   public ListNamespaceDescriptorsResponse listNamespaceDescriptors(
3089       RpcController controller, ListNamespaceDescriptorsRequest request)
3090       throws ServiceException {
3091     try {
3092       ListNamespaceDescriptorsResponse.Builder response =
3093           ListNamespaceDescriptorsResponse.newBuilder();
3094       for (NamespaceDescriptor ns : listNamespaceDescriptors()) {
3095         response.addNamespaceDescriptor(ProtobufUtil.toProtoNamespaceDescriptor(ns));
3096       }
3097       return response.build();
3098     } catch (IOException e) {
3099       throw new ServiceException(e);
3100     }
3101   }
3102 
3103   @Override
3104   public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(
3105       RpcController controller, ListTableDescriptorsByNamespaceRequest request)
3106       throws ServiceException {
3107     try {
3108       ListTableDescriptorsByNamespaceResponse.Builder b =
3109           ListTableDescriptorsByNamespaceResponse.newBuilder();
3110       for (HTableDescriptor htd : listTableDescriptorsByNamespace(request.getNamespaceName())) {
3111         b.addTableSchema(htd.convert());
3112       }
3113       return b.build();
3114     } catch (IOException e) {
3115       throw new ServiceException(e);
3116     }
3117   }
3118 
3119   @Override
3120   public ListTableNamesByNamespaceResponse listTableNamesByNamespace(
3121       RpcController controller, ListTableNamesByNamespaceRequest request)
3122       throws ServiceException {
3123     try {
3124       ListTableNamesByNamespaceResponse.Builder b =
3125           ListTableNamesByNamespaceResponse.newBuilder();
3126       for (TableName tableName: listTableNamesByNamespace(request.getNamespaceName())) {
3127         b.addTableName(ProtobufUtil.toProtoTableName(tableName));
3128       }
3129       return b.build();
3130     } catch (IOException e) {
3131       throw new ServiceException(e);
3132     }
3133   }
3134 
3135   private boolean isHealthCheckerConfigured() {
3136     String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
3137     return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
3138   }
3139 
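       // Illustrative only: a hedged client-side sketch of the namespace lifecycle
       // that funnels into the methods below (assumed HBaseAdmin/NamespaceDescriptor API):
       //
       //   admin.createNamespace(NamespaceDescriptor.create("myns").build());
       //   admin.modifyNamespace(NamespaceDescriptor.create("myns")
       //       .addConfiguration("k", "v").build());
       //   admin.deleteNamespace("myns");  // fails unless the namespace is empty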
3140   @Override
3141   public void createNamespace(NamespaceDescriptor descriptor) throws IOException {
3142     TableName.isLegalNamespaceName(Bytes.toBytes(descriptor.getName()));
3143     if (cpHost != null) {
3144       if (cpHost.preCreateNamespace(descriptor)) {
3145         return;
3146       }
3147     }
3148     LOG.info(getClientIdAuditPrefix() + " creating " + descriptor);
3149     tableNamespaceManager.create(descriptor);
3150     if (cpHost != null) {
3151       cpHost.postCreateNamespace(descriptor);
3152     }
3153   }
3154 
3155   @Override
3156   public void modifyNamespace(NamespaceDescriptor descriptor) throws IOException {
3157     TableName.isLegalNamespaceName(Bytes.toBytes(descriptor.getName()));
3158     if (cpHost != null) {
3159       if (cpHost.preModifyNamespace(descriptor)) {
3160         return;
3161       }
3162     }
3163     LOG.info(getClientIdAuditPrefix() + " modify " + descriptor);
3164     tableNamespaceManager.update(descriptor);
3165     if (cpHost != null) {
3166       cpHost.postModifyNamespace(descriptor);
3167     }
3168   }
3169 
3170   @Override
3171   public void deleteNamespace(String name) throws IOException {
3172     if (cpHost != null) {
3173       if (cpHost.preDeleteNamespace(name)) {
3174         return;
3175       }
3176     }
3177     LOG.info(getClientIdAuditPrefix() + " delete " + name);
3178     tableNamespaceManager.remove(name);
3179     if (cpHost != null) {
3180       cpHost.postDeleteNamespace(name);
3181     }
3182   }
3183 
3184   @Override
3185   public NamespaceDescriptor getNamespaceDescriptor(String name) throws IOException {
3186     boolean ready = tableNamespaceManager != null &&
3187         tableNamespaceManager.isTableAvailableAndInitialized();
3188     if (!ready) {
3189       throw new IOException("Table Namespace Manager not ready yet, try again later");
3190     }
3191     NamespaceDescriptor nsd = tableNamespaceManager.get(name);
3192     if (nsd == null) {
3193       throw new NamespaceNotFoundException(name);
3194     }
3195     return nsd;
3196   }
3197 
3198   @Override
3199   public List<NamespaceDescriptor> listNamespaceDescriptors() throws IOException {
3200     return Lists.newArrayList(tableNamespaceManager.list());
3201   }
3202 
3203   @Override
3204   public List<HTableDescriptor> listTableDescriptorsByNamespace(String name) throws IOException {
3205     getNamespaceDescriptor(name); // check that namespace exists
3206     return Lists.newArrayList(tableDescriptors.getByNamespace(name).values());
3207   }
3208 
3209   @Override
3210   public List<TableName> listTableNamesByNamespace(String name) throws IOException {
3211     List<TableName> tableNames = Lists.newArrayList();
3212     getNamespaceDescriptor(name); // check that namespace exists
3213     for (HTableDescriptor descriptor: tableDescriptors.getByNamespace(name).values()) {
3214       tableNames.add(descriptor.getTableName());
3215     }
3216     return tableNames;
3217   }
3218 
3219 }