View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.lang.reflect.Constructor;
23  import java.lang.reflect.InvocationTargetException;
24  import java.net.InetAddress;
25  import java.net.InetSocketAddress;
26  import java.net.UnknownHostException;
27  import java.util.ArrayList;
28  import java.util.Collection;
29  import java.util.Collections;
30  import java.util.Comparator;
31  import java.util.HashSet;
32  import java.util.List;
33  import java.util.Map;
34  import java.util.Set;
35  import java.util.concurrent.Callable;
36  import java.util.concurrent.ExecutionException;
37  import java.util.concurrent.Executors;
38  import java.util.concurrent.Future;
39  import java.util.concurrent.TimeUnit;
40  import java.util.concurrent.atomic.AtomicReference;
41  
42  import javax.management.ObjectName;
43  
44  import org.apache.commons.logging.Log;
45  import org.apache.commons.logging.LogFactory;
46  import org.apache.hadoop.classification.InterfaceAudience;
47  import org.apache.hadoop.conf.Configuration;
48  import org.apache.hadoop.fs.Path;
49  import org.apache.hadoop.hbase.Abortable;
50  import org.apache.hadoop.hbase.Chore;
51  import org.apache.hadoop.hbase.ClusterId;
52  import org.apache.hadoop.hbase.ClusterStatus;
53  import org.apache.hadoop.hbase.HBaseIOException;
54  import org.apache.hadoop.hbase.HColumnDescriptor;
55  import org.apache.hadoop.hbase.HConstants;
56  import org.apache.hadoop.hbase.HRegionInfo;
57  import org.apache.hadoop.hbase.HTableDescriptor;
58  import org.apache.hadoop.hbase.HealthCheckChore;
59  import org.apache.hadoop.hbase.MasterNotRunningException;
60  import org.apache.hadoop.hbase.NamespaceDescriptor;
61  import org.apache.hadoop.hbase.NamespaceNotFoundException;
62  import org.apache.hadoop.hbase.PleaseHoldException;
63  import org.apache.hadoop.hbase.Server;
64  import org.apache.hadoop.hbase.ServerLoad;
65  import org.apache.hadoop.hbase.ServerName;
66  import org.apache.hadoop.hbase.TableDescriptors;
67  import org.apache.hadoop.hbase.TableName;
68  import org.apache.hadoop.hbase.TableNotDisabledException;
69  import org.apache.hadoop.hbase.TableNotFoundException;
70  import org.apache.hadoop.hbase.UnknownRegionException;
71  import org.apache.hadoop.hbase.catalog.CatalogTracker;
72  import org.apache.hadoop.hbase.catalog.MetaReader;
73  import org.apache.hadoop.hbase.client.HConnectionManager;
74  import org.apache.hadoop.hbase.client.MetaScanner;
75  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
76  import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
77  import org.apache.hadoop.hbase.client.Result;
78  import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
79  import org.apache.hadoop.hbase.exceptions.DeserializationException;
80  import org.apache.hadoop.hbase.exceptions.MergeRegionException;
81  import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
82  import org.apache.hadoop.hbase.executor.ExecutorService;
83  import org.apache.hadoop.hbase.executor.ExecutorType;
84  import org.apache.hadoop.hbase.ipc.FifoRpcScheduler;
85  import org.apache.hadoop.hbase.ipc.RequestContext;
86  import org.apache.hadoop.hbase.ipc.RpcServer;
87  import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
88  import org.apache.hadoop.hbase.ipc.RpcServerInterface;
89  import org.apache.hadoop.hbase.ipc.ServerRpcController;
90  import org.apache.hadoop.hbase.master.RegionState.State;
91  import org.apache.hadoop.hbase.master.balancer.BalancerChore;
92  import org.apache.hadoop.hbase.master.balancer.ClusterStatusChore;
93  import org.apache.hadoop.hbase.master.balancer.LoadBalancerFactory;
94  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
95  import org.apache.hadoop.hbase.master.cleaner.LogCleaner;
96  import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
97  import org.apache.hadoop.hbase.master.handler.DeleteTableHandler;
98  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
99  import org.apache.hadoop.hbase.master.handler.DispatchMergingRegionHandler;
100 import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
101 import org.apache.hadoop.hbase.master.handler.ModifyTableHandler;
102 import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler;
103 import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
104 import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
105 import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
106 import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
107 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
108 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
109 import org.apache.hadoop.hbase.procedure.MasterProcedureManager;
110 import org.apache.hadoop.hbase.procedure.MasterProcedureManagerHost;
111 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
112 import org.apache.hadoop.hbase.protobuf.RequestConverter;
113 import org.apache.hadoop.hbase.protobuf.ResponseConverter;
114 import org.apache.hadoop.hbase.protobuf.generated.*;
115 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
116 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionServerInfo;
117 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier.RegionSpecifierType;
118 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
119 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.ProcedureDescription;
120 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnRequest;
121 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AddColumnResponse;
122 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionRequest;
123 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.AssignRegionResponse;
124 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceRequest;
125 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.BalanceResponse;
126 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceRequest;
127 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateNamespaceResponse;
128 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableRequest;
129 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.CreateTableResponse;
130 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnRequest;
131 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteColumnResponse;
132 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceRequest;
133 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteNamespaceResponse;
134 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotRequest;
135 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteSnapshotResponse;
136 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableRequest;
137 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableResponse;
138 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableRequest;
139 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableResponse;
140 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsRequest;
141 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsResponse;
142 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorRequest;
143 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableCatalogJanitorResponse;
144 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableRequest;
145 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableResponse;
146 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusRequest;
147 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusResponse;
148 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsRequest;
149 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsResponse;
150 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorRequest;
151 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorResponse;
152 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusRequest;
153 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusResponse;
154 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
155 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
156 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
157 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesResponse;
158 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledRequest;
159 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
160 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
161 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
162 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest;
163 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse;
164 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneRequest;
165 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
166 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsRequest;
167 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
168 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
169 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
170 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
171 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ListTableNamesByNamespaceResponse;
172 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnRequest;
173 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyColumnResponse;
174 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceRequest;
175 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyNamespaceResponse;
176 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableRequest;
177 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ModifyTableResponse;
178 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionRequest;
179 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MoveRegionResponse;
180 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionRequest;
181 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.OfflineRegionResponse;
182 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotRequest;
183 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RestoreSnapshotResponse;
184 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanRequest;
185 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.RunCatalogScanResponse;
186 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningRequest;
187 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SetBalancerRunningResponse;
188 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownRequest;
189 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ShutdownResponse;
190 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotRequest;
191 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.SnapshotResponse;
192 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterRequest;
193 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterResponse;
194 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionRequest;
195 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionResponse;
196 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest;
197 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse;
198 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest;
199 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse;
200 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
201 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdResponse;
202 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
203 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportResponse;
204 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
205 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
206 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionTransition;
207 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorRequest;
208 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorResponse;
209 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionTransitionRequest;
210 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionTransitionResponse;
211 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
212 import org.apache.hadoop.hbase.replication.regionserver.Replication;
213 import org.apache.hadoop.hbase.security.UserProvider;
214 import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
215 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
216 import org.apache.hadoop.hbase.trace.SpanReceiverHost;
217 import org.apache.hadoop.hbase.util.Bytes;
218 import org.apache.hadoop.hbase.util.CompressionTest;
219 import org.apache.hadoop.hbase.util.FSTableDescriptors;
220 import org.apache.hadoop.hbase.util.FSUtils;
221 import org.apache.hadoop.hbase.util.HFileArchiveUtil;
222 import org.apache.hadoop.hbase.util.HasThread;
223 import org.apache.hadoop.hbase.util.InfoServer;
224 import org.apache.hadoop.hbase.util.JvmPauseMonitor;
225 import org.apache.hadoop.hbase.util.Pair;
226 import org.apache.hadoop.hbase.util.Sleeper;
227 import org.apache.hadoop.hbase.util.Strings;
228 import org.apache.hadoop.hbase.util.Threads;
229 import org.apache.hadoop.hbase.util.VersionInfo;
230 import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
231 import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker;
232 import org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker;
233 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
234 import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
235 import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
236 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
237 import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
238 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
239 import org.apache.hadoop.metrics.util.MBeanUtil;
240 import org.apache.hadoop.net.DNS;
241 import org.apache.zookeeper.KeeperException;
242 import org.apache.zookeeper.Watcher;
243 
244 import com.google.common.collect.Lists;
245 import com.google.common.collect.Maps;
246 import com.google.protobuf.Descriptors;
247 import com.google.protobuf.Message;
248 import com.google.protobuf.RpcCallback;
249 import com.google.protobuf.RpcController;
250 import com.google.protobuf.Service;
251 import com.google.protobuf.ServiceException;
252 
253 /**
254  * HMaster is the "master server" for HBase. An HBase cluster has one active
255  * master.  If many masters are started, all compete.  Whichever wins goes on to
256  * run the cluster.  All others park themselves in their constructor until
257  * master or cluster shutdown or until the active master loses its lease in
258  * zookeeper.  Thereafter, all running masters jostle to take over the master role.
259  *
260  * <p>The Master can be asked to shut down the cluster. See {@link #shutdown()}.  In
261  * this case it will tell all regionservers to go down and then wait on them
262  * all reporting in that they are down.  This master will then shut itself down.
263  *
264  * <p>You can also shut down just this master.  Call {@link #stopMaster()}.
265  *
266  * @see Watcher
267  */
268 @InterfaceAudience.Private
269 @SuppressWarnings("deprecation")
270 public class HMaster extends HasThread implements MasterProtos.MasterService.BlockingInterface,
271 RegionServerStatusProtos.RegionServerStatusService.BlockingInterface,
272 MasterServices, Server {
273   private static final Log LOG = LogFactory.getLog(HMaster.class.getName());
274 
275   // MASTER is the name of the webapp and the attribute name used when stuffing this
276   // instance into the web context.
277   public static final String MASTER = "master";
278 
279   // The configuration for the Master
280   private final Configuration conf;
281   // server for the web ui
282   private InfoServer infoServer;
283 
284   // Our zk client.
285   private ZooKeeperWatcher zooKeeper;
286   // Manager and zk listener for master election
287   private ActiveMasterManager activeMasterManager;
288   // Region server tracker
289   RegionServerTracker regionServerTracker;
290   // Draining region server tracker
291   private DrainingServerTracker drainingServerTracker;
292   // Tracker for load balancer state
293   private LoadBalancerTracker loadBalancerTracker;
294   // master address tracker
295   private MasterAddressTracker masterAddressTracker;
296 
297   // RPC server for the HMaster
298   private final RpcServerInterface rpcServer;
299   private JvmPauseMonitor pauseMonitor;
300   // Set after we've called HBaseServer#openServer and ready to receive RPCs.
301   // Set back to false after we stop rpcServer.  Used by tests.
302   private volatile boolean rpcServerOpen = false;
303 
304   /** Namespace stuff */
305   private TableNamespaceManager tableNamespaceManager;
306 
307   /**
308    * This server's address.
309    */
310   private final InetSocketAddress isa;
311 
312   // Metrics for the HMaster
313   private final MetricsMaster metricsMaster;
314   // file system manager for the master FS operations
315   private MasterFileSystem fileSystemManager;
316 
317   // server manager to deal with region server info
318   ServerManager serverManager;
319 
320   // manager of assignment nodes in zookeeper
321   AssignmentManager assignmentManager;
322   // manager of catalog regions
323   private CatalogTracker catalogTracker;
324   // Cluster status zk tracker and local setter
325   private ClusterStatusTracker clusterStatusTracker;
326 
327   // buffer for "fatal error" notices from region servers
328   // in the cluster. This is only used for assisting
329   // operations/debugging.
330   private MemoryBoundedLogMessageBuffer rsFatals;
331 
332   // This flag is for stopping this Master instance.  It's set when we are
333   // stopping or aborting
334   private volatile boolean stopped = false;
335   // Set on abort -- usually failure of our zk session.
336   private volatile boolean abort = false;
337   // flag set after we become the active master (used for testing)
338   private volatile boolean isActiveMaster = false;
339 
340   // flag set after we complete initialization once active,
341   // it is not private since it's used in unit tests
342   volatile boolean initialized = false;
343 
344   // flag set after we complete assignMeta.
345   private volatile boolean serverShutdownHandlerEnabled = false;
346 
347   // Instance of the hbase executor service.
348   ExecutorService executorService;
349 
350   private LoadBalancer balancer;
351   private Thread balancerChore;
352   private Thread clusterStatusChore;
353   private ClusterStatusPublisher clusterStatusPublisherChore = null;
354 
355   private CatalogJanitor catalogJanitorChore;
356   private LogCleaner logCleaner;
357   private HFileCleaner hfileCleaner;
358 
359   private MasterCoprocessorHost cpHost;
360   private final ServerName serverName;
361 
362   private TableDescriptors tableDescriptors;
363 
364   // Table level lock manager for schema changes
365   private TableLockManager tableLockManager;
366 
367   // Time stamps for when a hmaster was started and when it became active
368   private long masterStartTime;
369   private long masterActiveTime;
370 
371   /** time interval for emitting metrics values */
372   private final int msgInterval;
373   /**
374    * MX Bean for MasterInfo
375    */
376   private ObjectName mxBean = null;
377 
378   //should we check the compression codec type at master side, default true, HBASE-6370
379   private final boolean masterCheckCompression;
380 
381   private SpanReceiverHost spanReceiverHost;
382 
383   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
384 
385   // monitor for snapshot of hbase tables
386   private SnapshotManager snapshotManager;
387   // monitor for distributed procedures
388   private MasterProcedureManagerHost mpmHost;
389 
390   /** The health check chore. */
391   private HealthCheckChore healthCheckChore;
392 
393   /** flag used in test cases in order to simulate RS failures during master initialization */
394   private volatile boolean initializationBeforeMetaAssignment = false;
395 
396   /** The following is used in master recovery scenario to re-register listeners */
397   private List<ZooKeeperListener> registeredZKListenersBeforeRecovery;
398 
399   /**
400    * Initializes the HMaster. The steps are as follows:
401    * <p>
402    * <ol>
403    * <li>Initialize HMaster RPC and address
404    * <li>Connect to ZooKeeper.
405    * </ol>
406    * <p>
407    * Remaining steps of initialization occur in {@link #run()} so that they
408    * run in their own thread rather than within the context of the constructor.
       * <p>
       * Besides the two steps above, this constructor also starts the RPC server
       * threads and the JVM pause monitor, creates the master metrics, creates the
       * health check chore when a health checker is configured, and starts the
       * cluster status publisher thread when status publishing is enabled.
       *
       * @param conf configuration to start from; it is copied into a private
       *   {@link Configuration} before the master-specific settings are applied
       * @throws IllegalArgumentException if the master hostname, or the optional
       *   {@code hbase.master.ipc.address} bind address, cannot be resolved
       * @throws IOException declared on the signature; presumably raised by the
       *   RPC server, login or ZooKeeper setup calls below - TODO confirm which
       * @throws KeeperException declared on the signature; presumably raised while
       *   setting up the ZooKeeper watcher - TODO confirm
409    * @throws InterruptedException declared on the signature; the interrupting
       *   call is not identifiable from this constructor alone - TODO confirm
410    */
411   public HMaster(final Configuration conf)
412   throws IOException, KeeperException, InterruptedException {
        // Work on a private copy so the settings forced below do not leak to the caller.
413     this.conf = new Configuration(conf);
414     // Disable the block cache on the master
415     this.conf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
        // NOTE(review): this receives the caller's conf rather than the private copy
        // (this.conf) created above - confirm that is intended.
416     FSUtils.setupShortCircuitRead(conf);
417     // Server to handle client requests.
418     String hostname = Strings.domainNamePointerToHostName(DNS.getDefaultHost(
419       conf.get("hbase.master.dns.interface", "default"),
420       conf.get("hbase.master.dns.nameserver", "default")));
421     int port = conf.getInt(HConstants.MASTER_PORT, HConstants.DEFAULT_MASTER_PORT);
422     // Test that the hostname is reachable
423     InetSocketAddress initialIsa = new InetSocketAddress(hostname, port);
424     if (initialIsa.getAddress() == null) {
425       throw new IllegalArgumentException("Failed resolve of hostname " + initialIsa);
426     }
427     // Verify that the bind address is reachable if set
428     String bindAddress = conf.get("hbase.master.ipc.address");
429     if (bindAddress != null) {
          // An explicit bind address replaces the resolved hostname as the listen address.
430       initialIsa = new InetSocketAddress(bindAddress, port);
431       if (initialIsa.getAddress() == null) {
432         throw new IllegalArgumentException("Failed resolve of bind address " + initialIsa);
433       }
434     }
435     String name = "master/" + initialIsa.toString();
436     // Set how many times to retry talking to another server over HConnection.
437     HConnectionManager.setServerSideHConnectionRetries(this.conf, name, LOG);
        // Handler count: master setting first, then the region server setting, then the default.
438     int numHandlers = conf.getInt(HConstants.MASTER_HANDLER_COUNT,
439       conf.getInt(HConstants.REGION_SERVER_HANDLER_COUNT, HConstants.DEFAULT_MASTER_HANLDER_COUNT));
440     this.rpcServer = new RpcServer(this, name, getServices(),
441       initialIsa, // BindAddress is IP we got for this server.
442       conf,
443       new FifoRpcScheduler(conf, numHandlers));
444     // Set our address.
445     this.isa = this.rpcServer.getListenerAddress();
446     // We don't want to pass isa's hostname here since it could be 0.0.0.0
447     this.serverName = ServerName.valueOf(hostname, this.isa.getPort(), System.currentTimeMillis());
448     this.rsFatals = new MemoryBoundedLogMessageBuffer(
449       conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));
450
451     // login the zookeeper client principal (if using security)
452     ZKUtil.loginClient(this.conf, "hbase.zookeeper.client.keytab.file",
453       "hbase.zookeeper.client.kerberos.principal", this.isa.getHostName());
454
455     // initialize server principal (if using secure Hadoop)
456     UserProvider provider = UserProvider.instantiate(conf);
457     provider.login("hbase.master.keytab.file",
458       "hbase.master.kerberos.principal", this.isa.getHostName());
459
460     LOG.info("hbase.rootdir=" + FSUtils.getRootDir(this.conf) +
461         ", hbase.cluster.distributed=" + this.conf.getBoolean("hbase.cluster.distributed", false));
462
463     // set the thread name now we have an address
464     setName(MASTER + ":" + this.serverName.toShortString());
465
466     Replication.decorateMasterConfiguration(this.conf);
467
468     // Hack! Maps DFSClient => Master for logs.  HDFS made this
469     // config param for task trackers, but we can piggyback off of it.
470     if (this.conf.get("mapred.task.id") == null) {
471       this.conf.set("mapred.task.id", "hb_m_" + this.serverName.toString());
472     }
473
474     this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true);
475     this.rpcServer.startThreads();
476     this.pauseMonitor = new JvmPauseMonitor(conf);
477     this.pauseMonitor.start();
478
479     // metrics interval: using the same property as region server.
480     this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
481
482     //should we check the compression codec type at master side, default true, HBASE-6370
483     this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);
484
485     this.metricsMaster = new MetricsMaster( new MetricsMasterWrapperImpl(this));
486
487     // Health checker thread.
488     int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
489       HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
490     if (isHealthCheckerConfigured()) {
491       healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
492     }
493
494     // Do we publish the status?
495     boolean shouldPublish = conf.getBoolean(HConstants.STATUS_PUBLISHED,
496         HConstants.STATUS_PUBLISHED_DEFAULT);
497     Class<? extends ClusterStatusPublisher.Publisher> publisherClass =
498         conf.getClass(ClusterStatusPublisher.STATUS_PUBLISHER_CLASS,
499             ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS,
500             ClusterStatusPublisher.Publisher.class);
501
502     if (shouldPublish) {
503       if (publisherClass == null) {
            // Report the configuration key that is missing. The DEFAULT_* constant is the
            // fallback class passed to getClass above, not the name of a setting.
504         LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
505             ClusterStatusPublisher.STATUS_PUBLISHER_CLASS +
506             " is not set - not publishing status");
507       } else {
508         clusterStatusPublisherChore = new ClusterStatusPublisher(this, conf, publisherClass);
509         Threads.setDaemonThreadRunning(clusterStatusPublisherChore.getThread());
510       }
511     }
512   }
513 
514   /**
       * Builds the RPC services handed to the {@code RpcServer} in the constructor.
       * Two services are registered, both backed by this instance through a
       * reflective blocking service: {@code MasterProtos.MasterService} and
       * {@code RegionServerStatusProtos.RegionServerStatusService}.
       *
515    * @return list of blocking services and their security info classes that this server supports
516    */
517   private List<BlockingServiceAndInterface> getServices() {
        // Initial capacity is 3 although only two services are added below.
518     List<BlockingServiceAndInterface> bssi = new ArrayList<BlockingServiceAndInterface>(3);
519     bssi.add(new BlockingServiceAndInterface(
520         MasterProtos.MasterService.newReflectiveBlockingService(this),
521         MasterProtos.MasterService.BlockingInterface.class));
522     bssi.add(new BlockingServiceAndInterface(
523         RegionServerStatusProtos.RegionServerStatusService.newReflectiveBlockingService(this),
524         RegionServerStatusProtos.RegionServerStatusService.BlockingInterface.class));
525     return bssi;
526   }
527 
528   /**
529    * Stall startup if we are designated a backup master; i.e. we want someone
530    * else to become the master before proceeding.
       * <p>
       * Returns immediately unless {@code HConstants.MASTER_TYPE_BACKUP} is true in
       * the configuration. Otherwise it polls {@code amm.isActiveMaster()}, sleeping
       * for the configured ZooKeeper session timeout between polls, and returns once
       * that call reports true.
       *
531    * @param c configuration
532    * @param amm active master manager that is polled via {@code isActiveMaster()}
533    * @throws InterruptedException if the thread is interrupted while sleeping between polls
534    */
535   private static void stallIfBackupMaster(final Configuration c,
536       final ActiveMasterManager amm)
537   throws InterruptedException {
538     // If we're a backup master, stall until a primary writes its address
539     if (!c.getBoolean(HConstants.MASTER_TYPE_BACKUP,
540       HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
          // Not configured as a backup master: nothing to wait for.
541       return;
542     }
543     LOG.debug("HMaster started in backup mode.  " +
544       "Stalling until master znode is written.");
545     // This will only be a minute or so while the cluster starts up,
546     // so don't worry about setting watches on the parent znode
547     while (!amm.isActiveMaster()) {
548       LOG.debug("Waiting for master address ZNode to be written " +
549         "(Also watching cluster state node)");
          // Poll interval is the ZooKeeper session timeout read from the configuration.
550       Thread.sleep(
551         c.getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT));
552     }
553
554   }
555
      /**
       * Package-private accessor for the master metrics.
       *
       * @return the {@code MetricsMaster} instance created in the constructor
       */
556   MetricsMaster getMetrics() {
557     return metricsMaster;
558   }
559 
560   /**
561    * Main processing loop for the HMaster.
562    * <ol>
563    * <li>Block until becoming active master
564    * <li>Finish initialization via finishInitialization(MonitoredTask)
565    * <li>Enter loop until we are stopped
566    * <li>Stop services and perform cleanup once stopped
567    * </ol>
568    */
569   @Override
570   public void run() {
571     MonitoredTask startupStatus =
572       TaskMonitor.get().createStatus("Master startup");
573     startupStatus.setDescription("Master startup");
574     masterStartTime = System.currentTimeMillis();
575     try {
576       this.masterAddressTracker = new MasterAddressTracker(getZooKeeperWatcher(), this);
577       this.masterAddressTracker.start();
578 
579       // Put up info server.
580       int port = this.conf.getInt("hbase.master.info.port", 60010);
581       if (port >= 0) {
582         String a = this.conf.get("hbase.master.info.bindAddress", "0.0.0.0");
583         this.infoServer = new InfoServer(MASTER, a, port, false, this.conf);
584         this.infoServer.addServlet("status", "/master-status", MasterStatusServlet.class);
585         this.infoServer.addServlet("dump", "/dump", MasterDumpServlet.class);
586         this.infoServer.setAttribute(MASTER, this);
587         this.infoServer.start();
588       }
589 
590       this.registeredZKListenersBeforeRecovery = this.zooKeeper.getListeners();
591       /*
592        * Block on becoming the active master.
593        *
594        * We race with other masters to write our address into ZooKeeper.  If we
595        * succeed, we are the primary/active master and finish initialization.
596        *
597        * If we do not succeed, there is another active master and we should
598        * now wait until it dies to try and become the next active master.  If we
599        * do not succeed on our first attempt, this is no longer a cluster startup.
600        */
601       becomeActiveMaster(startupStatus);
602 
603       // We are either the active master or we were asked to shutdown
604       if (!this.stopped) {
605         finishInitialization(startupStatus, false);
606         loop();
607       }
608     } catch (Throwable t) {
609       // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
610       if (t instanceof NoClassDefFoundError &&
611           t.getMessage().contains("org/apache/hadoop/hdfs/protocol/FSConstants$SafeModeAction")) {
612           // improved error message for this special case
613           abort("HBase is having a problem with its Hadoop jars.  You may need to "
614               + "recompile HBase against Hadoop version "
615               +  org.apache.hadoop.util.VersionInfo.getVersion()
616               + " or change your hadoop jars to start properly", t);
617       } else {
618         abort("Unhandled exception. Starting shutdown.", t);
619       }
620     } finally {
621       startupStatus.cleanup();
622 
623       stopChores();
624       // Wait for all the remaining region servers to report in IFF we were
625       // running a cluster shutdown AND we were NOT aborting.
626       if (!this.abort && this.serverManager != null &&
627           this.serverManager.isClusterShutdown()) {
628         this.serverManager.letRegionServersShutdown();
629       }
630       stopServiceThreads();
631       // Stop services started for both backup and active masters
632       if (this.activeMasterManager != null) this.activeMasterManager.stop();
633       if (this.catalogTracker != null) this.catalogTracker.stop();
634       if (this.serverManager != null) this.serverManager.stop();
635       if (this.assignmentManager != null) this.assignmentManager.stop();
636       if (this.fileSystemManager != null) this.fileSystemManager.stop();
637       if (this.mpmHost != null) this.mpmHost.stop("server shutting down.");
638       this.zooKeeper.close();
639     }
640     LOG.info("HMaster main thread exiting");
641   }
642 
643   /**
644    * Useful for testing purpose also where we have
645    * master restart scenarios.
646    */
647   protected void startCatalogJanitorChore() {
648     Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
649   }
650 
651   /**
652    * Try becoming active master.
653    * @param startupStatus
654    * @return True if we could successfully become the active master.
655    * @throws InterruptedException
656    */
657   private boolean becomeActiveMaster(MonitoredTask startupStatus)
658   throws InterruptedException {
659     // TODO: This is wrong!!!! Should have new servername if we restart ourselves,
660     // if we come back to life.
661     this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName,
662         this);
663     this.zooKeeper.registerListener(activeMasterManager);
664     stallIfBackupMaster(this.conf, this.activeMasterManager);
665 
666     // The ClusterStatusTracker is setup before the other
667     // ZKBasedSystemTrackers because it's needed by the activeMasterManager
668     // to check if the cluster should be shutdown.
669     this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
670     this.clusterStatusTracker.start();
671     return this.activeMasterManager.blockUntilBecomingActiveMaster(startupStatus);
672   }
673 
  /**
   * Initialize all ZK based system trackers.
   * <p>
   * Creates and starts the catalog tracker, load balancer tracker, region
   * server tracker and draining server tracker, creates the load balancer and
   * the assignment manager, sets the cluster-up flag if it was not already
   * set, and finally creates and initializes the snapshot manager and the
   * master procedure manager host.
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  void initializeZKBasedSystemTrackers() throws IOException,
      InterruptedException, KeeperException {
    this.catalogTracker = createCatalogTracker(this.zooKeeper, this.conf, this);
    this.catalogTracker.start();

    this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
    this.loadBalancerTracker = new LoadBalancerTracker(zooKeeper, this);
    this.loadBalancerTracker.start();
    this.assignmentManager = new AssignmentManager(this, serverManager,
      this.catalogTracker, this.balancer, this.executorService, this.metricsMaster,
      this.tableLockManager);
    // Registered via registerListenerFirst rather than registerListener.
    zooKeeper.registerListenerFirst(assignmentManager);

    this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
        this.serverManager);
    this.regionServerTracker.start();

    this.drainingServerTracker = new DrainingServerTracker(zooKeeper, this,
      this.serverManager);
    this.drainingServerTracker.start();

    // Set the cluster as up.  If new RSs, they'll be waiting on this before
    // going ahead with their startup.
    boolean wasUp = this.clusterStatusTracker.isClusterUp();
    if (!wasUp) this.clusterStatusTracker.setClusterUp();

    LOG.info("Server active/primary master=" + this.serverName +
        ", sessionid=0x" +
        Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()) +
        ", setting cluster-up flag (Was=" + wasUp + ")");

    // create/initialize the snapshot manager and other procedure managers
    this.snapshotManager = new SnapshotManager();
    this.mpmHost = new MasterProcedureManagerHost();
    this.mpmHost.register(this.snapshotManager);
    this.mpmHost.loadProcedures(conf);
    this.mpmHost.initialize(this, this.metricsMaster);
  }
717 
718   /**
719    * Create CatalogTracker.
720    * In its own method so can intercept and mock it over in tests.
721    * @param zk If zk is null, we'll create an instance (and shut it down
722    * when {@link #stop(String)} is called) else we'll use what is passed.
723    * @param conf
724    * @param abortable If fatal exception we'll call abort on this.  May be null.
725    * If it is we'll use the Connection associated with the passed
726    * {@link Configuration} as our {@link Abortable}.
727    * ({@link Object#wait(long)} when passed a <code>0</code> waits for ever).
728    * @throws IOException
729    */
730   CatalogTracker createCatalogTracker(final ZooKeeperWatcher zk,
731       final Configuration conf, Abortable abortable)
732   throws IOException {
733     return new CatalogTracker(zk, conf, abortable);
734   }
735 
  // Sleeper used by loop() between iterations, so that the stop flag is
  // re-checked every 100ms.
  private Sleeper stopSleeper = new Sleeper(100, this);
738 
739   private void loop() {
740     long lastMsgTs = 0l;
741     long now = 0l;
742     while (!this.stopped) {
743       now = System.currentTimeMillis();
744       if ((now - lastMsgTs) >= this.msgInterval) {
745         doMetrics();
746         lastMsgTs = System.currentTimeMillis();
747       }
748       stopSleeper.sleep();
749     }
750   }
751 
752   /**
753    * Emit the HMaster metrics, such as region in transition metrics.
754    * Surrounding in a try block just to be sure metrics doesn't abort HMaster.
755    */
756   private void doMetrics() {
757     try {
758       this.assignmentManager.updateRegionsInTransitionMetrics();
759     } catch (Throwable e) {
760       LOG.error("Couldn't update metrics: " + e.getMessage());
761     }
762   }
763 
  /**
   * Finish initialization of HMaster after becoming the primary master.
   *
   * <ol>
   * <li>Initialize master components - file system manager, server manager,
   *     assignment manager, region server tracker, catalog tracker, etc</li>
   * <li>Start necessary service threads - rpc server, info server,
   *     executor services, etc</li>
   * <li>Set cluster as UP in ZooKeeper</li>
   * <li>Wait for RegionServers to check-in</li>
   * <li>Split logs and perform data recovery, if necessary</li>
   * <li>Ensure assignment of meta regions</li>
   * <li>Handle either fresh cluster start or master failover</li>
   * </ol>
   *
   * @param status monitored task whose status text is updated as the
   *     initialization steps progress
   * @param masterRecovery when true, the steps guarded by
   *     <code>!masterRecovery</code> below are skipped: creating the executor
   *     service and server manager, reaping table write locks, creating the
   *     coprocessor host and starting service threads, starting the timeout
   *     monitor, starting the chores, and the postStartMaster coprocessor hook
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  private void finishInitialization(MonitoredTask status, boolean masterRecovery)
  throws IOException, InterruptedException, KeeperException {

    isActiveMaster = true;

    /*
     * We are active master now... go initialize components we need to run.
     * Note, there may be dross in zk from previous runs; it'll get addressed
     * below after we determine if cluster startup or failover.
     */

    status.setStatus("Initializing Master file system");

    this.masterActiveTime = System.currentTimeMillis();
    // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
    this.fileSystemManager = new MasterFileSystem(this, this, masterRecovery);

    this.tableDescriptors =
      new FSTableDescriptors(this.fileSystemManager.getFileSystem(),
      this.fileSystemManager.getRootDir());

    // publish cluster ID
    status.setStatus("Publishing Cluster ID in ZooKeeper");
    ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());

    if (!masterRecovery) {
      this.executorService = new ExecutorService(getServerName().toShortString());
      this.serverManager = createServerManager(this, this);
    }

    //Initialize table lock manager, and ensure that all write locks held previously
    //are invalidated
    this.tableLockManager = TableLockManager.createTableLockManager(conf, zooKeeper, serverName);
    if (!masterRecovery) {
      this.tableLockManager.reapWriteLocks();
    }

    status.setStatus("Initializing ZK system trackers");
    initializeZKBasedSystemTrackers();

    if (!masterRecovery) {
      // initialize master side coprocessors before we start handling requests
      status.setStatus("Initializing master coprocessors");
      this.cpHost = new MasterCoprocessorHost(this, this.conf);

      spanReceiverHost = SpanReceiverHost.getInstance(getConfiguration());

      // start up all service threads.
      status.setStatus("Initializing master service threads");
      startServiceThreads();
    }

    // Wait for region servers to report in.
    this.serverManager.waitForRegionServers(status);
    // Check zk for region servers that are up but didn't register
    for (ServerName sn: this.regionServerTracker.getOnlineServers()) {
      // The isServerOnline check is opportunistic, correctness is handled inside
      if (!this.serverManager.isServerOnline(sn)
          && serverManager.checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
        LOG.info("Registered server found up in zk but who has not yet reported in: " + sn);
      }
    }

    if (!masterRecovery) {
      this.assignmentManager.startTimeOutMonitor();
    }

    // get a list for previously failed RS which need log splitting work
    // we recover hbase:meta region servers inside master initialization and
    // handle other failed servers in SSH in order to start up master node ASAP
    Set<ServerName> previouslyFailedServers = this.fileSystemManager
        .getFailedServersFromLogFolders();

    // remove stale recovering regions from previous run
    this.fileSystemManager.removeStaleRecoveringRegionsFromZK(previouslyFailedServers);

    // log splitting for hbase:meta server
    ServerName oldMetaServerLocation = this.catalogTracker.getMetaLocation();
    if (oldMetaServerLocation != null && previouslyFailedServers.contains(oldMetaServerLocation)) {
      splitMetaLogBeforeAssignment(oldMetaServerLocation);
      // Note: we can't remove oldMetaServerLocation from previousFailedServers list because it
      // may also host user regions
    }
    Set<ServerName> previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
    // need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
    // instead of previouslyFailedMetaRSs alone to address the following two situations:
    // 1) the chained failure situation(recovery failed multiple times in a row).
    // 2) master get killed right before it could delete the recovering hbase:meta from ZK while the
    // same server still has non-meta wals to be replayed so that
    // removeStaleRecoveringRegionsFromZK can't delete the stale hbase:meta region
    // Passing more servers into splitMetaLog is all right. If a server doesn't have hbase:meta wal,
    // there is no op for the server.
    previouslyFailedMetaRSs.addAll(previouslyFailedServers);

    this.initializationBeforeMetaAssignment = true;

    //initialize load balancer
    this.balancer.setClusterStatus(getClusterStatus());
    this.balancer.setMasterServices(this);
    this.balancer.initialize();

    // Make sure meta assigned before proceeding.
    status.setStatus("Assigning Meta Region");
    assignMeta(status, previouslyFailedMetaRSs);
    // check if master is shutting down because above assignMeta could return even hbase:meta isn't
    // assigned when master is shutting down
    if(this.stopped) return;

    status.setStatus("Submitting log splitting work for previously failed region servers");
    // Master has recovered hbase:meta region server and we put
    // other failed region servers in a queue to be handled later by SSH
    for (ServerName tmpServer : previouslyFailedServers) {
      this.serverManager.processDeadServer(tmpServer, true);
    }

    // Update meta with new PB serialization if required. i.e migrate all HRI to PB serialization
    // in meta. This must happen before we assign all user regions or else the assignment will
    // fail.
    if (this.conf.getBoolean("hbase.MetaMigrationConvertingToPB", true)) {
      org.apache.hadoop.hbase.catalog.MetaMigrationConvertingToPB.updateMetaIfNecessary(this);
    }

    // Fix up assignment manager status
    status.setStatus("Starting assignment manager");
    this.assignmentManager.joinCluster();

    //set cluster status again after user regions are assigned
    this.balancer.setClusterStatus(getClusterStatus());

    if (!masterRecovery) {
      // Start balancer and meta catalog janitor after meta and regions have
      // been assigned.
      status.setStatus("Starting balancer and catalog janitor");
      this.clusterStatusChore = getAndStartClusterStatusChore(this);
      this.balancerChore = getAndStartBalancerChore(this);
      this.catalogJanitorChore = new CatalogJanitor(this, this);
      startCatalogJanitorChore();
    }

    status.setStatus("Starting namespace manager");
    initNamespace();

    if (this.cpHost != null) {
      // A failing hook is logged only; it does not stop initialization.
      try {
        this.cpHost.preMasterInitialization();
      } catch (IOException e) {
        LOG.error("Coprocessor preMasterInitialization() hook failed", e);
      }
    }

    status.markComplete("Initialization successful");
    LOG.info("Master has completed initialization");
    initialized = true;
    // clear the dead servers with same host name and port of online server because we are not
    // removing dead server with same hostname and port of rs which is trying to check in before
    // master initialization. See HBASE-5916.
    this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();

    if (!masterRecovery) {
      if (this.cpHost != null) {
        // don't let cp initialization errors kill the master
        try {
          this.cpHost.postStartMaster();
        } catch (IOException ioe) {
          LOG.error("Coprocessor postStartMaster() hook failed", ioe);
        }
      }
    }
  }
954 
955   /**
956    * Create a {@link ServerManager} instance.
957    * @param master
958    * @param services
959    * @return An instance of {@link ServerManager}
960    * @throws org.apache.hadoop.hbase.ZooKeeperConnectionException
961    * @throws IOException
962    */
963   ServerManager createServerManager(final Server master,
964       final MasterServices services)
965   throws IOException {
966     // We put this out here in a method so can do a Mockito.spy and stub it out
967     // w/ a mocked up ServerManager.
968     return new ServerManager(master, services);
969   }
970 
  /**
   * Check <code>hbase:meta</code> is assigned. If not, assign it.
   * @param status MonitoredTask whose status text is updated along the way
   * @param previouslyFailedMetaRSs servers whose hbase:meta logs need
   *     recovery; the current meta server is added to this set when its meta
   *     log is split here, and in LOG_REPLAY mode the whole set is passed to
   *     splitMetaLog
   * @throws InterruptedException
   * @throws IOException
   * @throws KeeperException
   */
  void assignMeta(MonitoredTask status, Set<ServerName> previouslyFailedMetaRSs)
      throws InterruptedException, IOException, KeeperException {
    // Work on meta region
    // 'assigned' is only ever 0 or 1; it records whether meta had to be waited for.
    int assigned = 0;
    long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000);
    status.setStatus("Assigning hbase:meta region");

    RegionStates regionStates = assignmentManager.getRegionStates();
    regionStates.createRegionState(HRegionInfo.FIRST_META_REGIONINFO);
    boolean rit = this.assignmentManager
      .processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO);
    boolean metaRegionLocation = this.catalogTracker.verifyMetaRegionLocation(timeout);
    ServerName currentMetaServer = this.catalogTracker.getMetaLocation();
    if (!metaRegionLocation) {
      // Meta location is not verified. It should be in transition, or offline.
      // We will wait for it to be assigned in enableServerShutdownHandler below.
      assigned++;
      if (!rit) {
        // Assign meta since not already in transition
        if (currentMetaServer != null) {
          // Only force-expire the recorded meta server when it is currently
          // registered as online. If it is not known to be online,
          // just split the meta log, and don't expire it since this
          // could be a full cluster restart. Otherwise, we will think
          // this is a failover and lose previous region locations.
          // If it is really a failover case, AM will find out in rebuilding
          // user regions. Otherwise, we are good since all logs are split
          // or known to be replayed before user regions are assigned.
          if (serverManager.isServerOnline(currentMetaServer)) {
            LOG.info("Forcing expire of " + currentMetaServer);
            serverManager.expireServer(currentMetaServer);
          }
          splitMetaLogBeforeAssignment(currentMetaServer);
          previouslyFailedMetaRSs.add(currentMetaServer);
        }
        assignmentManager.assignMeta();
      }
    } else {
      // Region already assigned. We didn't assign it. Add to in-memory state.
      regionStates.updateRegionState(
        HRegionInfo.FIRST_META_REGIONINFO, State.OPEN, currentMetaServer);
      this.assignmentManager.regionOnline(
        HRegionInfo.FIRST_META_REGIONINFO, currentMetaServer);
    }

    enableMeta(TableName.META_TABLE_NAME);

    if ((RecoveryMode.LOG_REPLAY == this.getMasterFileSystem().getLogRecoveryMode())
        && (!previouslyFailedMetaRSs.isEmpty())) {
      // In replay-WAL-edits mode the new hbase:meta RS has to be assigned first
      status.setStatus("replaying log for Meta Region");
      this.fileSystemManager.splitMetaLog(previouslyFailedMetaRSs);
    }

    // Make sure a hbase:meta location is set. We need to enable SSH here since
    // if the meta region server dies at this time, we need it to be re-assigned
    // by SSH so that system tables can be assigned.
    // No need to wait for meta when assigned == 0, i.e. when meta was just verified.
    enableServerShutdownHandler(assigned != 0);

    LOG.info("hbase:meta assigned=" + assigned + ", rit=" + rit +
      ", location=" + catalogTracker.getMetaLocation());
    status.setStatus("META assigned.");
  }
1042 
1043   void initNamespace() throws IOException {
1044     //create namespace manager
1045     tableNamespaceManager = new TableNamespaceManager(this);
1046     tableNamespaceManager.start();
1047   }
1048 
1049   private void splitMetaLogBeforeAssignment(ServerName currentMetaServer) throws IOException {
1050     if (RecoveryMode.LOG_REPLAY == this.getMasterFileSystem().getLogRecoveryMode()) {
1051       // In log replay mode, we mark hbase:meta region as recovering in ZK
1052       Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
1053       regions.add(HRegionInfo.FIRST_META_REGIONINFO);
1054       this.fileSystemManager.prepareLogReplay(currentMetaServer, regions);
1055     } else {
1056       // In recovered.edits mode: create recovered edits file for hbase:meta server
1057       this.fileSystemManager.splitMetaLog(currentMetaServer);
1058     }
1059   }
1060 
1061   private void enableServerShutdownHandler(
1062       final boolean waitForMeta) throws IOException, InterruptedException {
1063     // If ServerShutdownHandler is disabled, we enable it and expire those dead
1064     // but not expired servers. This is required so that if meta is assigning to
1065     // a server which dies after assignMeta starts assignment,
1066     // SSH can re-assign it. Otherwise, we will be
1067     // stuck here waiting forever if waitForMeta is specified.
1068     if (!serverShutdownHandlerEnabled) {
1069       serverShutdownHandlerEnabled = true;
1070       this.serverManager.processQueuedDeadServers();
1071     }
1072 
1073     if (waitForMeta) {
1074       this.catalogTracker.waitForMeta();
1075       // Above check waits for general meta availability but this does not
1076       // guarantee that the transition has completed
1077       this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
1078     }
1079   }
1080 
1081   private void enableMeta(TableName metaTableName) {
1082     if (!this.assignmentManager.getZKTable().isEnabledTable(metaTableName)) {
1083       this.assignmentManager.setEnabledTable(metaTableName);
1084     }
1085   }
1086 
1087   /**
1088    * This function returns a set of region server names under hbase:meta recovering region ZK node
1089    * @return Set of meta server names which were recorded in ZK
1090    * @throws KeeperException
1091    */
1092   private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
1093     Set<ServerName> result = new HashSet<ServerName>();
1094     String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.recoveringRegionsZNode,
1095       HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
1096     List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
1097     if (regionFailedServers == null) return result;
1098 
1099     for(String failedServer : regionFailedServers) {
1100       ServerName server = ServerName.parseServerName(failedServer);
1101       result.add(server);
1102     }
1103     return result;
1104   }
1105 
1106   @Override
1107   public TableDescriptors getTableDescriptors() {
1108     return this.tableDescriptors;
1109   }
1110 
1111   /** @return InfoServer object. Maybe null.*/
1112   public InfoServer getInfoServer() {
1113     return this.infoServer;
1114   }
1115 
1116   @Override
1117   public Configuration getConfiguration() {
1118     return this.conf;
1119   }
1120 
1121   @Override
1122   public ServerManager getServerManager() {
1123     return this.serverManager;
1124   }
1125 
1126   @Override
1127   public ExecutorService getExecutorService() {
1128     return this.executorService;
1129   }
1130 
1131   @Override
1132   public MasterFileSystem getMasterFileSystem() {
1133     return this.fileSystemManager;
1134   }
1135 
1136   /**
1137    * Get the ZK wrapper object - needed by master_jsp.java
1138    * @return the zookeeper wrapper
1139    */
1140   public ZooKeeperWatcher getZooKeeperWatcher() {
1141     return this.zooKeeper;
1142   }
1143 
1144   public ActiveMasterManager getActiveMasterManager() {
1145     return this.activeMasterManager;
1146   }
1147 
1148   public MasterAddressTracker getMasterAddressTracker() {
1149     return this.masterAddressTracker;
1150   }
1151 
1152   /*
1153    * Start up all services. If any of these threads gets an unhandled exception
1154    * then they just die with a logged message.  This should be fine because
1155    * in general, we do not expect the master to get such unhandled exceptions
1156    *  as OOMEs; it should be lightly loaded. See what HRegionServer does if
1157    *  need to install an unexpected exception handler.
1158    */
1159   void startServiceThreads() throws IOException{
1160    // Start the executor service pools
1161    this.executorService.startExecutorService(ExecutorType.MASTER_OPEN_REGION,
1162       conf.getInt("hbase.master.executor.openregion.threads", 5));
1163    this.executorService.startExecutorService(ExecutorType.MASTER_CLOSE_REGION,
1164       conf.getInt("hbase.master.executor.closeregion.threads", 5));
1165    this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
1166       conf.getInt("hbase.master.executor.serverops.threads", 5));
1167    this.executorService.startExecutorService(ExecutorType.MASTER_META_SERVER_OPERATIONS,
1168       conf.getInt("hbase.master.executor.serverops.threads", 5));
1169    this.executorService.startExecutorService(ExecutorType.M_LOG_REPLAY_OPS,
1170       conf.getInt("hbase.master.executor.logreplayops.threads", 10));
1171 
1172    // We depend on there being only one instance of this executor running
1173    // at a time.  To do concurrency, would need fencing of enable/disable of
1174    // tables.
1175    // Any time changing this maxThreads to > 1, pls see the comment at
1176    // AccessController#postCreateTableHandler
1177    this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS, 1);
1178 
1179    // Start log cleaner thread
1180    String n = Thread.currentThread().getName();
1181    int cleanerInterval = conf.getInt("hbase.master.cleaner.interval", 60 * 1000);
1182    this.logCleaner =
1183       new LogCleaner(cleanerInterval,
1184          this, conf, getMasterFileSystem().getFileSystem(),
1185          getMasterFileSystem().getOldLogDir());
1186          Threads.setDaemonThreadRunning(logCleaner.getThread(), n + ".oldLogCleaner");
1187 
1188    //start the hfile archive cleaner thread
1189     Path archiveDir = HFileArchiveUtil.getArchivePath(conf);
1190     this.hfileCleaner = new HFileCleaner(cleanerInterval, this, conf, getMasterFileSystem()
1191         .getFileSystem(), archiveDir);
1192     Threads.setDaemonThreadRunning(hfileCleaner.getThread(), n + ".archivedHFileCleaner");
1193 
1194     // Start the health checker
1195     if (this.healthCheckChore != null) {
1196       Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker");
1197     }
1198 
1199     // Start allowing requests to happen.
1200     this.rpcServer.openServer();
1201     this.rpcServerOpen = true;
1202     if (LOG.isTraceEnabled()) {
1203       LOG.trace("Started service threads");
1204     }
1205   }
1206 
1207   /**
1208    * Use this when trying to figure when its ok to send in rpcs.  Used by tests.
1209    * @return True if we have successfully run {@link RpcServer#openServer()}
1210    */
1211   boolean isRpcServerOpen() {
1212     return this.rpcServerOpen;
1213   }
1214 
1215   private void stopServiceThreads() {
1216     if (LOG.isDebugEnabled()) {
1217       LOG.debug("Stopping service threads");
1218     }
1219     if (this.rpcServer != null) this.rpcServer.stop();
1220     this.rpcServerOpen = false;
1221     // Clean up and close up shop
1222     if (this.logCleaner!= null) this.logCleaner.interrupt();
1223     if (this.hfileCleaner != null) this.hfileCleaner.interrupt();
1224 
1225     if (this.infoServer != null) {
1226       LOG.info("Stopping infoServer");
1227       try {
1228         this.infoServer.stop();
1229       } catch (Exception ex) {
1230         ex.printStackTrace();
1231       }
1232     }
1233     if (this.executorService != null) this.executorService.shutdown();
1234     if (this.healthCheckChore != null) {
1235       this.healthCheckChore.interrupt();
1236     }
1237     if (this.pauseMonitor != null) {
1238       this.pauseMonitor.stop();
1239     }
1240   }
1241 
1242   private static Thread getAndStartClusterStatusChore(HMaster master) {
1243     if (master == null || master.balancer == null) {
1244       return null;
1245     }
1246     Chore chore = new ClusterStatusChore(master, master.balancer);
1247     return Threads.setDaemonThreadRunning(chore.getThread());
1248   }
1249 
1250   private static Thread getAndStartBalancerChore(final HMaster master) {
1251     // Start up the load balancer chore
1252     Chore chore = new BalancerChore(master);
1253     return Threads.setDaemonThreadRunning(chore.getThread());
1254   }
1255 
1256   private void stopChores() {
1257     if (this.balancerChore != null) {
1258       this.balancerChore.interrupt();
1259     }
1260     if (this.clusterStatusChore != null) {
1261       this.clusterStatusChore.interrupt();
1262     }
1263     if (this.catalogJanitorChore != null) {
1264       this.catalogJanitorChore.interrupt();
1265     }
1266     if (this.clusterStatusPublisherChore != null){
1267       clusterStatusPublisherChore.interrupt();
1268     }
1269   }
1270 
1271   @Override
1272   public RegionServerStartupResponse regionServerStartup(
1273       RpcController controller, RegionServerStartupRequest request) throws ServiceException {
1274     // Register with server manager
1275     try {
1276       InetAddress ia = getRemoteInetAddress(request.getPort(), request.getServerStartCode());
1277       ServerName rs = this.serverManager.regionServerStartup(ia, request.getPort(),
1278         request.getServerStartCode(), request.getServerCurrentTime());
1279 
1280       // Send back some config info
1281       RegionServerStartupResponse.Builder resp = createConfigurationSubset();
1282       NameStringPair.Builder entry = NameStringPair.newBuilder()
1283         .setName(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)
1284         .setValue(rs.getHostname());
1285       resp.addMapEntries(entry.build());
1286 
1287       return resp.build();
1288     } catch (IOException ioe) {
1289       throw new ServiceException(ioe);
1290     }
1291   }
1292 
1293   /**
1294    * @return Get remote side's InetAddress
1295    * @throws UnknownHostException
1296    */
1297   InetAddress getRemoteInetAddress(final int port, final long serverStartCode)
1298   throws UnknownHostException {
1299     // Do it out here in its own little method so can fake an address when
1300     // mocking up in tests.
1301     return RpcServer.getRemoteIp();
1302   }
1303 
1304   /**
1305    * @return Subset of configuration to pass initializing regionservers: e.g.
1306    * the filesystem to use and root directory to use.
1307    */
1308   protected RegionServerStartupResponse.Builder createConfigurationSubset() {
1309     RegionServerStartupResponse.Builder resp = addConfig(
1310       RegionServerStartupResponse.newBuilder(), HConstants.HBASE_DIR);
1311     resp = addConfig(resp, "fs.default.name");
1312     return addConfig(resp, "hbase.master.info.port");
1313   }
1314 
1315   private RegionServerStartupResponse.Builder addConfig(
1316       final RegionServerStartupResponse.Builder resp, final String key) {
1317     NameStringPair.Builder entry = NameStringPair.newBuilder()
1318       .setName(key)
1319       .setValue(this.conf.get(key));
1320     resp.addMapEntries(entry.build());
1321     return resp;
1322   }
1323 
1324   @Override
1325   public GetLastFlushedSequenceIdResponse getLastFlushedSequenceId(RpcController controller,
1326       GetLastFlushedSequenceIdRequest request) throws ServiceException {
1327     byte[] regionName = request.getRegionName().toByteArray();
1328     long seqId = serverManager.getLastFlushedSequenceId(regionName);
1329     return ResponseConverter.buildGetLastFlushedSequenceIdResponse(seqId);
1330   }
1331 
1332   @Override
1333   public RegionServerReportResponse regionServerReport(
1334       RpcController controller, RegionServerReportRequest request) throws ServiceException {
1335     try {
1336       ClusterStatusProtos.ServerLoad sl = request.getLoad();
1337       ServerName serverName = ProtobufUtil.toServerName(request.getServer());
1338       ServerLoad oldLoad = serverManager.getLoad(serverName);
1339       this.serverManager.regionServerReport(serverName, new ServerLoad(sl));
1340       if (sl != null && this.metricsMaster != null) {
1341         // Up our metrics.
1342         this.metricsMaster.incrementRequests(sl.getTotalNumberOfRequests()
1343           - (oldLoad != null ? oldLoad.getTotalNumberOfRequests() : 0));
1344       }
1345     } catch (IOException ioe) {
1346       throw new ServiceException(ioe);
1347     }
1348 
1349     return RegionServerReportResponse.newBuilder().build();
1350   }
1351 
1352   @Override
1353   public ReportRSFatalErrorResponse reportRSFatalError(
1354       RpcController controller, ReportRSFatalErrorRequest request) throws ServiceException {
1355     String errorText = request.getErrorMessage();
1356     ServerName sn = ProtobufUtil.toServerName(request.getServer());
1357     String msg = "Region server " + sn +
1358       " reported a fatal error:\n" + errorText;
1359     LOG.error(msg);
1360     rsFatals.add(msg);
1361 
1362     return ReportRSFatalErrorResponse.newBuilder().build();
1363   }
1364 
1365   public boolean isMasterRunning() {
1366     return !isStopped();
1367   }
1368 
1369   @Override
1370   public IsMasterRunningResponse isMasterRunning(RpcController c, IsMasterRunningRequest req)
1371   throws ServiceException {
1372     return IsMasterRunningResponse.newBuilder().setIsMasterRunning(isMasterRunning()).build();
1373   }
1374 
1375   @Override
1376   public RunCatalogScanResponse runCatalogScan(RpcController c,
1377       RunCatalogScanRequest req) throws ServiceException {
1378     try {
1379       return ResponseConverter.buildRunCatalogScanResponse(catalogJanitorChore.scan());
1380     } catch (IOException ioe) {
1381       throw new ServiceException(ioe);
1382     }
1383   }
1384 
1385   @Override
1386   public EnableCatalogJanitorResponse enableCatalogJanitor(RpcController c,
1387       EnableCatalogJanitorRequest req) throws ServiceException {
1388     return EnableCatalogJanitorResponse.newBuilder().
1389         setPrevValue(catalogJanitorChore.setEnabled(req.getEnable())).build();
1390   }
1391 
1392   @Override
1393   public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(RpcController c,
1394       IsCatalogJanitorEnabledRequest req) throws ServiceException {
1395     boolean isEnabled = catalogJanitorChore != null ? catalogJanitorChore.getEnabled() : false;
1396     return IsCatalogJanitorEnabledResponse.newBuilder().setValue(isEnabled).build();
1397   }
1398 
1399   /**
1400    * @return Maximum time we should run balancer for
1401    */
1402   private int getBalancerCutoffTime() {
1403     int balancerCutoffTime =
1404       getConfiguration().getInt("hbase.balancer.max.balancing", -1);
1405     if (balancerCutoffTime == -1) {
1406       // No time period set so create one
1407       int balancerPeriod =
1408         getConfiguration().getInt("hbase.balancer.period", 300000);
1409       balancerCutoffTime = balancerPeriod;
1410       // If nonsense period, set it to balancerPeriod
1411       if (balancerCutoffTime <= 0) balancerCutoffTime = balancerPeriod;
1412     }
1413     return balancerCutoffTime;
1414   }
1415 
1416   public boolean balance() throws HBaseIOException {
1417     // if master not initialized, don't run balancer.
1418     if (!this.initialized) {
1419       LOG.debug("Master has not been initialized, don't run balancer.");
1420       return false;
1421     }
1422     // Do this call outside of synchronized block.
1423     int maximumBalanceTime = getBalancerCutoffTime();
1424     boolean balancerRan;
1425     synchronized (this.balancer) {
1426       // If balance not true, don't run balancer.
1427       if (!this.loadBalancerTracker.isBalancerOn()) return false;
1428       // Only allow one balance run at at time.
1429       if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {
1430         Map<String, RegionState> regionsInTransition =
1431           this.assignmentManager.getRegionStates().getRegionsInTransition();
1432         LOG.debug("Not running balancer because " + regionsInTransition.size() +
1433           " region(s) in transition: " + org.apache.commons.lang.StringUtils.
1434             abbreviate(regionsInTransition.toString(), 256));
1435         return false;
1436       }
1437       if (this.serverManager.areDeadServersInProgress()) {
1438         LOG.debug("Not running balancer because processing dead regionserver(s): " +
1439           this.serverManager.getDeadServers());
1440         return false;
1441       }
1442 
1443       if (this.cpHost != null) {
1444         try {
1445           if (this.cpHost.preBalance()) {
1446             LOG.debug("Coprocessor bypassing balancer request");
1447             return false;
1448           }
1449         } catch (IOException ioe) {
1450           LOG.error("Error invoking master coprocessor preBalance()", ioe);
1451           return false;
1452         }
1453       }
1454 
1455       Map<TableName, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =
1456         this.assignmentManager.getRegionStates().getAssignmentsByTable();
1457 
1458       List<RegionPlan> plans = new ArrayList<RegionPlan>();
1459       //Give the balancer the current cluster state.
1460       this.balancer.setClusterStatus(getClusterStatus());
1461       for (Map<ServerName, List<HRegionInfo>> assignments : assignmentsByTable.values()) {
1462         List<RegionPlan> partialPlans = this.balancer.balanceCluster(assignments);
1463         if (partialPlans != null) plans.addAll(partialPlans);
1464       }
1465       long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
1466       int rpCount = 0;  // number of RegionPlans balanced so far
1467       long totalRegPlanExecTime = 0;
1468       balancerRan = plans != null;
1469       if (plans != null && !plans.isEmpty()) {
1470         for (RegionPlan plan: plans) {
1471           LOG.info("balance " + plan);
1472           long balStartTime = System.currentTimeMillis();
1473           //TODO: bulk assign
1474           this.assignmentManager.balance(plan);
1475           totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;
1476           rpCount++;
1477           if (rpCount < plans.size() &&
1478               // if performing next balance exceeds cutoff time, exit the loop
1479               (System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {
1480             //TODO: After balance, there should not be a cutoff time (keeping it as a security net for now)
1481             LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +
1482               maximumBalanceTime);
1483             break;
1484           }
1485         }
1486       }
1487       if (this.cpHost != null) {
1488         try {
1489           this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);
1490         } catch (IOException ioe) {
1491           // balancing already succeeded so don't change the result
1492           LOG.error("Error invoking master coprocessor postBalance()", ioe);
1493         }
1494       }
1495     }
1496     return balancerRan;
1497   }
1498 
1499   @Override
1500   public BalanceResponse balance(RpcController c, BalanceRequest request) throws ServiceException {
1501     try {
1502       return BalanceResponse.newBuilder().setBalancerRan(balance()).build();
1503     } catch (HBaseIOException ex) {
1504       throw new ServiceException(ex);
1505     }
1506   }
1507 
1508   enum BalanceSwitchMode {
1509     SYNC,
1510     ASYNC
1511   }
1512 
1513   /**
1514    * Assigns balancer switch according to BalanceSwitchMode
1515    * @param b new balancer switch
1516    * @param mode BalanceSwitchMode
1517    * @return old balancer switch
1518    */
1519   public boolean switchBalancer(final boolean b, BalanceSwitchMode mode) throws IOException {
1520     boolean oldValue = this.loadBalancerTracker.isBalancerOn();
1521     boolean newValue = b;
1522     try {
1523       if (this.cpHost != null) {
1524         newValue = this.cpHost.preBalanceSwitch(newValue);
1525       }
1526       try {
1527         if (mode == BalanceSwitchMode.SYNC) {
1528           synchronized (this.balancer) {
1529             this.loadBalancerTracker.setBalancerOn(newValue);
1530           }
1531         } else {
1532           this.loadBalancerTracker.setBalancerOn(newValue);
1533         }
1534       } catch (KeeperException ke) {
1535         throw new IOException(ke);
1536       }
1537       LOG.info(getClientIdAuditPrefix() + " set balanceSwitch=" + newValue);
1538       if (this.cpHost != null) {
1539         this.cpHost.postBalanceSwitch(oldValue, newValue);
1540       }
1541     } catch (IOException ioe) {
1542       LOG.warn("Error flipping balance switch", ioe);
1543     }
1544     return oldValue;
1545   }
1546 
1547   /**
1548    * @return Client info for use as prefix on an audit log string; who did an action
1549    */
1550   String getClientIdAuditPrefix() {
1551     return "Client=" + RequestContext.getRequestUserName() + "/" +
1552       RequestContext.get().getRemoteAddress();
1553   }
1554 
1555   public boolean synchronousBalanceSwitch(final boolean b) throws IOException {
1556     return switchBalancer(b, BalanceSwitchMode.SYNC);
1557   }
1558 
1559   public boolean balanceSwitch(final boolean b) throws IOException {
1560     return switchBalancer(b, BalanceSwitchMode.ASYNC);
1561   }
1562 
1563   @Override
1564   public SetBalancerRunningResponse setBalancerRunning(
1565       RpcController controller, SetBalancerRunningRequest req) throws ServiceException {
1566     try {
1567       boolean prevValue = (req.getSynchronous())?
1568         synchronousBalanceSwitch(req.getOn()):balanceSwitch(req.getOn());
1569       return SetBalancerRunningResponse.newBuilder().setPrevBalanceValue(prevValue).build();
1570     } catch (IOException ioe) {
1571       throw new ServiceException(ioe);
1572     }
1573   }
1574 
1575   /**
1576    * Switch for the background CatalogJanitor thread.
1577    * Used for testing.  The thread will continue to run.  It will just be a noop
1578    * if disabled.
1579    * @param b If false, the catalog janitor won't do anything.
1580    */
1581   public void setCatalogJanitorEnabled(final boolean b) {
1582     this.catalogJanitorChore.setEnabled(b);
1583   }
1584 
1585   @Override
1586   public DispatchMergingRegionsResponse dispatchMergingRegions(
1587       RpcController controller, DispatchMergingRegionsRequest request)
1588       throws ServiceException {
1589     final byte[] encodedNameOfRegionA = request.getRegionA().getValue()
1590         .toByteArray();
1591     final byte[] encodedNameOfRegionB = request.getRegionB().getValue()
1592         .toByteArray();
1593     final boolean forcible = request.getForcible();
1594     if (request.getRegionA().getType() != RegionSpecifierType.ENCODED_REGION_NAME
1595         || request.getRegionB().getType() != RegionSpecifierType.ENCODED_REGION_NAME) {
1596       LOG.warn("mergeRegions specifier type: expected: "
1597           + RegionSpecifierType.ENCODED_REGION_NAME + " actual: region_a="
1598           + request.getRegionA().getType() + ", region_b="
1599           + request.getRegionB().getType());
1600     }
1601     RegionState regionStateA = assignmentManager.getRegionStates()
1602         .getRegionState(Bytes.toString(encodedNameOfRegionA));
1603     RegionState regionStateB = assignmentManager.getRegionStates()
1604         .getRegionState(Bytes.toString(encodedNameOfRegionB));
1605     if (regionStateA == null || regionStateB == null) {
1606       throw new ServiceException(new UnknownRegionException(
1607           Bytes.toStringBinary(regionStateA == null ? encodedNameOfRegionA
1608               : encodedNameOfRegionB)));
1609     }
1610 
1611     if (!regionStateA.isOpened() || !regionStateB.isOpened()) {
1612       throw new ServiceException(new MergeRegionException(
1613         "Unable to merge regions not online " + regionStateA + ", " + regionStateB));
1614     }
1615 
1616     HRegionInfo regionInfoA = regionStateA.getRegion();
1617     HRegionInfo regionInfoB = regionStateB.getRegion();
1618     if (regionInfoA.compareTo(regionInfoB) == 0) {
1619       throw new ServiceException(new MergeRegionException(
1620         "Unable to merge a region to itself " + regionInfoA + ", " + regionInfoB));
1621     }
1622 
1623     if (!forcible && !HRegionInfo.areAdjacent(regionInfoA, regionInfoB)) {
1624       throw new ServiceException(new MergeRegionException(
1625         "Unable to merge not adjacent regions "
1626           + regionInfoA.getRegionNameAsString() + ", "
1627           + regionInfoB.getRegionNameAsString()
1628           + " where forcible = " + forcible));
1629     }
1630 
1631     try {
1632       dispatchMergingRegions(regionInfoA, regionInfoB, forcible);
1633     } catch (IOException ioe) {
1634       throw new ServiceException(ioe);
1635     }
1636 
1637     return DispatchMergingRegionsResponse.newBuilder().build();
1638   }
1639 
1640   @Override
1641   public void dispatchMergingRegions(final HRegionInfo region_a,
1642       final HRegionInfo region_b, final boolean forcible) throws IOException {
1643     checkInitialized();
1644     this.executorService.submit(new DispatchMergingRegionHandler(this,
1645         this.catalogJanitorChore, region_a, region_b, forcible));
1646   }
1647 
1648   @Override
1649   public MoveRegionResponse moveRegion(RpcController controller, MoveRegionRequest req)
1650   throws ServiceException {
1651     final byte [] encodedRegionName = req.getRegion().getValue().toByteArray();
1652     RegionSpecifierType type = req.getRegion().getType();
1653     final byte [] destServerName = (req.hasDestServerName())?
1654       Bytes.toBytes(ProtobufUtil.toServerName(req.getDestServerName()).getServerName()):null;
1655     MoveRegionResponse mrr = MoveRegionResponse.newBuilder().build();
1656 
1657     if (type != RegionSpecifierType.ENCODED_REGION_NAME) {
1658       LOG.warn("moveRegion specifier type: expected: " + RegionSpecifierType.ENCODED_REGION_NAME
1659         + " actual: " + type);
1660     }
1661 
1662     try {
1663       move(encodedRegionName, destServerName);
1664     } catch (HBaseIOException ioe) {
1665       throw new ServiceException(ioe);
1666     }
1667     return mrr;
1668   }
1669 
1670   void move(final byte[] encodedRegionName,
1671       final byte[] destServerName) throws HBaseIOException {
1672     RegionState regionState = assignmentManager.getRegionStates().
1673       getRegionState(Bytes.toString(encodedRegionName));
1674     if (regionState == null) {
1675       throw new UnknownRegionException(Bytes.toStringBinary(encodedRegionName));
1676     }
1677 
1678     HRegionInfo hri = regionState.getRegion();
1679     ServerName dest;
1680     if (destServerName == null || destServerName.length == 0) {
1681       LOG.info("Passed destination servername is null/empty so " +
1682         "choosing a server at random");
1683       final List<ServerName> destServers = this.serverManager.createDestinationServersList(
1684         regionState.getServerName());
1685       dest = balancer.randomAssignment(hri, destServers);
1686     } else {
1687       dest = ServerName.valueOf(Bytes.toString(destServerName));
1688       if (dest.equals(regionState.getServerName())) {
1689         LOG.debug("Skipping move of region " + hri.getRegionNameAsString()
1690           + " because region already assigned to the same server " + dest + ".");
1691         return;
1692       }
1693     }
1694 
1695     // Now we can do the move
1696     RegionPlan rp = new RegionPlan(hri, regionState.getServerName(), dest);
1697 
1698     try {
1699       checkInitialized();
1700       if (this.cpHost != null) {
1701         if (this.cpHost.preMove(hri, rp.getSource(), rp.getDestination())) {
1702           return;
1703         }
1704       }
1705       LOG.info(getClientIdAuditPrefix() + " move " + rp + ", running balancer");
1706       this.assignmentManager.balance(rp);
1707       if (this.cpHost != null) {
1708         this.cpHost.postMove(hri, rp.getSource(), rp.getDestination());
1709       }
1710     } catch (IOException ioe) {
1711       if (ioe instanceof HBaseIOException) {
1712         throw (HBaseIOException)ioe;
1713       }
1714       throw new HBaseIOException(ioe);
1715     }
1716   }
1717 
1718   @Override
1719   public void createTable(HTableDescriptor hTableDescriptor,
1720     byte [][] splitKeys)
1721   throws IOException {
1722     if (!isMasterRunning()) {
1723       throw new MasterNotRunningException();
1724     }
1725 
1726     String namespace = hTableDescriptor.getTableName().getNamespaceAsString();
1727     getNamespaceDescriptor(namespace); // ensure namespace exists
1728 
1729     HRegionInfo[] newRegions = getHRegionInfos(hTableDescriptor, splitKeys);
1730     checkInitialized();
1731     checkCompression(hTableDescriptor);
1732     if (cpHost != null) {
1733       cpHost.preCreateTable(hTableDescriptor, newRegions);
1734     }
1735     LOG.info(getClientIdAuditPrefix() + " create " + hTableDescriptor);
1736     this.executorService.submit(new CreateTableHandler(this,
1737       this.fileSystemManager, hTableDescriptor, conf,
1738       newRegions, this).prepare());
1739     if (cpHost != null) {
1740       cpHost.postCreateTable(hTableDescriptor, newRegions);
1741     }
1742 
1743   }
1744 
1745   private void checkCompression(final HTableDescriptor htd)
1746   throws IOException {
1747     if (!this.masterCheckCompression) return;
1748     for (HColumnDescriptor hcd : htd.getColumnFamilies()) {
1749       checkCompression(hcd);
1750     }
1751   }
1752 
1753   private void checkCompression(final HColumnDescriptor hcd)
1754   throws IOException {
1755     if (!this.masterCheckCompression) return;
1756     CompressionTest.testCompression(hcd.getCompression());
1757     CompressionTest.testCompression(hcd.getCompactionCompression());
1758   }
1759 
1760   @Override
1761   public CreateTableResponse createTable(RpcController controller, CreateTableRequest req)
1762   throws ServiceException {
1763     HTableDescriptor hTableDescriptor = HTableDescriptor.convert(req.getTableSchema());
1764     byte [][] splitKeys = ProtobufUtil.getSplitKeysArray(req);
1765     try {
1766       createTable(hTableDescriptor,splitKeys);
1767     } catch (IOException ioe) {
1768       throw new ServiceException(ioe);
1769     }
1770     return CreateTableResponse.newBuilder().build();
1771   }
1772 
1773   private HRegionInfo[] getHRegionInfos(HTableDescriptor hTableDescriptor,
1774     byte[][] splitKeys) {
1775     HRegionInfo[] hRegionInfos = null;
1776     if (splitKeys == null || splitKeys.length == 0) {
1777       hRegionInfos = new HRegionInfo[]{
1778           new HRegionInfo(hTableDescriptor.getTableName(), null, null)};
1779     } else {
1780       int numRegions = splitKeys.length + 1;
1781       hRegionInfos = new HRegionInfo[numRegions];
1782       byte[] startKey = null;
1783       byte[] endKey = null;
1784       for (int i = 0; i < numRegions; i++) {
1785         endKey = (i == splitKeys.length) ? null : splitKeys[i];
1786         hRegionInfos[i] =
1787             new HRegionInfo(hTableDescriptor.getTableName(), startKey, endKey);
1788         startKey = endKey;
1789       }
1790     }
1791     return hRegionInfos;
1792   }
1793 
1794   private static boolean isCatalogTable(final TableName tableName) {
1795     return tableName.equals(TableName.META_TABLE_NAME);
1796   }
1797 
1798   @Override
1799   public void deleteTable(final TableName tableName) throws IOException {
1800     checkInitialized();
1801     if (cpHost != null) {
1802       cpHost.preDeleteTable(tableName);
1803     }
1804     LOG.info(getClientIdAuditPrefix() + " delete " + tableName);
1805     this.executorService.submit(new DeleteTableHandler(tableName, this, this).prepare());
1806     if (cpHost != null) {
1807       cpHost.postDeleteTable(tableName);
1808     }
1809   }
1810 
1811   @Override
1812   public DeleteTableResponse deleteTable(RpcController controller, DeleteTableRequest request)
1813   throws ServiceException {
1814     try {
1815       deleteTable(ProtobufUtil.toTableName(request.getTableName()));
1816     } catch (IOException ioe) {
1817       throw new ServiceException(ioe);
1818     }
1819     return DeleteTableResponse.newBuilder().build();
1820   }
1821 
1822   /**
1823    * Get the number of regions of the table that have been updated by the alter.
1824    *
1825    * @return Pair indicating the number of regions updated Pair.getFirst is the
1826    *         regions that are yet to be updated Pair.getSecond is the total number
1827    *         of regions of the table
1828    * @throws IOException
1829    */
1830   @Override
1831   public GetSchemaAlterStatusResponse getSchemaAlterStatus(
1832       RpcController controller, GetSchemaAlterStatusRequest req) throws ServiceException {
1833     // TODO: currently, we query using the table name on the client side. this
1834     // may overlap with other table operations or the table operation may
1835     // have completed before querying this API. We need to refactor to a
1836     // transaction system in the future to avoid these ambiguities.
1837     TableName tableName = ProtobufUtil.toTableName(req.getTableName());
1838 
1839     try {
1840       Pair<Integer,Integer> pair = this.assignmentManager.getReopenStatus(tableName);
1841       GetSchemaAlterStatusResponse.Builder ret = GetSchemaAlterStatusResponse.newBuilder();
1842       ret.setYetToUpdateRegions(pair.getFirst());
1843       ret.setTotalRegions(pair.getSecond());
1844       return ret.build();
1845     } catch (IOException ioe) {
1846       throw new ServiceException(ioe);
1847     }
1848   }
1849 
1850   @Override
1851   public void addColumn(final TableName tableName, final HColumnDescriptor column)
1852       throws IOException {
1853     checkInitialized();
1854     if (cpHost != null) {
1855       if (cpHost.preAddColumn(tableName, column)) {
1856         return;
1857       }
1858     }
1859     //TODO: we should process this (and some others) in an executor
1860     new TableAddFamilyHandler(tableName, column, this, this).prepare().process();
1861     if (cpHost != null) {
1862       cpHost.postAddColumn(tableName, column);
1863     }
1864   }
1865 
1866   @Override
1867   public AddColumnResponse addColumn(RpcController controller, AddColumnRequest req)
1868   throws ServiceException {
1869     try {
1870       addColumn(ProtobufUtil.toTableName(req.getTableName()),
1871         HColumnDescriptor.convert(req.getColumnFamilies()));
1872     } catch (IOException ioe) {
1873       throw new ServiceException(ioe);
1874     }
1875     return AddColumnResponse.newBuilder().build();
1876   }
1877 
1878   @Override
1879   public void modifyColumn(TableName tableName, HColumnDescriptor descriptor)
1880       throws IOException {
1881     checkInitialized();
1882     checkCompression(descriptor);
1883     if (cpHost != null) {
1884       if (cpHost.preModifyColumn(tableName, descriptor)) {
1885         return;
1886       }
1887     }
1888     LOG.info(getClientIdAuditPrefix() + " modify " + descriptor);
1889     new TableModifyFamilyHandler(tableName, descriptor, this, this)
1890       .prepare().process();
1891     if (cpHost != null) {
1892       cpHost.postModifyColumn(tableName, descriptor);
1893     }
1894   }
1895 
1896   @Override
1897   public ModifyColumnResponse modifyColumn(RpcController controller, ModifyColumnRequest req)
1898   throws ServiceException {
1899     try {
1900       modifyColumn(ProtobufUtil.toTableName(req.getTableName()),
1901         HColumnDescriptor.convert(req.getColumnFamilies()));
1902     } catch (IOException ioe) {
1903       throw new ServiceException(ioe);
1904     }
1905     return ModifyColumnResponse.newBuilder().build();
1906   }
1907 
1908   @Override
1909   public void deleteColumn(final TableName tableName, final byte[] columnName)
1910       throws IOException {
1911     checkInitialized();
1912     if (cpHost != null) {
1913       if (cpHost.preDeleteColumn(tableName, columnName)) {
1914         return;
1915       }
1916     }
1917     LOG.info(getClientIdAuditPrefix() + " delete " + Bytes.toString(columnName));
1918     new TableDeleteFamilyHandler(tableName, columnName, this, this).prepare().process();
1919     if (cpHost != null) {
1920       cpHost.postDeleteColumn(tableName, columnName);
1921     }
1922   }
1923 
1924   @Override
1925   public DeleteColumnResponse deleteColumn(RpcController controller, DeleteColumnRequest req)
1926   throws ServiceException {
1927     try {
1928       deleteColumn(ProtobufUtil.toTableName(req.getTableName()),
1929           req.getColumnName().toByteArray());
1930     } catch (IOException ioe) {
1931       throw new ServiceException(ioe);
1932     }
1933     return DeleteColumnResponse.newBuilder().build();
1934   }
1935 
1936   @Override
1937   public void enableTable(final TableName tableName) throws IOException {
1938     checkInitialized();
1939     if (cpHost != null) {
1940       cpHost.preEnableTable(tableName);
1941     }
1942     LOG.info(getClientIdAuditPrefix() + " enable " + tableName);
1943     this.executorService.submit(new EnableTableHandler(this, tableName,
1944       catalogTracker, assignmentManager, tableLockManager, false).prepare());
1945     if (cpHost != null) {
1946       cpHost.postEnableTable(tableName);
1947    }
1948   }
1949 
1950   @Override
1951   public EnableTableResponse enableTable(RpcController controller, EnableTableRequest request)
1952   throws ServiceException {
1953     try {
1954       enableTable(ProtobufUtil.toTableName(request.getTableName()));
1955     } catch (IOException ioe) {
1956       throw new ServiceException(ioe);
1957     }
1958     return EnableTableResponse.newBuilder().build();
1959   }
1960 
1961   @Override
1962   public void disableTable(final TableName tableName) throws IOException {
1963     checkInitialized();
1964     if (cpHost != null) {
1965       cpHost.preDisableTable(tableName);
1966     }
1967     LOG.info(getClientIdAuditPrefix() + " disable " + tableName);
1968     this.executorService.submit(new DisableTableHandler(this, tableName,
1969       catalogTracker, assignmentManager, tableLockManager, false).prepare());
1970     if (cpHost != null) {
1971       cpHost.postDisableTable(tableName);
1972     }
1973   }
1974 
1975   @Override
1976   public DisableTableResponse disableTable(RpcController controller, DisableTableRequest request)
1977   throws ServiceException {
1978     try {
1979       disableTable(ProtobufUtil.toTableName(request.getTableName()));
1980     } catch (IOException ioe) {
1981       throw new ServiceException(ioe);
1982     }
1983     return DisableTableResponse.newBuilder().build();
1984   }
1985 
1986   /**
1987    * Return the region and current deployment for the region containing
1988    * the given row. If the region cannot be found, returns null. If it
1989    * is found, but not currently deployed, the second element of the pair
1990    * may be null.
1991    */
1992   Pair<HRegionInfo, ServerName> getTableRegionForRow(
1993       final TableName tableName, final byte [] rowKey)
1994   throws IOException {
1995     final AtomicReference<Pair<HRegionInfo, ServerName>> result =
1996       new AtomicReference<Pair<HRegionInfo, ServerName>>(null);
1997 
1998     MetaScannerVisitor visitor =
1999       new MetaScannerVisitorBase() {
2000         @Override
2001         public boolean processRow(Result data) throws IOException {
2002           if (data == null || data.size() <= 0) {
2003             return true;
2004           }
2005           Pair<HRegionInfo, ServerName> pair = HRegionInfo.getHRegionInfoAndServerName(data);
2006           if (pair == null) {
2007             return false;
2008           }
2009           if (!pair.getFirst().getTable().equals(tableName)) {
2010             return false;
2011           }
2012           result.set(pair);
2013           return true;
2014         }
2015     };
2016 
2017     MetaScanner.metaScan(conf, visitor, tableName, rowKey, 1);
2018     return result.get();
2019   }
2020 
2021   @Override
2022   public void modifyTable(final TableName tableName, final HTableDescriptor descriptor)
2023       throws IOException {
2024     checkInitialized();
2025     checkCompression(descriptor);
2026     if (cpHost != null) {
2027       cpHost.preModifyTable(tableName, descriptor);
2028     }
2029     LOG.info(getClientIdAuditPrefix() + " modify " + tableName);
2030     new ModifyTableHandler(tableName, descriptor, this, this).prepare().process();
2031     if (cpHost != null) {
2032       cpHost.postModifyTable(tableName, descriptor);
2033     }
2034   }
2035 
2036   @Override
2037   public ModifyTableResponse modifyTable(RpcController controller, ModifyTableRequest req)
2038   throws ServiceException {
2039     try {
2040       modifyTable(ProtobufUtil.toTableName(req.getTableName()),
2041         HTableDescriptor.convert(req.getTableSchema()));
2042     } catch (IOException ioe) {
2043       throw new ServiceException(ioe);
2044     }
2045     return ModifyTableResponse.newBuilder().build();
2046   }
2047 
2048   @Override
2049   public void checkTableModifiable(final TableName tableName)
2050       throws IOException, TableNotFoundException, TableNotDisabledException {
2051     if (isCatalogTable(tableName)) {
2052       throw new IOException("Can't modify catalog tables");
2053     }
2054     if (!MetaReader.tableExists(getCatalogTracker(), tableName)) {
2055       throw new TableNotFoundException(tableName);
2056     }
2057     if (!getAssignmentManager().getZKTable().
2058         isDisabledTable(tableName)) {
2059       throw new TableNotDisabledException(tableName);
2060     }
2061   }
2062 
2063   @Override
2064   public GetClusterStatusResponse getClusterStatus(RpcController controller,
2065       GetClusterStatusRequest req)
2066   throws ServiceException {
2067     GetClusterStatusResponse.Builder response = GetClusterStatusResponse.newBuilder();
2068     response.setClusterStatus(getClusterStatus().convert());
2069     return response.build();
2070   }
2071 
2072   /**
2073    * @return cluster status
2074    */
2075   public ClusterStatus getClusterStatus() {
2076     // Build Set of backup masters from ZK nodes
2077     List<String> backupMasterStrings;
2078     try {
2079       backupMasterStrings = ZKUtil.listChildrenNoWatch(this.zooKeeper,
2080         this.zooKeeper.backupMasterAddressesZNode);
2081     } catch (KeeperException e) {
2082       LOG.warn(this.zooKeeper.prefix("Unable to list backup servers"), e);
2083       backupMasterStrings = new ArrayList<String>(0);
2084     }
2085     List<ServerName> backupMasters = new ArrayList<ServerName>(
2086                                           backupMasterStrings.size());
2087     for (String s: backupMasterStrings) {
2088       try {
2089         byte [] bytes =
2090             ZKUtil.getData(this.zooKeeper, ZKUtil.joinZNode(
2091                 this.zooKeeper.backupMasterAddressesZNode, s));
2092         if (bytes != null) {
2093           ServerName sn;
2094           try {
2095             sn = ServerName.parseFrom(bytes);
2096           } catch (DeserializationException e) {
2097             LOG.warn("Failed parse, skipping registering backup server", e);
2098             continue;
2099           }
2100           backupMasters.add(sn);
2101         }
2102       } catch (KeeperException e) {
2103         LOG.warn(this.zooKeeper.prefix("Unable to get information about " +
2104                  "backup servers"), e);
2105       }
2106     }
2107     Collections.sort(backupMasters, new Comparator<ServerName>() {
2108       @Override
2109       public int compare(ServerName s1, ServerName s2) {
2110         return s1.getServerName().compareTo(s2.getServerName());
2111       }});
2112 
2113     return new ClusterStatus(VersionInfo.getVersion(),
2114       this.fileSystemManager.getClusterId().toString(),
2115       this.serverManager.getOnlineServers(),
2116       this.serverManager.getDeadServers().copyServerNames(),
2117       this.serverName,
2118       backupMasters,
2119       this.assignmentManager.getRegionStates().getRegionsInTransition(),
2120       this.getCoprocessors(), this.loadBalancerTracker.isBalancerOn());
2121   }
2122 
2123   public String getClusterId() {
2124     if (fileSystemManager == null) {
2125       return "";
2126     }
2127     ClusterId id = fileSystemManager.getClusterId();
2128     if (id == null) {
2129       return "";
2130     }
2131     return id.toString();
2132   }
2133 
2134   /**
2135    * The set of loaded coprocessors is stored in a static set. Since it's
2136    * statically allocated, it does not require that HMaster's cpHost be
2137    * initialized prior to accessing it.
2138    * @return a String representation of the set of names of the loaded
2139    * coprocessors.
2140    */
2141   public static String getLoadedCoprocessors() {
2142     return CoprocessorHost.getLoadedCoprocessors().toString();
2143   }
2144 
2145   /**
2146    * @return timestamp in millis when HMaster was started.
2147    */
2148   public long getMasterStartTime() {
2149     return masterStartTime;
2150   }
2151 
2152   /**
2153    * @return timestamp in millis when HMaster became the active master.
2154    */
2155   public long getMasterActiveTime() {
2156     return masterActiveTime;
2157   }
2158 
2159   public int getRegionServerInfoPort(final ServerName sn) {
2160     RegionServerInfo info = this.regionServerTracker.getRegionServerInfo(sn);
2161     if (info == null || info.getInfoPort() == 0) {
2162       return conf.getInt(HConstants.REGIONSERVER_INFO_PORT,
2163         HConstants.DEFAULT_REGIONSERVER_INFOPORT);
2164     }
2165     return info.getInfoPort();
2166   }
2167 
2168   /**
2169    * @return array of coprocessor SimpleNames.
2170    */
2171   public String[] getCoprocessors() {
2172     Set<String> masterCoprocessors =
2173         getCoprocessorHost().getCoprocessors();
2174     return masterCoprocessors.toArray(new String[masterCoprocessors.size()]);
2175   }
2176 
2177   @Override
2178   public void abort(final String msg, final Throwable t) {
2179     if (cpHost != null) {
2180       // HBASE-4014: dump a list of loaded coprocessors.
2181       LOG.fatal("Master server abort: loaded coprocessors are: " +
2182           getLoadedCoprocessors());
2183     }
2184 
2185     if (abortNow(msg, t)) {
2186       if (t != null) LOG.fatal(msg, t);
2187       else LOG.fatal(msg);
2188       this.abort = true;
2189       stop("Aborting");
2190     }
2191   }
2192 
2193   /**
2194    * We do the following in a different thread.  If it is not completed
2195    * in time, we will time it out and assume it is not easy to recover.
2196    *
2197    * 1. Create a new ZK session. (since our current one is expired)
2198    * 2. Try to become a primary master again
2199    * 3. Initialize all ZK based system trackers.
2200    * 4. Assign meta. (they are already assigned, but we need to update our
2201    * internal memory state to reflect it)
2202    * 5. Process any RIT if any during the process of our recovery.
2203    *
2204    * @return True if we could successfully recover from ZK session expiry.
2205    * @throws InterruptedException
2206    * @throws IOException
2207    * @throws KeeperException
2208    * @throws ExecutionException
2209    */
2210   private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
2211       IOException, KeeperException, ExecutionException {
2212 
2213     this.zooKeeper.unregisterAllListeners();
2214     // add back listeners which were registered before master initialization
2215     // because they won't be added back in below Master re-initialization code
2216     if (this.registeredZKListenersBeforeRecovery != null) {
2217       for (ZooKeeperListener curListener : this.registeredZKListenersBeforeRecovery) {
2218         this.zooKeeper.registerListener(curListener);
2219       }
2220     }
2221 
2222     this.zooKeeper.reconnectAfterExpiration();
2223 
2224     Callable<Boolean> callable = new Callable<Boolean> () {
2225       @Override
2226       public Boolean call() throws InterruptedException,
2227           IOException, KeeperException {
2228         MonitoredTask status =
2229           TaskMonitor.get().createStatus("Recovering expired ZK session");
2230         try {
2231           if (!becomeActiveMaster(status)) {
2232             return Boolean.FALSE;
2233           }
2234           serverShutdownHandlerEnabled = false;
2235           initialized = false;
2236           finishInitialization(status, true);
2237           return !stopped;
2238         } finally {
2239           status.cleanup();
2240         }
2241       }
2242     };
2243 
2244     long timeout =
2245       conf.getLong("hbase.master.zksession.recover.timeout", 300000);
2246     java.util.concurrent.ExecutorService executor =
2247       Executors.newSingleThreadExecutor();
2248     Future<Boolean> result = executor.submit(callable);
2249     executor.shutdown();
2250     if (executor.awaitTermination(timeout, TimeUnit.MILLISECONDS)
2251         && result.isDone()) {
2252       Boolean recovered = result.get();
2253       if (recovered != null) {
2254         return recovered.booleanValue();
2255       }
2256     }
2257     executor.shutdownNow();
2258     return false;
2259   }
2260 
2261   /**
2262    * Check to see if the current trigger for abort is due to ZooKeeper session
2263    * expiry, and If yes, whether we can recover from ZK session expiry.
2264    *
2265    * @param msg Original abort message
2266    * @param t   The cause for current abort request
2267    * @return true if we should proceed with abort operation, false other wise.
2268    */
2269   private boolean abortNow(final String msg, final Throwable t) {
2270     if (!this.isActiveMaster || this.stopped) {
2271       return true;
2272     }
2273 
2274     boolean failFast = conf.getBoolean("fail.fast.expired.active.master", false);
2275     if (t != null && t instanceof KeeperException.SessionExpiredException
2276         && !failFast) {
2277       try {
2278         LOG.info("Primary Master trying to recover from ZooKeeper session " +
2279             "expiry.");
2280         return !tryRecoveringExpiredZKSession();
2281       } catch (Throwable newT) {
2282         LOG.error("Primary master encountered unexpected exception while " +
2283             "trying to recover from ZooKeeper session" +
2284             " expiry. Proceeding with server abort.", newT);
2285       }
2286     }
2287     return true;
2288   }
2289 
2290   @Override
2291   public ZooKeeperWatcher getZooKeeper() {
2292     return zooKeeper;
2293   }
2294 
2295   @Override
2296   public MasterCoprocessorHost getCoprocessorHost() {
2297     return cpHost;
2298   }
2299 
2300   @Override
2301   public ServerName getServerName() {
2302     return this.serverName;
2303   }
2304 
2305   @Override
2306   public CatalogTracker getCatalogTracker() {
2307     return catalogTracker;
2308   }
2309 
2310   @Override
2311   public AssignmentManager getAssignmentManager() {
2312     return this.assignmentManager;
2313   }
2314 
2315   @Override
2316   public TableLockManager getTableLockManager() {
2317     return this.tableLockManager;
2318   }
2319 
2320   public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
2321     return rsFatals;
2322   }
2323 
2324   public void shutdown() {
2325     if (spanReceiverHost != null) {
2326       spanReceiverHost.closeReceivers();
2327     }
2328     if (cpHost != null) {
2329       try {
2330         cpHost.preShutdown();
2331       } catch (IOException ioe) {
2332         LOG.error("Error call master coprocessor preShutdown()", ioe);
2333       }
2334     }
2335     if (mxBean != null) {
2336       MBeanUtil.unregisterMBean(mxBean);
2337       mxBean = null;
2338     }
2339     if (this.assignmentManager != null) this.assignmentManager.shutdown();
2340     if (this.serverManager != null) this.serverManager.shutdownCluster();
2341     try {
2342       if (this.clusterStatusTracker != null){
2343         this.clusterStatusTracker.setClusterDown();
2344       }
2345     } catch (KeeperException e) {
2346       LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
2347     }
2348   }
2349 
2350   @Override
2351   public ShutdownResponse shutdown(RpcController controller, ShutdownRequest request)
2352   throws ServiceException {
2353     LOG.info(getClientIdAuditPrefix() + " shutdown");
2354     shutdown();
2355     return ShutdownResponse.newBuilder().build();
2356   }
2357 
2358   public void stopMaster() {
2359     if (cpHost != null) {
2360       try {
2361         cpHost.preStopMaster();
2362       } catch (IOException ioe) {
2363         LOG.error("Error call master coprocessor preStopMaster()", ioe);
2364       }
2365     }
2366     stop("Stopped by " + Thread.currentThread().getName());
2367   }
2368 
2369   @Override
2370   public StopMasterResponse stopMaster(RpcController controller, StopMasterRequest request)
2371   throws ServiceException {
2372     LOG.info(getClientIdAuditPrefix() + " stop");
2373     stopMaster();
2374     return StopMasterResponse.newBuilder().build();
2375   }
2376 
2377   @Override
2378   public void stop(final String why) {
2379     LOG.info(why);
2380     this.stopped = true;
2381     // We wake up the stopSleeper to stop immediately
2382     stopSleeper.skipSleepCycle();
2383     // If we are a backup master, we need to interrupt wait
2384     if (this.activeMasterManager != null) {
2385       synchronized (this.activeMasterManager.clusterHasActiveMaster) {
2386         this.activeMasterManager.clusterHasActiveMaster.notifyAll();
2387       }
2388     }
2389     // If no region server is online then master may stuck waiting on hbase:meta to come on line.
2390     // See HBASE-8422.
2391     if (this.catalogTracker != null && this.serverManager.getOnlineServers().isEmpty()) {
2392       this.catalogTracker.stop();
2393     }
2394   }
2395 
2396   @Override
2397   public boolean isStopped() {
2398     return this.stopped;
2399   }
2400 
2401   @Override
2402   public boolean isAborted() {
2403     return this.abort;
2404   }
2405 
2406   void checkInitialized() throws PleaseHoldException {
2407     if (!this.initialized) {
2408       throw new PleaseHoldException("Master is initializing");
2409     }
2410   }
2411 
2412   /**
2413    * Report whether this master is currently the active master or not.
2414    * If not active master, we are parked on ZK waiting to become active.
2415    *
2416    * This method is used for testing.
2417    *
2418    * @return true if active master, false if not.
2419    */
2420   public boolean isActiveMaster() {
2421     return isActiveMaster;
2422   }
2423 
2424   /**
2425    * Report whether this master has completed with its initialization and is
2426    * ready.  If ready, the master is also the active master.  A standby master
2427    * is never ready.
2428    *
2429    * This method is used for testing.
2430    *
2431    * @return true if master is ready to go, false if not.
2432    */
2433   @Override
2434   public boolean isInitialized() {
2435     return initialized;
2436   }
2437 
2438   /**
2439    * ServerShutdownHandlerEnabled is set false before completing
2440    * assignMeta to prevent processing of ServerShutdownHandler.
2441    * @return true if assignMeta has completed;
2442    */
2443   @Override
2444   public boolean isServerShutdownHandlerEnabled() {
2445     return this.serverShutdownHandlerEnabled;
2446   }
2447 
2448   /**
2449    * Report whether this master has started initialization and is about to do meta region assignment
2450    * @return true if master is in initialization & about to assign hbase:meta regions
2451    */
2452   public boolean isInitializationStartsMetaRegionAssignment() {
2453     return this.initializationBeforeMetaAssignment;
2454   }
2455 
2456   @Override
2457   public AssignRegionResponse assignRegion(RpcController controller, AssignRegionRequest req)
2458   throws ServiceException {
2459     try {
2460       final byte [] regionName = req.getRegion().getValue().toByteArray();
2461       RegionSpecifierType type = req.getRegion().getType();
2462       AssignRegionResponse arr = AssignRegionResponse.newBuilder().build();
2463 
2464       checkInitialized();
2465       if (type != RegionSpecifierType.REGION_NAME) {
2466         LOG.warn("assignRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2467           + " actual: " + type);
2468       }
2469       HRegionInfo regionInfo = assignmentManager.getRegionStates().getRegionInfo(regionName);
2470       if (regionInfo == null) throw new UnknownRegionException(Bytes.toString(regionName));
2471       if (cpHost != null) {
2472         if (cpHost.preAssign(regionInfo)) {
2473           return arr;
2474         }
2475       }
2476       LOG.info(getClientIdAuditPrefix() + " assign " + regionInfo.getRegionNameAsString());
2477       assignmentManager.assign(regionInfo, true, true);
2478       if (cpHost != null) {
2479         cpHost.postAssign(regionInfo);
2480       }
2481 
2482       return arr;
2483     } catch (IOException ioe) {
2484       throw new ServiceException(ioe);
2485     }
2486   }
2487 
2488   public void assignRegion(HRegionInfo hri) {
2489     assignmentManager.assign(hri, true);
2490   }
2491 
2492   @Override
2493   public UnassignRegionResponse unassignRegion(RpcController controller, UnassignRegionRequest req)
2494   throws ServiceException {
2495     try {
2496       final byte [] regionName = req.getRegion().getValue().toByteArray();
2497       RegionSpecifierType type = req.getRegion().getType();
2498       final boolean force = req.getForce();
2499       UnassignRegionResponse urr = UnassignRegionResponse.newBuilder().build();
2500 
2501       checkInitialized();
2502       if (type != RegionSpecifierType.REGION_NAME) {
2503         LOG.warn("unassignRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2504           + " actual: " + type);
2505       }
2506       Pair<HRegionInfo, ServerName> pair =
2507         MetaReader.getRegion(this.catalogTracker, regionName);
2508       if (pair == null) throw new UnknownRegionException(Bytes.toString(regionName));
2509       HRegionInfo hri = pair.getFirst();
2510       if (cpHost != null) {
2511         if (cpHost.preUnassign(hri, force)) {
2512           return urr;
2513         }
2514       }
2515       LOG.debug(getClientIdAuditPrefix() + " unassign " + hri.getRegionNameAsString()
2516           + " in current location if it is online and reassign.force=" + force);
2517       this.assignmentManager.unassign(hri, force);
2518       if (this.assignmentManager.getRegionStates().isRegionOffline(hri)) {
2519         LOG.debug("Region " + hri.getRegionNameAsString()
2520             + " is not online on any region server, reassigning it.");
2521         assignRegion(hri);
2522       }
2523       if (cpHost != null) {
2524         cpHost.postUnassign(hri, force);
2525       }
2526 
2527       return urr;
2528     } catch (IOException ioe) {
2529       throw new ServiceException(ioe);
2530     }
2531   }
2532 
2533   /**
2534    * Get list of TableDescriptors for requested tables.
2535    * @param controller Unused (set to null).
2536    * @param req GetTableDescriptorsRequest that contains:
2537    * - tableNames: requested tables, or if empty, all are requested
2538    * @return GetTableDescriptorsResponse
2539    * @throws ServiceException
2540    */
2541   @Override
2542   public GetTableDescriptorsResponse getTableDescriptors(
2543 	      RpcController controller, GetTableDescriptorsRequest req) throws ServiceException {
2544     List<HTableDescriptor> descriptors = new ArrayList<HTableDescriptor>();
2545     List<TableName> tableNameList = new ArrayList<TableName>();
2546     for(HBaseProtos.TableName tableNamePB: req.getTableNamesList()) {
2547       tableNameList.add(ProtobufUtil.toTableName(tableNamePB));
2548     }
2549     boolean bypass = false;
2550     if (this.cpHost != null) {
2551       try {
2552         bypass = this.cpHost.preGetTableDescriptors(tableNameList, descriptors);
2553       } catch (IOException ioe) {
2554         throw new ServiceException(ioe);
2555       }
2556     }
2557 
2558     if (!bypass) {
2559       if (req.getTableNamesCount() == 0) {
2560         // request for all TableDescriptors
2561         Map<String, HTableDescriptor> descriptorMap = null;
2562         try {
2563           descriptorMap = this.tableDescriptors.getAll();
2564         } catch (IOException e) {
2565           LOG.warn("Failed getting all descriptors", e);
2566         }
2567         if (descriptorMap != null) {
2568           for(HTableDescriptor desc: descriptorMap.values()) {
2569             if(!desc.getTableName().isSystemTable()) {
2570               descriptors.add(desc);
2571             }
2572           }
2573         }
2574       } else {
2575         for (TableName s: tableNameList) {
2576           try {
2577             HTableDescriptor desc = this.tableDescriptors.get(s);
2578             if (desc != null) {
2579               descriptors.add(desc);
2580             }
2581           } catch (IOException e) {
2582             LOG.warn("Failed getting descriptor for " + s, e);
2583           }
2584         }
2585       }
2586 
2587       if (this.cpHost != null) {
2588         try {
2589           this.cpHost.postGetTableDescriptors(descriptors);
2590         } catch (IOException ioe) {
2591           throw new ServiceException(ioe);
2592         }
2593       }
2594     }
2595 
2596     GetTableDescriptorsResponse.Builder builder = GetTableDescriptorsResponse.newBuilder();
2597     for (HTableDescriptor htd: descriptors) {
2598       builder.addTableSchema(htd.convert());
2599     }
2600     return builder.build();
2601   }
2602 
2603   /**
2604    * Get list of userspace table names
2605    * @param controller Unused (set to null).
2606    * @param req GetTableNamesRequest
2607    * @return GetTableNamesResponse
2608    * @throws ServiceException
2609    */
2610   @Override
2611   public GetTableNamesResponse getTableNames(
2612         RpcController controller, GetTableNamesRequest req) throws ServiceException {
2613     try {
2614       Collection<HTableDescriptor> descriptors = this.tableDescriptors.getAll().values();
2615       GetTableNamesResponse.Builder builder = GetTableNamesResponse.newBuilder();
2616       for (HTableDescriptor descriptor: descriptors) {
2617         if (descriptor.getTableName().isSystemTable()) {
2618           continue;
2619         }
2620         builder.addTableNames(ProtobufUtil.toProtoTableName(descriptor.getTableName()));
2621       }
2622       return builder.build();
2623     } catch (IOException e) {
2624       throw new ServiceException(e);
2625     }
2626   }
2627 
2628   /**
2629    * Compute the average load across all region servers.
2630    * Currently, this uses a very naive computation - just uses the number of
2631    * regions being served, ignoring stats about number of requests.
2632    * @return the average load
2633    */
2634   public double getAverageLoad() {
2635     if (this.assignmentManager == null) {
2636       return 0;
2637     }
2638 
2639     RegionStates regionStates = this.assignmentManager.getRegionStates();
2640     if (regionStates == null) {
2641       return 0;
2642     }
2643     return regionStates.getAverageLoad();
2644   }
2645 
2646   /**
2647    * Offline specified region from master's in-memory state. It will not attempt to
2648    * reassign the region as in unassign.
2649    *
2650    * This is a special method that should be used by experts or hbck.
2651    *
2652    */
2653   @Override
2654   public OfflineRegionResponse offlineRegion(RpcController controller, OfflineRegionRequest request)
2655   throws ServiceException {
2656     final byte [] regionName = request.getRegion().getValue().toByteArray();
2657     RegionSpecifierType type = request.getRegion().getType();
2658     if (type != RegionSpecifierType.REGION_NAME) {
2659       LOG.warn("moveRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2660         + " actual: " + type);
2661     }
2662 
2663     try {
2664       Pair<HRegionInfo, ServerName> pair =
2665         MetaReader.getRegion(this.catalogTracker, regionName);
2666       if (pair == null) throw new UnknownRegionException(Bytes.toStringBinary(regionName));
2667       HRegionInfo hri = pair.getFirst();
2668       if (cpHost != null) {
2669         cpHost.preRegionOffline(hri);
2670       }
2671       LOG.info(getClientIdAuditPrefix() + " offline " + hri.getRegionNameAsString());
2672       this.assignmentManager.regionOffline(hri);
2673       if (cpHost != null) {
2674         cpHost.postRegionOffline(hri);
2675       }
2676     } catch (IOException ioe) {
2677       throw new ServiceException(ioe);
2678     }
2679     return OfflineRegionResponse.newBuilder().build();
2680   }
2681 
2682   @Override
2683   public boolean registerService(Service instance) {
2684     /*
2685      * No stacking of instances is allowed for a single service name
2686      */
2687     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
2688     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
2689       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
2690           " already registered, rejecting request from "+instance
2691       );
2692       return false;
2693     }
2694 
2695     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
2696     if (LOG.isDebugEnabled()) {
2697       LOG.debug("Registered master coprocessor service: service="+serviceDesc.getFullName());
2698     }
2699     return true;
2700   }
2701 
2702   @Override
2703   public ClientProtos.CoprocessorServiceResponse execMasterService(final RpcController controller,
2704       final ClientProtos.CoprocessorServiceRequest request) throws ServiceException {
2705     try {
2706       ServerRpcController execController = new ServerRpcController();
2707 
2708       ClientProtos.CoprocessorServiceCall call = request.getCall();
2709       String serviceName = call.getServiceName();
2710       String methodName = call.getMethodName();
2711       if (!coprocessorServiceHandlers.containsKey(serviceName)) {
2712         throw new UnknownProtocolException(null,
2713             "No registered master coprocessor service found for name "+serviceName);
2714       }
2715 
2716       Service service = coprocessorServiceHandlers.get(serviceName);
2717       Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
2718       Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
2719       if (methodDesc == null) {
2720         throw new UnknownProtocolException(service.getClass(),
2721             "Unknown method "+methodName+" called on master service "+serviceName);
2722       }
2723 
2724       //invoke the method
2725       Message execRequest = service.getRequestPrototype(methodDesc).newBuilderForType()
2726           .mergeFrom(call.getRequest()).build();
2727       final Message.Builder responseBuilder =
2728           service.getResponsePrototype(methodDesc).newBuilderForType();
2729       service.callMethod(methodDesc, execController, execRequest, new RpcCallback<Message>() {
2730         @Override
2731         public void run(Message message) {
2732           if (message != null) {
2733             responseBuilder.mergeFrom(message);
2734           }
2735         }
2736       });
2737       Message execResult = responseBuilder.build();
2738 
2739       if (execController.getFailedOn() != null) {
2740         throw execController.getFailedOn();
2741       }
2742       ClientProtos.CoprocessorServiceResponse.Builder builder =
2743           ClientProtos.CoprocessorServiceResponse.newBuilder();
2744       builder.setRegion(RequestConverter.buildRegionSpecifier(
2745           RegionSpecifierType.REGION_NAME, HConstants.EMPTY_BYTE_ARRAY));
2746       builder.setValue(
2747           builder.getValueBuilder().setName(execResult.getClass().getName())
2748               .setValue(execResult.toByteString()));
2749       return builder.build();
2750     } catch (IOException ie) {
2751       throw new ServiceException(ie);
2752     }
2753   }
2754 
2755   /**
2756    * Utility for constructing an instance of the passed HMaster class.
2757    * @param masterClass
2758    * @param conf
2759    * @return HMaster instance.
2760    */
2761   public static HMaster constructMaster(Class<? extends HMaster> masterClass,
2762       final Configuration conf)  {
2763     try {
2764       Constructor<? extends HMaster> c =
2765         masterClass.getConstructor(Configuration.class);
2766       return c.newInstance(conf);
2767     } catch (InvocationTargetException ite) {
2768       Throwable target = ite.getTargetException() != null?
2769         ite.getTargetException(): ite;
2770       if (target.getCause() != null) target = target.getCause();
2771       throw new RuntimeException("Failed construction of Master: " +
2772         masterClass.toString(), target);
2773     } catch (Exception e) {
2774       throw new RuntimeException("Failed construction of Master: " +
2775         masterClass.toString() + ((e.getCause() != null)?
2776           e.getCause().getMessage(): ""), e);
2777     }
2778   }
2779 
2780   /**
2781    * @see org.apache.hadoop.hbase.master.HMasterCommandLine
2782    */
2783   public static void main(String [] args) {
2784     VersionInfo.logVersion();
2785     new HMasterCommandLine(HMaster.class).doMain(args);
2786   }
2787 
2788   public HFileCleaner getHFileCleaner() {
2789     return this.hfileCleaner;
2790   }
2791 
2792   /**
2793    * Exposed for TESTING!
2794    * @return the underlying snapshot manager
2795    */
2796   public SnapshotManager getSnapshotManagerForTesting() {
2797     return this.snapshotManager;
2798   }
2799 
2800   /**
2801    * Triggers an asynchronous attempt to take a snapshot.
2802    * {@inheritDoc}
2803    */
2804   @Override
2805   public SnapshotResponse snapshot(RpcController controller, SnapshotRequest request)
2806       throws ServiceException {
2807     try {
2808       this.snapshotManager.checkSnapshotSupport();
2809     } catch (UnsupportedOperationException e) {
2810       throw new ServiceException(e);
2811     }
2812 
2813     LOG.info(getClientIdAuditPrefix() + " snapshot request for:" +
2814         ClientSnapshotDescriptionUtils.toString(request.getSnapshot()));
2815     // get the snapshot information
2816     SnapshotDescription snapshot = SnapshotDescriptionUtils.validate(request.getSnapshot(),
2817       this.conf);
2818     try {
2819       snapshotManager.takeSnapshot(snapshot);
2820     } catch (IOException e) {
2821       throw new ServiceException(e);
2822     }
2823 
2824     // send back the max amount of time the client should wait for the snapshot to complete
2825     long waitTime = SnapshotDescriptionUtils.getMaxMasterTimeout(conf, snapshot.getType(),
2826       SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME);
2827     return SnapshotResponse.newBuilder().setExpectedTimeout(waitTime).build();
2828   }
2829 
2830   /**
2831    * List the currently available/stored snapshots. Any in-progress snapshots are ignored
2832    */
2833   @Override
2834   public GetCompletedSnapshotsResponse getCompletedSnapshots(RpcController controller,
2835       GetCompletedSnapshotsRequest request) throws ServiceException {
2836     try {
2837       GetCompletedSnapshotsResponse.Builder builder = GetCompletedSnapshotsResponse.newBuilder();
2838       List<SnapshotDescription> snapshots = snapshotManager.getCompletedSnapshots();
2839 
2840       // convert to protobuf
2841       for (SnapshotDescription snapshot : snapshots) {
2842         builder.addSnapshots(snapshot);
2843       }
2844       return builder.build();
2845     } catch (IOException e) {
2846       throw new ServiceException(e);
2847     }
2848   }
2849 
2850   /**
2851    * Execute Delete Snapshot operation.
2852    * @return DeleteSnapshotResponse (a protobuf wrapped void) if the snapshot existed and was
2853    *    deleted properly.
2854    * @throws ServiceException wrapping SnapshotDoesNotExistException if specified snapshot did not
2855    *    exist.
2856    */
2857   @Override
2858   public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2859       DeleteSnapshotRequest request) throws ServiceException {
2860     try {
2861       this.snapshotManager.checkSnapshotSupport();
2862     } catch (UnsupportedOperationException e) {
2863       throw new ServiceException(e);
2864     }
2865 
2866     try {
2867       LOG.info(getClientIdAuditPrefix() + " delete " + request.getSnapshot());
2868       snapshotManager.deleteSnapshot(request.getSnapshot());
2869       return DeleteSnapshotResponse.newBuilder().build();
2870     } catch (IOException e) {
2871       throw new ServiceException(e);
2872     }
2873   }
2874 
2875   /**
2876    * Checks if the specified snapshot is done.
2877    * @return true if the snapshot is in file system ready to use,
2878    *   false if the snapshot is in the process of completing
2879    * @throws ServiceException wrapping UnknownSnapshotException if invalid snapshot, or
2880    *  a wrapped HBaseSnapshotException with progress failure reason.
2881    */
2882   @Override
2883   public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2884       IsSnapshotDoneRequest request) throws ServiceException {
2885     LOG.debug("Checking to see if snapshot from request:" +
2886         ClientSnapshotDescriptionUtils.toString(request.getSnapshot()) + " is done");
2887     try {
2888       IsSnapshotDoneResponse.Builder builder = IsSnapshotDoneResponse.newBuilder();
2889       boolean done = snapshotManager.isSnapshotDone(request.getSnapshot());
2890       builder.setDone(done);
2891       return builder.build();
2892     } catch (IOException e) {
2893       throw new ServiceException(e);
2894     }
2895   }
2896 
2897   /**
2898    * Execute Restore/Clone snapshot operation.
2899    *
2900    * <p>If the specified table exists a "Restore" is executed, replacing the table
2901    * schema and directory data with the content of the snapshot.
2902    * The table must be disabled, or a UnsupportedOperationException will be thrown.
2903    *
2904    * <p>If the table doesn't exist a "Clone" is executed, a new table is created
2905    * using the schema at the time of the snapshot, and the content of the snapshot.
2906    *
2907    * <p>The restore/clone operation does not require copying HFiles. Since HFiles
2908    * are immutable the table can point to and use the same files as the original one.
2909    */
2910   @Override
2911   public RestoreSnapshotResponse restoreSnapshot(RpcController controller,
2912       RestoreSnapshotRequest request) throws ServiceException {
2913     try {
2914       this.snapshotManager.checkSnapshotSupport();
2915     } catch (UnsupportedOperationException e) {
2916       throw new ServiceException(e);
2917     }
2918 
2919     // ensure namespace exists
2920     try {
2921       TableName dstTable = TableName.valueOf(request.getSnapshot().getTable());
2922       getNamespaceDescriptor(dstTable.getNamespaceAsString());
2923     } catch (IOException ioe) {
2924       throw new ServiceException(ioe);
2925     }
2926 
2927     try {
2928       SnapshotDescription reqSnapshot = request.getSnapshot();
2929       snapshotManager.restoreSnapshot(reqSnapshot);
2930       return RestoreSnapshotResponse.newBuilder().build();
2931     } catch (IOException e) {
2932       throw new ServiceException(e);
2933     }
2934   }
2935 
2936   /**
2937    * Returns the status of the requested snapshot restore/clone operation.
2938    * This method is not exposed to the user, it is just used internally by HBaseAdmin
2939    * to verify if the restore is completed.
2940    *
2941    * No exceptions are thrown if the restore is not running, the result will be "done".
2942    *
2943    * @return done <tt>true</tt> if the restore/clone operation is completed.
2944    * @throws ServiceException if the operation failed.
2945    */
2946   @Override
2947   public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(RpcController controller,
2948       IsRestoreSnapshotDoneRequest request) throws ServiceException {
2949     try {
2950       SnapshotDescription snapshot = request.getSnapshot();
2951       IsRestoreSnapshotDoneResponse.Builder builder = IsRestoreSnapshotDoneResponse.newBuilder();
2952       boolean done = snapshotManager.isRestoreDone(snapshot);
2953       builder.setDone(done);
2954       return builder.build();
2955     } catch (IOException e) {
2956       throw new ServiceException(e);
2957     }
2958   }
2959 
2960   /**
2961    * Triggers an asynchronous attempt to run a distributed procedure.
2962    * {@inheritDoc}
2963    */
2964   @Override
2965   public ExecProcedureResponse execProcedure(RpcController controller,
2966       ExecProcedureRequest request) throws ServiceException {
2967     ProcedureDescription desc = request.getProcedure();
2968     MasterProcedureManager mpm = this.mpmHost.getProcedureManager(desc
2969         .getSignature());
2970     if (mpm == null) {
2971       throw new ServiceException("The procedure is not registered: "
2972           + desc.getSignature());
2973     }
2974 
2975     LOG.info(getClientIdAuditPrefix() + " procedure request for: "
2976         + desc.getSignature());
2977 
2978     try {
2979       mpm.execProcedure(desc);
2980     } catch (IOException e) {
2981       throw new ServiceException(e);
2982     }
2983 
2984     // send back the max amount of time the client should wait for the procedure
2985     // to complete
2986     long waitTime = SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME;
2987     return ExecProcedureResponse.newBuilder().setExpectedTimeout(waitTime)
2988         .build();
2989   }
2990 
2991   /**
2992    * Checks if the specified procedure is done.
2993    * @return true if the procedure is done,
2994    *   false if the procedure is in the process of completing
2995    * @throws ServiceException if invalid procedure, or
2996    *  a failed procedure with progress failure reason.
2997    */
2998   @Override
2999   public IsProcedureDoneResponse isProcedureDone(RpcController controller,
3000       IsProcedureDoneRequest request) throws ServiceException {
3001     ProcedureDescription desc = request.getProcedure();
3002     MasterProcedureManager mpm = this.mpmHost.getProcedureManager(desc
3003         .getSignature());
3004     if (mpm == null) {
3005       throw new ServiceException("The procedure is not registered: "
3006           + desc.getSignature());
3007     }
3008     LOG.debug("Checking to see if procedure from request:"
3009         + desc.getSignature() + " is done");
3010 
3011     try {
3012       IsProcedureDoneResponse.Builder builder = IsProcedureDoneResponse
3013           .newBuilder();
3014       boolean done = mpm.isProcedureDone(desc);
3015       builder.setDone(done);
3016       return builder.build();
3017     } catch (IOException e) {
3018       throw new ServiceException(e);
3019     }
3020   }
3021 
3022   @Override
3023   public ModifyNamespaceResponse modifyNamespace(RpcController controller,
3024       ModifyNamespaceRequest request) throws ServiceException {
3025     try {
3026       modifyNamespace(ProtobufUtil.toNamespaceDescriptor(request.getNamespaceDescriptor()));
3027       return ModifyNamespaceResponse.getDefaultInstance();
3028     } catch (IOException e) {
3029       throw new ServiceException(e);
3030     }
3031   }
3032 
3033   @Override
3034   public CreateNamespaceResponse createNamespace(RpcController controller,
3035      CreateNamespaceRequest request) throws ServiceException {
3036     try {
3037       createNamespace(ProtobufUtil.toNamespaceDescriptor(request.getNamespaceDescriptor()));
3038       return CreateNamespaceResponse.getDefaultInstance();
3039     } catch (IOException e) {
3040       throw new ServiceException(e);
3041     }
3042   }
3043 
3044   @Override
3045   public DeleteNamespaceResponse deleteNamespace(RpcController controller,
3046       DeleteNamespaceRequest request) throws ServiceException {
3047     try {
3048       deleteNamespace(request.getNamespaceName());
3049       return DeleteNamespaceResponse.getDefaultInstance();
3050     } catch (IOException e) {
3051       throw new ServiceException(e);
3052     }
3053   }
3054 
3055   @Override
3056   public GetNamespaceDescriptorResponse getNamespaceDescriptor(
3057       RpcController controller, GetNamespaceDescriptorRequest request)
3058       throws ServiceException {
3059     try {
3060       return GetNamespaceDescriptorResponse.newBuilder()
3061           .setNamespaceDescriptor(
3062               ProtobufUtil.toProtoNamespaceDescriptor(getNamespaceDescriptor(request.getNamespaceName())))
3063           .build();
3064     } catch (IOException e) {
3065       throw new ServiceException(e);
3066     }
3067   }
3068 
3069   @Override
3070   public ListNamespaceDescriptorsResponse listNamespaceDescriptors(
3071       RpcController controller, ListNamespaceDescriptorsRequest request)
3072       throws ServiceException {
3073     try {
3074       ListNamespaceDescriptorsResponse.Builder response =
3075           ListNamespaceDescriptorsResponse.newBuilder();
3076       for(NamespaceDescriptor ns: listNamespaceDescriptors()) {
3077         response.addNamespaceDescriptor(ProtobufUtil.toProtoNamespaceDescriptor(ns));
3078       }
3079       return response.build();
3080     } catch (IOException e) {
3081       throw new ServiceException(e);
3082     }
3083   }
3084 
3085   @Override
3086   public ListTableDescriptorsByNamespaceResponse listTableDescriptorsByNamespace(
3087       RpcController controller, ListTableDescriptorsByNamespaceRequest request)
3088       throws ServiceException {
3089     try {
3090       ListTableDescriptorsByNamespaceResponse.Builder b =
3091           ListTableDescriptorsByNamespaceResponse.newBuilder();
3092       for(HTableDescriptor htd: listTableDescriptorsByNamespace(request.getNamespaceName())) {
3093         b.addTableSchema(htd.convert());
3094       }
3095       return b.build();
3096     } catch (IOException e) {
3097       throw new ServiceException(e);
3098     }
3099   }
3100 
3101   @Override
3102   public ListTableNamesByNamespaceResponse listTableNamesByNamespace(
3103       RpcController controller, ListTableNamesByNamespaceRequest request)
3104       throws ServiceException {
3105     try {
3106       ListTableNamesByNamespaceResponse.Builder b =
3107           ListTableNamesByNamespaceResponse.newBuilder();
3108       for (TableName tableName: listTableNamesByNamespace(request.getNamespaceName())) {
3109         b.addTableName(ProtobufUtil.toProtoTableName(tableName));
3110       }
3111       return b.build();
3112     } catch (IOException e) {
3113       throw new ServiceException(e);
3114     }
3115   }
3116 
3117   private boolean isHealthCheckerConfigured() {
3118     String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
3119     return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
3120   }
3121 
3122   @Override
3123   public void createNamespace(NamespaceDescriptor descriptor) throws IOException {
3124     TableName.isLegalNamespaceName(Bytes.toBytes(descriptor.getName()));
3125     if (cpHost != null) {
3126       if (cpHost.preCreateNamespace(descriptor)) {
3127         return;
3128       }
3129     }
3130     LOG.info(getClientIdAuditPrefix() + " creating " + descriptor);
3131     tableNamespaceManager.create(descriptor);
3132     if (cpHost != null) {
3133       cpHost.postCreateNamespace(descriptor);
3134     }
3135   }
3136 
3137   @Override
3138   public void modifyNamespace(NamespaceDescriptor descriptor) throws IOException {
3139     TableName.isLegalNamespaceName(Bytes.toBytes(descriptor.getName()));
3140     if (cpHost != null) {
3141       if (cpHost.preModifyNamespace(descriptor)) {
3142         return;
3143       }
3144     }
3145     LOG.info(getClientIdAuditPrefix() + " modify " + descriptor);
3146     tableNamespaceManager.update(descriptor);
3147     if (cpHost != null) {
3148       cpHost.postModifyNamespace(descriptor);
3149     }
3150   }
3151 
3152   @Override
3153   public void deleteNamespace(String name) throws IOException {
3154     if (cpHost != null) {
3155       if (cpHost.preDeleteNamespace(name)) {
3156         return;
3157       }
3158     }
3159     LOG.info(getClientIdAuditPrefix() + " delete " + name);
3160     tableNamespaceManager.remove(name);
3161     if (cpHost != null) {
3162       cpHost.postDeleteNamespace(name);
3163     }
3164   }
3165 
3166   @Override
3167   public NamespaceDescriptor getNamespaceDescriptor(String name) throws IOException {
3168     boolean ready = tableNamespaceManager != null &&
3169         tableNamespaceManager.isTableAvailableAndInitialized();
3170     if (!ready) {
3171       throw new IOException("Table Namespace Manager not ready yet, try again later");
3172     }
3173     NamespaceDescriptor nsd = tableNamespaceManager.get(name);
3174     if (nsd == null) {
3175       throw new NamespaceNotFoundException(name);
3176     }
3177     return nsd;
3178   }
3179 
3180   @Override
3181   public List<NamespaceDescriptor> listNamespaceDescriptors() throws IOException {
3182     return Lists.newArrayList(tableNamespaceManager.list());
3183   }
3184 
3185   @Override
3186   public List<HTableDescriptor> listTableDescriptorsByNamespace(String name) throws IOException {
3187     getNamespaceDescriptor(name); // check that namespace exists
3188     return Lists.newArrayList(tableDescriptors.getByNamespace(name).values());
3189   }
3190 
3191   @Override
3192   public List<TableName> listTableNamesByNamespace(String name) throws IOException {
3193     List<TableName> tableNames = Lists.newArrayList();
3194     getNamespaceDescriptor(name); // check that namespace exists
3195     for (HTableDescriptor descriptor: tableDescriptors.getByNamespace(name).values()) {
3196       tableNames.add(descriptor.getTableName());
3197     }
3198     return tableNames;
3199   }
3200 
3201   @Override
3202   public ReportRegionTransitionResponse reportRegionTransition(RpcController controller,
3203       ReportRegionTransitionRequest req) throws ServiceException {
3204     try {
3205       RegionTransition rt = req.getTransition(0);
3206       TableName tableName = ProtobufUtil.toTableName(
3207         rt.getRegionInfo(0).getTableName());
3208       if (!TableName.META_TABLE_NAME.equals(tableName)
3209           && !assignmentManager.isFailoverCleanupDone()) {
3210         // Meta region is assigned before master finishes the
3211         // failover cleanup. So no need this check for it
3212         throw new PleaseHoldException("Master is rebuilding user regions");
3213       }
3214       ServerName sn = ProtobufUtil.toServerName(req.getServer());
3215       String error = assignmentManager.onRegionTransition(sn, rt);
3216       ReportRegionTransitionResponse.Builder rrtr =
3217         ReportRegionTransitionResponse.newBuilder();
3218       if (error != null) {
3219         rrtr.setErrorMessage(error);
3220       }
3221       return rrtr.build();
3222     } catch (IOException ioe) {
3223       throw new ServiceException(ioe);
3224     }
3225   }
3226 
3227 }