
/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

import javax.management.ObjectName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.ClusterId;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.HealthCheckChore;
import org.apache.hadoop.hbase.MasterAdminProtocol;
import org.apache.hadoop.hbase.MasterMonitorProtocol;
import org.apache.hadoop.hbase.RegionServerStatusProtocol;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaReader;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.exceptions.HBaseIOException;
import org.apache.hadoop.hbase.exceptions.MasterNotRunningException;
import org.apache.hadoop.hbase.exceptions.NotAllMetaRegionsOnlineException;
import org.apache.hadoop.hbase.exceptions.PleaseHoldException;
import org.apache.hadoop.hbase.exceptions.TableNotDisabledException;
import org.apache.hadoop.hbase.exceptions.TableNotFoundException;
import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
import org.apache.hadoop.hbase.exceptions.UnknownRegionException;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.ExecutorType;
import org.apache.hadoop.hbase.ipc.HBaseServer;
import org.apache.hadoop.hbase.ipc.HBaseServerRPC;
import org.apache.hadoop.hbase.ipc.RpcServer;
import org.apache.hadoop.hbase.ipc.ServerRpcController;
import org.apache.hadoop.hbase.master.balancer.BalancerChore;
import org.apache.hadoop.hbase.master.balancer.ClusterStatusChore;
import org.apache.hadoop.hbase.master.balancer.LoadBalancerFactory;
import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
import org.apache.hadoop.hbase.master.cleaner.LogCleaner;
import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
import org.apache.hadoop.hbase.master.handler.DeleteTableHandler;
import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
import org.apache.hadoop.hbase.master.handler.DispatchMergingRegionHandler;
import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
import org.apache.hadoop.hbase.master.handler.ModifyTableHandler;
import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier.RegionSpecifierType;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.AddColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.AddColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.AssignRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.AssignRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.BalanceRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.BalanceResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.CatalogScanRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.CatalogScanResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.CreateTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.CreateTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DeleteColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DeleteColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DeleteSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DeleteSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DeleteTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DeleteTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DisableTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DisableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DispatchMergingRegionsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.DispatchMergingRegionsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.EnableCatalogJanitorRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.EnableCatalogJanitorResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.EnableTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.EnableTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.IsCatalogJanitorEnabledRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.IsCatalogJanitorEnabledResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.IsRestoreSnapshotDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.IsRestoreSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.IsSnapshotDoneRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.IsSnapshotDoneResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ListSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ListSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ModifyColumnRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ModifyColumnResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ModifyTableRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ModifyTableResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.MoveRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.MoveRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.OfflineRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.OfflineRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.RestoreSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.RestoreSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.SetBalancerRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.SetBalancerRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ShutdownRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.ShutdownResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.StopMasterRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.StopMasterResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.TakeSnapshotRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.TakeSnapshotResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.UnassignRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterAdminProtos.UnassignRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos.GetClusterStatusRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos.GetClusterStatusResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos.GetSchemaAlterStatusRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos.GetSchemaAlterStatusResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos.GetTableDescriptorsRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos.GetTableDescriptorsResponse;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorRequest;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorResponse;
import org.apache.hadoop.hbase.replication.regionserver.Replication;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.trace.SpanReceiverHost;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CompressionTest;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hbase.util.InfoServer;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Sleeper;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.VersionInfo;
import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker;
import org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker;
import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.net.DNS;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;

import com.google.common.collect.Maps;
import com.google.protobuf.Descriptors;
import com.google.protobuf.Message;
import com.google.protobuf.RpcCallback;
import com.google.protobuf.RpcController;
import com.google.protobuf.Service;
import com.google.protobuf.ServiceException;

/**
 * HMaster is the "master server" for HBase. An HBase cluster has one active
 * master.  If many masters are started, all compete.  Whichever wins goes on to
 * run the cluster.  All others park themselves in their constructor until
 * master or cluster shutdown or until the active master loses its lease in
 * zookeeper.  Thereafter, all running masters jostle to take over the master role.
 *
 * <p>The Master can be asked to shutdown the cluster. See {@link #shutdown()}.  In
 * this case it will tell all regionservers to go down and then wait on them
 * all reporting in that they are down.  This master will then shut itself down.
 *
 * <p>You can also shutdown just this master.  Call {@link #stopMaster()}.
 *
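 * <p>A hedged usage sketch (the usual entry point is the command-line tool, so
 * treat this as illustrative only; the constructor merely wires up RPC and
 * ZooKeeper, and election plus initialization happen once the thread runs):
 * <pre>
 * Configuration conf = HBaseConfiguration.create();
 * HMaster master = new HMaster(conf);  // connects to ZooKeeper
 * master.start();                      // races for the active-master znode
 * master.stopMaster();                 // later: stop just this master
 * </pre>
 *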
 * @see MasterMonitorProtocol
 * @see MasterAdminProtocol
 * @see RegionServerStatusProtocol
 * @see Watcher
 */
@InterfaceAudience.Private
@SuppressWarnings("deprecation")
public class HMaster extends HasThread
implements MasterMonitorProtocol, MasterAdminProtocol, RegionServerStatusProtocol, MasterServices,
Server {
  private static final Log LOG = LogFactory.getLog(HMaster.class.getName());

  // MASTER is the name of the webapp and the attribute name used when
  // stuffing this instance into the web context.
  public static final String MASTER = "master";

  // The configuration for the Master
  private final Configuration conf;
  // server for the web ui
  private InfoServer infoServer;

  // Our zk client.
  private ZooKeeperWatcher zooKeeper;
  // Manager and zk listener for master election
  private ActiveMasterManager activeMasterManager;
  // Region server tracker
  private RegionServerTracker regionServerTracker;
  // Draining region server tracker
  private DrainingServerTracker drainingServerTracker;
  // Tracker for load balancer state
  private LoadBalancerTracker loadBalancerTracker;

  // RPC server for the HMaster
  private final RpcServer rpcServer;
  // Set after we've called HBaseServer#openServer and are ready to receive RPCs.
  // Set back to false after we stop rpcServer.  Used by tests.
  private volatile boolean rpcServerOpen = false;

  /**
   * This server's address.
   */
  private final InetSocketAddress isa;

  // Metrics for the HMaster
  private final MetricsMaster metricsMaster;
  // file system manager for the master FS operations
  private MasterFileSystem fileSystemManager;

  // server manager to deal with region server info
  private ServerManager serverManager;

  // manager of assignment nodes in zookeeper
  AssignmentManager assignmentManager;
  // manager of catalog regions
  private CatalogTracker catalogTracker;
  // Cluster status zk tracker and local setter
  private ClusterStatusTracker clusterStatusTracker;

  // buffer for "fatal error" notices from region servers
  // in the cluster. This is only used for assisting
  // operations/debugging.
  private MemoryBoundedLogMessageBuffer rsFatals;

  // This flag is for stopping this Master instance.  It's set when we are
  // stopping or aborting
  private volatile boolean stopped = false;
  // Set on abort -- usually failure of our zk session.
  private volatile boolean abort = false;
  // flag set after we become the active master (used for testing)
  private volatile boolean isActiveMaster = false;

  // flag set after we complete initialization once active,
  // it is not private since it's used in unit tests
  volatile boolean initialized = false;

  // flag set after we complete assignMeta.
  private volatile boolean serverShutdownHandlerEnabled = false;

  // Instance of the hbase executor service.
  ExecutorService executorService;

  private LoadBalancer balancer;
  private Thread balancerChore;
  private Thread clusterStatusChore;
  private ClusterStatusPublisher clusterStatusPublisherChore = null;

  private CatalogJanitor catalogJanitorChore;
  private LogCleaner logCleaner;
  private HFileCleaner hfileCleaner;

  private MasterCoprocessorHost cpHost;
  private final ServerName serverName;

  private TableDescriptors tableDescriptors;

  // Table level lock manager for schema changes
  private TableLockManager tableLockManager;

  // Time stamps for when a hmaster was started and when it became active
  private long masterStartTime;
  private long masterActiveTime;

  /** time interval for emitting metrics values */
  private final int msgInterval;
  /**
   * MX Bean for MasterInfo
   */
  private ObjectName mxBean = null;

  // should we check the compression codec type at master side, default true, HBASE-6370
  private final boolean masterCheckCompression;

  private SpanReceiverHost spanReceiverHost;

  private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();

  // monitor for snapshot of hbase tables
  private SnapshotManager snapshotManager;

  /** The health check chore. */
  private HealthCheckChore healthCheckChore;

  /**
   * Initializes the HMaster. The steps are as follows:
   * <p>
   * <ol>
   * <li>Initialize HMaster RPC and address
   * <li>Connect to ZooKeeper.
   * </ol>
   * <p>
   * Remaining steps of initialization occur in {@link #run()} so that they
   * run in their own thread rather than within the context of the constructor.
   * @throws InterruptedException
   */
  public HMaster(final Configuration conf)
  throws IOException, KeeperException, InterruptedException {
    this.conf = new Configuration(conf);
    // Disable the block cache on the master
    this.conf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
    // Set how many times to retry talking to another server over HConnection.
    HConnectionManager.setServerSideHConnectionRetries(this.conf, LOG);
    // Server to handle client requests.
    String hostname = Strings.domainNamePointerToHostName(DNS.getDefaultHost(
      conf.get("hbase.master.dns.interface", "default"),
      conf.get("hbase.master.dns.nameserver", "default")));
    int port = conf.getInt(HConstants.MASTER_PORT, HConstants.DEFAULT_MASTER_PORT);
    // Test that the hostname is reachable
    InetSocketAddress initialIsa = new InetSocketAddress(hostname, port);
    if (initialIsa.getAddress() == null) {
      throw new IllegalArgumentException("Failed resolve of hostname " + initialIsa);
    }
    // Verify that the bind address is reachable if set
    String bindAddress = conf.get("hbase.master.ipc.address");
    if (bindAddress != null) {
      initialIsa = new InetSocketAddress(bindAddress, port);
      if (initialIsa.getAddress() == null) {
        throw new IllegalArgumentException("Failed resolve of bind address " + initialIsa);
      }
    }
    int numHandlers = conf.getInt("hbase.master.handler.count",
      conf.getInt("hbase.regionserver.handler.count", 25));
    this.rpcServer = HBaseServerRPC.getServer(MasterMonitorProtocol.class, this,
        new Class<?>[]{MasterMonitorProtocol.class,
            MasterAdminProtocol.class, RegionServerStatusProtocol.class},
        initialIsa.getHostName(), // This is bindAddress if set else it's hostname
        initialIsa.getPort(),
        numHandlers,
        0, // we don't use high priority handlers in master
        conf.getBoolean("hbase.rpc.verbose", false), conf,
        0); // this is a DNC w/o high priority handlers
    // Set our address.
    this.isa = this.rpcServer.getListenerAddress();
    this.serverName = new ServerName(hostname,
      this.isa.getPort(), System.currentTimeMillis());
    this.rsFatals = new MemoryBoundedLogMessageBuffer(
        conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));

    // login the zookeeper client principal (if using security)
    ZKUtil.loginClient(this.conf, "hbase.zookeeper.client.keytab.file",
      "hbase.zookeeper.client.kerberos.principal", this.isa.getHostName());

    // initialize server principal (if using secure Hadoop)
    User.login(conf, "hbase.master.keytab.file",
      "hbase.master.kerberos.principal", this.isa.getHostName());

    LOG.info("hbase.rootdir=" + FSUtils.getRootDir(this.conf) +
        ", hbase.cluster.distributed=" + this.conf.getBoolean("hbase.cluster.distributed", false));

    // set the thread name now we have an address
    setName(MASTER + "-" + this.serverName.toString());

    Replication.decorateMasterConfiguration(this.conf);

    // Hack! Maps DFSClient => Master for logs.  HDFS made this
    // config param for task trackers, but we can piggyback off of it.
    if (this.conf.get("mapred.task.id") == null) {
      this.conf.set("mapred.task.id", "hb_m_" + this.serverName.toString());
    }

    this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true);
    this.rpcServer.startThreads();

    // metrics interval: using the same property as region server.
    this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);

    // should we check the compression codec type at master side, default true, HBASE-6370
    this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);

    this.metricsMaster = new MetricsMaster(new MetricsMasterWrapperImpl(this));

    // Health checker thread.
    int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
      HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
    if (isHealthCheckerConfigured()) {
      healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
    }

    // Do we publish the status?
    Class<? extends ClusterStatusPublisher.Publisher> publisherClass =
        conf.getClass(ClusterStatusPublisher.STATUS_PUBLISHER_CLASS,
            ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS,
            ClusterStatusPublisher.Publisher.class);

    if (publisherClass != null) {
      clusterStatusPublisherChore = new ClusterStatusPublisher(this, conf, publisherClass);
      Threads.setDaemonThreadRunning(clusterStatusPublisherChore.getThread());
    }
  }
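
  // A hedged sketch of overriding, in code, the same keys the constructor
  // reads above (values are illustrative only, not recommendations):
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.set("hbase.master.ipc.address", "0.0.0.0");  // explicit bind address
  //   conf.setInt("hbase.master.handler.count", 25);    // RPC handler pool size
  //   HMaster master = new HMaster(conf);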

  /**
   * Stall startup if we are designated a backup master; i.e. we want someone
   * else to become the master before proceeding.
   * @param c configuration
   * @param amm the active master manager to poll
   * @throws InterruptedException
   */
  private static void stallIfBackupMaster(final Configuration c,
      final ActiveMasterManager amm)
  throws InterruptedException {
    // If we're a backup master, stall until a primary writes its address
    if (!c.getBoolean(HConstants.MASTER_TYPE_BACKUP,
      HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
      return;
    }
    LOG.debug("HMaster started in backup mode.  " +
      "Stalling until master znode is written.");
    // This will only be a minute or so while the cluster starts up,
    // so don't worry about setting watches on the parent znode
    while (!amm.isActiveMaster()) {
      LOG.debug("Waiting for master address ZNode to be written " +
        "(Also watching cluster state node)");
      Thread.sleep(
        c.getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT));
    }
  }
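
  // Hedged sketch: a backup master is just a master started with the flag the
  // method above checks (HConstants.MASTER_TYPE_BACKUP); it will stall during
  // becomeActiveMaster() until the active master's znode goes away:
  //   conf.setBoolean(HConstants.MASTER_TYPE_BACKUP, true);
  //   new HMaster(conf).start();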

  MetricsMaster getMetrics() {
    return metricsMaster;
  }

  /**
   * Main processing loop for the HMaster.
   * <ol>
   * <li>Block until becoming active master
   * <li>Finish initialization via finishInitialization(MonitoredTask)
   * <li>Enter loop until we are stopped
   * <li>Stop services and perform cleanup once stopped
   * </ol>
   */
  @Override
  public void run() {
    MonitoredTask startupStatus =
      TaskMonitor.get().createStatus("Master startup");
    startupStatus.setDescription("Master startup");
    masterStartTime = System.currentTimeMillis();
    try {
      /*
       * Block on becoming the active master.
       *
       * We race with other masters to write our address into ZooKeeper.  If we
       * succeed, we are the primary/active master and finish initialization.
       *
       * If we do not succeed, there is another active master and we should
       * now wait until it dies to try and become the next active master.  If we
       * do not succeed on our first attempt, this is no longer a cluster startup.
       */
      becomeActiveMaster(startupStatus);

      // We are either the active master or we were asked to shutdown
      if (!this.stopped) {
        finishInitialization(startupStatus, false);
        loop();
      }
    } catch (Throwable t) {
      // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
      if (t instanceof NoClassDefFoundError &&
          t.getMessage().contains("org/apache/hadoop/hdfs/protocol/FSConstants$SafeModeAction")) {
          // improved error message for this special case
          abort("HBase is having a problem with its Hadoop jars.  You may need to "
              + "recompile HBase against Hadoop version "
              +  org.apache.hadoop.util.VersionInfo.getVersion()
              + " or change your hadoop jars to start properly", t);
      } else {
        abort("Unhandled exception. Starting shutdown.", t);
      }
    } finally {
      startupStatus.cleanup();

      stopChores();
      // Wait for all the remaining region servers to report in IFF we were
      // running a cluster shutdown AND we were NOT aborting.
      if (!this.abort && this.serverManager != null &&
          this.serverManager.isClusterShutdown()) {
        this.serverManager.letRegionServersShutdown();
      }
      stopServiceThreads();
      // Stop services started for both backup and active masters
      if (this.activeMasterManager != null) this.activeMasterManager.stop();
      if (this.catalogTracker != null) this.catalogTracker.stop();
      if (this.serverManager != null) this.serverManager.stop();
      if (this.assignmentManager != null) this.assignmentManager.stop();
      if (this.fileSystemManager != null) this.fileSystemManager.stop();
      if (this.snapshotManager != null) this.snapshotManager.stop("server shutting down.");
      this.zooKeeper.close();
    }
    LOG.info("HMaster main thread exiting");
  }

  /**
   * Try becoming active master.
   * @param startupStatus
   * @return True if we could successfully become the active master.
   * @throws InterruptedException
   */
  private boolean becomeActiveMaster(MonitoredTask startupStatus)
  throws InterruptedException {
    // TODO: This is wrong!!!! Should have new servername if we restart ourselves,
    // if we come back to life.
    this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName,
        this);
    this.zooKeeper.registerListener(activeMasterManager);
    stallIfBackupMaster(this.conf, this.activeMasterManager);

    // The ClusterStatusTracker is setup before the other
    // ZKBasedSystemTrackers because it's needed by the activeMasterManager
    // to check if the cluster should be shutdown.
    this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
    this.clusterStatusTracker.start();
    return this.activeMasterManager.blockUntilBecomingActiveMaster(startupStatus,
        this.clusterStatusTracker);
  }

  /**
   * Initialize all ZK based system trackers.
   * @throws IOException
   * @throws InterruptedException
   */
  private void initializeZKBasedSystemTrackers() throws IOException,
      InterruptedException, KeeperException {
    this.catalogTracker = createCatalogTracker(this.zooKeeper, this.conf, this);
    this.catalogTracker.start();

    this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
    this.loadBalancerTracker = new LoadBalancerTracker(zooKeeper, this);
    this.loadBalancerTracker.start();
    this.assignmentManager = new AssignmentManager(this, serverManager,
      this.catalogTracker, this.balancer, this.executorService, this.metricsMaster,
      this.tableLockManager);
    zooKeeper.registerListenerFirst(assignmentManager);

    this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
        this.serverManager);
    this.regionServerTracker.start();

    this.drainingServerTracker = new DrainingServerTracker(zooKeeper, this,
      this.serverManager);
    this.drainingServerTracker.start();

    // Set the cluster as up.  If new RSs, they'll be waiting on this before
    // going ahead with their startup.
    boolean wasUp = this.clusterStatusTracker.isClusterUp();
    if (!wasUp) this.clusterStatusTracker.setClusterUp();

    LOG.info("Server active/primary master; " + this.serverName +
        ", sessionid=0x" +
        Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()) +
        ", cluster-up flag was=" + wasUp);

    // create the snapshot manager
    this.snapshotManager = new SnapshotManager(this);
  }

  /**
   * Create CatalogTracker.
   * In its own method so tests can intercept and mock it.
   * @param zk If zk is null, we'll create an instance (and shut it down
   * when {@link #stop(String)} is called) else we'll use what is passed.
   * @param conf
   * @param abortable If fatal exception we'll call abort on this.  May be null.
   * If it is, we'll use the Connection associated with the passed
   * {@link Configuration} as our {@link Abortable}.
   * ({@link Object#wait(long)} when passed a <code>0</code> waits forever).
   * @throws IOException
   */
  CatalogTracker createCatalogTracker(final ZooKeeperWatcher zk,
      final Configuration conf, Abortable abortable)
  throws IOException {
    return new CatalogTracker(zk, conf, abortable);
  }
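
  // Hedged test sketch: this hook exists so tests can swap in a mock tracker,
  // e.g. by subclassing (mockedTracker below is a hypothetical test double):
  //   HMaster master = new HMaster(conf) {
  //     @Override
  //     CatalogTracker createCatalogTracker(ZooKeeperWatcher zk,
  //         Configuration conf, Abortable abortable) {
  //       return mockedTracker;
  //     }
  //   };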

  // Check if we should stop every 100ms
  private Sleeper stopSleeper = new Sleeper(100, this);

  private void loop() {
    long lastMsgTs = 0L;
    long now = 0L;
    while (!this.stopped) {
      now = System.currentTimeMillis();
      if ((now - lastMsgTs) >= this.msgInterval) {
        doMetrics();
        lastMsgTs = System.currentTimeMillis();
      }
      stopSleeper.sleep();
    }
  }

  /**
   * Emit the HMaster metrics, such as region in transition metrics.
   * Wrapped in a try block just to be sure a metrics failure doesn't abort the HMaster.
   */
  private void doMetrics() {
    try {
      this.assignmentManager.updateRegionsInTransitionMetrics();
    } catch (Throwable e) {
      LOG.error("Couldn't update metrics: " + e.getMessage());
    }
  }

  /**
   * Finish initialization of HMaster after becoming the primary master.
   *
   * <ol>
   * <li>Initialize master components - file system manager, server manager,
   *     assignment manager, region server tracker, catalog tracker, etc</li>
   * <li>Start necessary service threads - rpc server, info server,
   *     executor services, etc</li>
   * <li>Set cluster as UP in ZooKeeper</li>
   * <li>Wait for RegionServers to check-in</li>
   * <li>Split logs and perform data recovery, if necessary</li>
   * <li>Ensure assignment of meta regions</li>
   * <li>Handle either fresh cluster start or master failover</li>
   * </ol>
   *
   * @param status monitored task to update with progress
   * @param masterRecovery true if we are re-running initialization after a
   *     master recovery (some services are already running)
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  private void finishInitialization(MonitoredTask status, boolean masterRecovery)
  throws IOException, InterruptedException, KeeperException {

    isActiveMaster = true;

    /*
     * We are active master now... go initialize components we need to run.
     * Note, there may be dross in zk from previous runs; it'll get addressed
     * below after we determine if cluster startup or failover.
     */

    status.setStatus("Initializing Master file system");
    this.masterActiveTime = System.currentTimeMillis();
    // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
    this.fileSystemManager = new MasterFileSystem(this, this, metricsMaster, masterRecovery);

    this.tableDescriptors =
      new FSTableDescriptors(this.fileSystemManager.getFileSystem(),
      this.fileSystemManager.getRootDir());

    // publish cluster ID
    status.setStatus("Publishing Cluster ID in ZooKeeper");
    ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());

    if (!masterRecovery) {
      this.executorService = new ExecutorService(getServerName().toString());
      this.serverManager = createServerManager(this, this);
    }

    // Initialize table lock manager, and ensure that all write locks held
    // previously are invalidated
    this.tableLockManager = TableLockManager.createTableLockManager(conf, zooKeeper, serverName);
    if (!masterRecovery) {
      this.tableLockManager.reapAllTableWriteLocks();
    }

    status.setStatus("Initializing ZK system trackers");
    initializeZKBasedSystemTrackers();

    if (!masterRecovery) {
      // initialize master side coprocessors before we start handling requests
      status.setStatus("Initializing master coprocessors");
      this.cpHost = new MasterCoprocessorHost(this, this.conf);

      spanReceiverHost = new SpanReceiverHost(getConfiguration());
      spanReceiverHost.loadSpanReceivers();

      // start up all service threads.
      status.setStatus("Initializing master service threads");
      startServiceThreads();
    }

    // Wait for region servers to report in.
    this.serverManager.waitForRegionServers(status);
    // Check zk for region servers that are up but didn't register
    for (ServerName sn: this.regionServerTracker.getOnlineServers()) {
      if (!this.serverManager.isServerOnline(sn)) {
        // Not registered; add it.
        LOG.info("Registering server found up in zk but who has not yet " +
          "reported in: " + sn);
        this.serverManager.recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD);
      }
    }

    if (!masterRecovery) {
      this.assignmentManager.startTimeOutMonitor();
    }

    // TODO: Should do this in background rather than block master startup
    status.setStatus("Splitting logs after master startup");
    splitLogAfterStartup(this.fileSystemManager);

    // Make sure meta assigned before proceeding.
    if (!assignMeta(status)) return;
    enableServerShutdownHandler();

    // Update meta with new PB serialization if required. i.e migrate all HRI
    // to PB serialization in meta and update the status in ROOT. This must happen
    // before we assign all user regions or else the assignment will fail.
    // TODO: Remove this after 0.96, when we do 0.98.
    org.apache.hadoop.hbase.catalog.MetaMigrationConvertingToPB
      .updateRootAndMetaIfNecessary(this);

    this.balancer.setMasterServices(this);
    // Fix up assignment manager status
    status.setStatus("Starting assignment manager");
    this.assignmentManager.joinCluster();

    this.balancer.setClusterStatus(getClusterStatus());

    if (!masterRecovery) {
      // Start balancer and meta catalog janitor after meta and regions have
      // been assigned.
      status.setStatus("Starting balancer and catalog janitor");
      this.clusterStatusChore = getAndStartClusterStatusChore(this);
      this.balancerChore = getAndStartBalancerChore(this);
      this.catalogJanitorChore = new CatalogJanitor(this, this);
      startCatalogJanitorChore();
    }

    status.markComplete("Initialization successful");
    LOG.info("Master has completed initialization");
    initialized = true;
    // clear the dead servers with same host name and port of online server because we are not
    // removing dead server with same hostname and port of rs which is trying to check in before
    // master initialization. See HBASE-5916.
    this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();

    if (!masterRecovery) {
      if (this.cpHost != null) {
        // don't let cp initialization errors kill the master
        try {
          this.cpHost.postStartMaster();
        } catch (IOException ioe) {
          LOG.error("Coprocessor postStartMaster() hook failed", ioe);
        }
      }
    }
  }

  /**
   * Useful for testing, e.g. in master restart scenarios.
   */
  protected void startCatalogJanitorChore() {
    Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
  }

  /**
   * Override to change master's splitLogAfterStartup. Used in testing.
   * @param mfs
   */
  protected void splitLogAfterStartup(final MasterFileSystem mfs) {
    mfs.splitLogAfterStartup();
  }

  /**
   * Create a {@link ServerManager} instance.
   * @param master
   * @param services
   * @return An instance of {@link ServerManager}
   * @throws org.apache.hadoop.hbase.exceptions.ZooKeeperConnectionException
   * @throws IOException
   */
  ServerManager createServerManager(final Server master,
      final MasterServices services)
  throws IOException {
    // We put this out here in a method so can do a Mockito.spy and stub it out
    // w/ a mocked up ServerManager.
    return new ServerManager(master, services);
  }
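
  // Hedged test sketch per the comment above, using a Mockito spy from a test
  // in this package (mockServerManager is a hypothetical test double):
  //   HMaster spied = Mockito.spy(master);
  //   Mockito.doReturn(mockServerManager)
  //       .when(spied).createServerManager(spied, spied);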

  /**
   * If ServerShutdownHandler is disabled, we enable it and expire those
   * servers that are dead but not yet expired.
   */
  private void enableServerShutdownHandler() {
    if (!serverShutdownHandlerEnabled) {
      serverShutdownHandlerEnabled = true;
      this.serverManager.processQueuedDeadServers();
    }
  }

  /**
   * Check <code>.META.</code> is assigned.  If not,
   * assign it.
   * @throws InterruptedException
   * @throws IOException
   * @throws KeeperException
   * @return True if meta is healthy, assigned
   */
  boolean assignMeta(MonitoredTask status)
  throws InterruptedException, IOException, KeeperException {
    int assigned = 0;
    long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000);

    // Work on .META. region.  Is it in zk in transition?
    status.setStatus("Assigning META region");
    assignmentManager.getRegionStates().createRegionState(
        HRegionInfo.FIRST_META_REGIONINFO);
    boolean rit = this.assignmentManager.
      processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO);
    ServerName currentMetaServer = null;
    boolean metaRegionLocation = catalogTracker.verifyMetaRegionLocation(timeout);
    if (!rit && !metaRegionLocation) {
      currentMetaServer = this.catalogTracker.getMetaLocation();
      splitLogAndExpireIfOnline(currentMetaServer);
      this.assignmentManager.assignMeta();
      enableSSHandWaitForMeta();
      // Make sure a .META. location is set.
      if (!isMetaLocation()) return false;
      // This guarantees that the transition assigning .META. has completed
      this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
      assigned++;
    } else if (rit && !metaRegionLocation) {
      // Make sure a .META. location is set.
      if (!isMetaLocation()) return false;
      // This guarantees that the transition assigning .META. has completed
      this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
      assigned++;
    } else if (metaRegionLocation) {
      // Region already assigned.  We didn't assign it.  Add to in-memory state.
      this.assignmentManager.regionOnline(HRegionInfo.FIRST_META_REGIONINFO,
        this.catalogTracker.getMetaLocation());
    }
    enableCatalogTables(Bytes.toString(HConstants.META_TABLE_NAME));
    LOG.info(".META. assigned=" + assigned + ", rit=" + rit +
      ", location=" + catalogTracker.getMetaLocation());
    status.setStatus("META assigned.");
    return true;
  }

  private void enableSSHandWaitForMeta() throws IOException, InterruptedException {
    enableServerShutdownHandler();
    this.catalogTracker.waitForMeta();
    // Above check waits for general meta availability but this does not
    // guarantee that the transition has completed
    this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
  }

  /**
   * @return True if there is a meta available
   * @throws InterruptedException
   */
  private boolean isMetaLocation() throws InterruptedException {
    // Cycle up here in master rather than down in catalogtracker so we can
    // check the master stopped flag every so often.
    while (!this.stopped) {
      try {
        if (this.catalogTracker.waitForMeta(100) != null) break;
      } catch (NotAllMetaRegionsOnlineException e) {
        // Ignore.  I know .META. is not online yet.
      }
    }
    // We got here because we came out of the above loop.
    return !this.stopped;
  }

  private void enableCatalogTables(String catalogTableName) {
    if (!this.assignmentManager.getZKTable().isEnabledTable(catalogTableName)) {
      this.assignmentManager.setEnabledTable(catalogTableName);
    }
  }

  /**
   * Split a server's log and expire it if we find it is one of the online
   * servers.
   * @param sn ServerName to check.
   * @throws IOException
   */
  private void splitLogAndExpireIfOnline(final ServerName sn)
      throws IOException {
    if (sn == null || !serverManager.isServerOnline(sn)) {
      return;
    }
    LOG.info("Forcing splitLog and expire of " + sn);
    fileSystemManager.splitMetaLog(sn);
    fileSystemManager.splitLog(sn);
    serverManager.expireServer(sn);
  }

  @Override
  public TableDescriptors getTableDescriptors() {
    return this.tableDescriptors;
  }

  /** @return InfoServer object. May be null. */
  public InfoServer getInfoServer() {
    return this.infoServer;
  }

  @Override
  public Configuration getConfiguration() {
    return this.conf;
  }

  @Override
  public ServerManager getServerManager() {
    return this.serverManager;
  }

  @Override
  public ExecutorService getExecutorService() {
    return this.executorService;
  }

  @Override
  public MasterFileSystem getMasterFileSystem() {
    return this.fileSystemManager;
  }

  /**
   * Get the ZK wrapper object - needed by master_jsp.java
   * @return the zookeeper wrapper
   */
  public ZooKeeperWatcher getZooKeeperWatcher() {
    return this.zooKeeper;
  }

  /*
   * Start up all services. If any of these threads gets an unhandled exception
   * then they just die with a logged message.  This should be fine because
   * in general, we do not expect the master to get such unhandled exceptions
   * as OOMEs; it should be lightly loaded. See what HRegionServer does if you
   * need to install an unexpected exception handler.
   */
  void startServiceThreads() throws IOException {

    // Start the executor service pools
    this.executorService.startExecutorService(ExecutorType.MASTER_OPEN_REGION,
      conf.getInt("hbase.master.executor.openregion.threads", 5));
    this.executorService.startExecutorService(ExecutorType.MASTER_CLOSE_REGION,
      conf.getInt("hbase.master.executor.closeregion.threads", 5));
    this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
      conf.getInt("hbase.master.executor.serverops.threads", 3));
    this.executorService.startExecutorService(ExecutorType.MASTER_META_SERVER_OPERATIONS,
      conf.getInt("hbase.master.executor.serverops.threads", 5));

    // We depend on there being only one instance of this executor running
    // at a time.  To do concurrency, would need fencing of enable/disable of
    // tables.
    this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS, 1);

    // Start log cleaner thread
    String n = Thread.currentThread().getName();
    int cleanerInterval = conf.getInt("hbase.master.cleaner.interval", 60 * 1000);
    this.logCleaner =
      new LogCleaner(cleanerInterval,
        this, conf, getMasterFileSystem().getFileSystem(),
        getMasterFileSystem().getOldLogDir());
    Threads.setDaemonThreadRunning(logCleaner.getThread(), n + ".oldLogCleaner");

    // start the hfile archive cleaner thread
    Path archiveDir = HFileArchiveUtil.getArchivePath(conf);
    this.hfileCleaner = new HFileCleaner(cleanerInterval, this, conf, getMasterFileSystem()
        .getFileSystem(), archiveDir);
    Threads.setDaemonThreadRunning(hfileCleaner.getThread(), n + ".archivedHFileCleaner");

    // Put up info server.
    int port = this.conf.getInt(HConstants.MASTER_INFO_PORT, 60010);
    if (port >= 0) {
      String a = this.conf.get("hbase.master.info.bindAddress", "0.0.0.0");
      this.infoServer = new InfoServer(MASTER, a, port, false, this.conf);
      this.infoServer.addServlet("status", "/master-status", MasterStatusServlet.class);
      this.infoServer.addServlet("dump", "/dump", MasterDumpServlet.class);
      this.infoServer.setAttribute(MASTER, this);
      this.infoServer.start();
    }

    // Start the health checker
    if (this.healthCheckChore != null) {
      Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker");
    }

    // Start allowing requests to happen.
    this.rpcServer.openServer();
    this.rpcServerOpen = true;
    if (LOG.isDebugEnabled()) {
      LOG.debug("Started service threads");
    }
  }
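
  // Hedged note: the guard above skips the info server for negative ports, so
  // the web UI can be disabled with, e.g.:
  //   conf.setInt(HConstants.MASTER_INFO_PORT, -1);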

  /**
   * Use this when trying to figure out when it's OK to send in RPCs.  Used by tests.
   * @return True if we have successfully run {@link HBaseServer#openServer()}
   */
  boolean isRpcServerOpen() {
    return this.rpcServerOpen;
  }

  private void stopServiceThreads() {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Stopping service threads");
    }
    if (this.rpcServer != null) this.rpcServer.stop();
    this.rpcServerOpen = false;
    // Clean up and close up shop
    if (this.logCleaner != null) this.logCleaner.interrupt();
    if (this.hfileCleaner != null) this.hfileCleaner.interrupt();

    if (this.infoServer != null) {
      LOG.info("Stopping infoServer");
      try {
        this.infoServer.stop();
      } catch (Exception ex) {
        LOG.error("Failed stopping infoServer", ex);
      }
    }
    if (this.executorService != null) this.executorService.shutdown();
    if (this.healthCheckChore != null) {
      this.healthCheckChore.interrupt();
    }
  }

  private static Thread getAndStartClusterStatusChore(HMaster master) {
    if (master == null || master.balancer == null) {
      return null;
    }
    Chore chore = new ClusterStatusChore(master, master.balancer);
    return Threads.setDaemonThreadRunning(chore.getThread());
  }

  private static Thread getAndStartBalancerChore(final HMaster master) {
    // Start up the load balancer chore
    Chore chore = new BalancerChore(master);
    return Threads.setDaemonThreadRunning(chore.getThread());
  }

  private void stopChores() {
    if (this.balancerChore != null) {
      this.balancerChore.interrupt();
    }
    if (this.clusterStatusChore != null) {
      this.clusterStatusChore.interrupt();
    }
    if (this.catalogJanitorChore != null) {
      this.catalogJanitorChore.interrupt();
    }
    if (this.clusterStatusPublisherChore != null) {
      clusterStatusPublisherChore.interrupt();
    }
  }
1113 
1114   @Override
1115   public RegionServerStartupResponse regionServerStartup(
1116       RpcController controller, RegionServerStartupRequest request) throws ServiceException {
1117     // Register with server manager
1118     try {
1119       InetAddress ia = getRemoteInetAddress(request.getPort(), request.getServerStartCode());
1120       ServerName rs = this.serverManager.regionServerStartup(ia, request.getPort(),
1121         request.getServerStartCode(), request.getServerCurrentTime());
1122 
1123       // Send back some config info
1124       RegionServerStartupResponse.Builder resp = createConfigurationSubset();
1125       NameStringPair.Builder entry = NameStringPair.newBuilder()
1126         .setName(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)
1127         .setValue(rs.getHostname());
1128       resp.addMapEntries(entry.build());
1129 
1130       return resp.build();
1131     } catch (IOException ioe) {
1132       throw new ServiceException(ioe);
1133     }
1134   }
1135 
1136   /**
1137    * @return Get remote side's InetAddress
1138    * @throws UnknownHostException
1139    */
1140   InetAddress getRemoteInetAddress(final int port, final long serverStartCode)
1141   throws UnknownHostException {
1142     // Do it out here in its own little method so can fake an address when
1143     // mocking up in tests.
1144     return HBaseServer.getRemoteIp();
1145   }
1146 
1147   /**
1148    * @return Subset of configuration to pass initializing regionservers: e.g.
1149    * the filesystem to use and root directory to use.
1150    */
1151   protected RegionServerStartupResponse.Builder createConfigurationSubset() {
1152     RegionServerStartupResponse.Builder resp = addConfig(
1153       RegionServerStartupResponse.newBuilder(), HConstants.HBASE_DIR);
1154     return addConfig(resp, "fs.default.name");
1155   }
1156 
1157   private RegionServerStartupResponse.Builder addConfig(
1158       final RegionServerStartupResponse.Builder resp, final String key) {
1159     NameStringPair.Builder entry = NameStringPair.newBuilder()
1160       .setName(key)
1161       .setValue(this.conf.get(key));
1162     resp.addMapEntries(entry.build());
1163     return resp;
1164   }
1165 
1166   @Override
1167   public GetLastFlushedSequenceIdResponse getLastFlushedSequenceId(RpcController controller,
1168       GetLastFlushedSequenceIdRequest request) throws ServiceException {
1169     byte[] regionName = request.getRegionName().toByteArray();
1170     long seqId = serverManager.getLastFlushedSequenceId(regionName);
1171     return ResponseConverter.buildGetLastFlushedSequenceIdResponse(seqId);
1172   }
1173 
1174   @Override
1175   public RegionServerReportResponse regionServerReport(
1176       RpcController controller, RegionServerReportRequest request) throws ServiceException {
1177     try {
1178       HBaseProtos.ServerLoad sl = request.getLoad();
1179       this.serverManager.regionServerReport(ProtobufUtil.toServerName(request.getServer()), new ServerLoad(sl));
1180       if (sl != null && this.metricsMaster != null) {
1181         // Up our metrics.
1182         this.metricsMaster.incrementRequests(sl.getTotalNumberOfRequests());
1183       }
1184     } catch (IOException ioe) {
1185       throw new ServiceException(ioe);
1186     }
1187 
1188     return RegionServerReportResponse.newBuilder().build();
1189   }
1190 
1191   @Override
1192   public ReportRSFatalErrorResponse reportRSFatalError(
1193       RpcController controller, ReportRSFatalErrorRequest request) throws ServiceException {
1194     String errorText = request.getErrorMessage();
1195     ServerName sn = ProtobufUtil.toServerName(request.getServer());
1196     String msg = "Region server " + Bytes.toString(sn.getVersionedBytes()) +
1197       " reported a fatal error:\n" + errorText;
1198     LOG.error(msg);
1199     rsFatals.add(msg);
1200 
1201     return ReportRSFatalErrorResponse.newBuilder().build();
1202   }
1203 
1204   public boolean isMasterRunning() {
1205     return !isStopped();
1206   }
1207 
1208   public IsMasterRunningResponse isMasterRunning(RpcController c, IsMasterRunningRequest req)
1209   throws ServiceException {
1210     return IsMasterRunningResponse.newBuilder().setIsMasterRunning(isMasterRunning()).build();
1211   }
1212 
1213   @Override
1214   public CatalogScanResponse runCatalogScan(RpcController c,
1215       CatalogScanRequest req) throws ServiceException {
1216     try {
1217       return ResponseConverter.buildCatalogScanResponse(catalogJanitorChore.scan());
1218     } catch (IOException ioe) {
1219       throw new ServiceException(ioe);
1220     }
1221   }
1222 
1223   @Override
1224   public EnableCatalogJanitorResponse enableCatalogJanitor(RpcController c,
1225       EnableCatalogJanitorRequest req) throws ServiceException {
1226     return EnableCatalogJanitorResponse.newBuilder().
1227         setPrevValue(catalogJanitorChore.setEnabled(req.getEnable())).build();
1228   }
1229 
1230   @Override
1231   public IsCatalogJanitorEnabledResponse isCatalogJanitorEnabled(RpcController c,
1232       IsCatalogJanitorEnabledRequest req) throws ServiceException {
1233     boolean isEnabled = catalogJanitorChore != null ? catalogJanitorChore.getEnabled() : false;
1234     return IsCatalogJanitorEnabledResponse.newBuilder().setValue(isEnabled).build();
1235   }
1236 
1237   /**
1238    * @return Maximum time, in milliseconds, that a single balancer run may take
1239    */
1240   private int getBalancerCutoffTime() {
1241     int balancerCutoffTime =
1242       getConfiguration().getInt("hbase.balancer.max.balancing", -1);
1243     if (balancerCutoffTime == -1) {
1244       // No time period set so create one -- do half of balancer period.
1245       int balancerPeriod =
1246         getConfiguration().getInt("hbase.balancer.period", 300000);
1247       balancerCutoffTime = balancerPeriod / 2;
1248       // If the computed cutoff is nonsensical (<= 0), fall back to balancerPeriod
1249       if (balancerCutoffTime <= 0) balancerCutoffTime = balancerPeriod;
1250     }
1251     return balancerCutoffTime;
1252   }
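
       /*
        * A worked example of the defaulting above, assuming no explicit
        * "hbase.balancer.max.balancing" is configured:
        *
        *   hbase.balancer.period = 300000 ms (the default)
        *   balancerCutoffTime    = 300000 / 2 = 150000 ms
        *
        * Only if the half-period came out <= 0 (e.g. a configured period of
        * 0 or 1 ms) would the cutoff fall back to the full balancerPeriod.
        */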
1253 
1254   public boolean balance() throws IOException {
1255     // if master not initialized, don't run balancer.
1256     if (!this.initialized) {
1257       LOG.debug("Master has not been initialized, don't run balancer.");
1258       return false;
1259     }
1260     // If balance not true, don't run balancer.
1261     if (!this.loadBalancerTracker.isBalancerOn()) return false;
1262     // Do this call outside of synchronized block.
1263     int maximumBalanceTime = getBalancerCutoffTime();
1264     long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
1265     boolean balancerRan;
1266     synchronized (this.balancer) {
1267       // Only allow one balance run at a time.
1268       if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {
1269         Map<String, RegionState> regionsInTransition =
1270           this.assignmentManager.getRegionStates().getRegionsInTransition();
1271         LOG.debug("Not running balancer because " + regionsInTransition.size() +
1272           " region(s) in transition: " + org.apache.commons.lang.StringUtils.
1273             abbreviate(regionsInTransition.toString(), 256));
1274         return false;
1275       }
1276       if (this.serverManager.areDeadServersInProgress()) {
1277         LOG.debug("Not running balancer because processing dead regionserver(s): " +
1278           this.serverManager.getDeadServers());
1279         return false;
1280       }
1281 
1282       if (this.cpHost != null) {
1283         if (this.cpHost.preBalance()) {
1284           LOG.debug("Coprocessor bypassing balancer request");
1285           return false;
1286         }
1287       }
1288 
1289       Map<String, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =
1290         this.assignmentManager.getRegionStates().getAssignmentsByTable();
1291 
1292       List<RegionPlan> plans = new ArrayList<RegionPlan>();
1293       // Give the balancer the current cluster state.
1294       this.balancer.setClusterStatus(getClusterStatus());
1295       for (Map<ServerName, List<HRegionInfo>> assignments : assignmentsByTable.values()) {
1296         List<RegionPlan> partialPlans = this.balancer.balanceCluster(assignments);
1297         if (partialPlans != null) plans.addAll(partialPlans);
1298       }
1299       int rpCount = 0;  // number of RegionPlans balanced so far
1300       long totalRegPlanExecTime = 0;
1301       balancerRan = plans != null;
1302       if (plans != null && !plans.isEmpty()) {
1303         for (RegionPlan plan: plans) {
1304           LOG.info("balance " + plan);
1305           long balStartTime = System.currentTimeMillis();
1306           this.assignmentManager.balance(plan);
1307           totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;
1308           rpCount++;
1309           if (rpCount < plans.size() &&
1310               // if performing next balance exceeds cutoff time, exit the loop
1311               (System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {
1312             LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +
1313               maximumBalanceTime);
1314             break;
1315           }
1316         }
1317       }
1318       if (this.cpHost != null) {
1319         this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);
1320       }
1321     }
1322     return balancerRan;
1323   }
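
       /*
        * The mid-loop exit above extrapolates from the average plan execution
        * time so far. A worked example with maximumBalanceTime = 150000 ms:
        *
        *   cutoffTime                  = t0 + 150000
        *   after 10 plans, now         = t0 + 140000
        *   totalRegPlanExecTime        = 140000, so average = 14000 ms/plan
        *   projected finish of plan 11 = t0 + 140000 + 14000 = t0 + 154000
        *
        * The projection exceeds cutoffTime, so the loop stops instead of
        * starting an eleventh move that would likely overrun the cutoff.
        */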
1324 
1325   @Override
1326   public BalanceResponse balance(RpcController c, BalanceRequest request) throws ServiceException {
1327     try {
1328       return BalanceResponse.newBuilder().setBalancerRan(balance()).build();
1329     } catch (IOException e) {
1330       throw new ServiceException(e);
1331     }
1332   }
1333 
1334   enum BalanceSwitchMode {
1335     SYNC,
1336     ASYNC
1337   }
1338   /**
1339    * Sets the balancer switch according to the given BalanceSwitchMode.
1340    * @param b new balancer switch value
1341    * @param mode BalanceSwitchMode, SYNC or ASYNC
1342    * @return previous balancer switch value
1343    */
1344   public boolean switchBalancer(final boolean b, BalanceSwitchMode mode) throws IOException {
1345     boolean oldValue = this.loadBalancerTracker.isBalancerOn();
1346     boolean newValue = b;
1347     try {
1348       if (this.cpHost != null) {
1349         newValue = this.cpHost.preBalanceSwitch(newValue);
1350       }
1351       try {
1352         if (mode == BalanceSwitchMode.SYNC) {
1353           synchronized (this.balancer) {
1354             this.loadBalancerTracker.setBalancerOn(newValue);
1355           }
1356         } else {
1357           this.loadBalancerTracker.setBalancerOn(newValue);
1358         }
1359       } catch (KeeperException ke) {
1360         throw new IOException(ke);
1361       }
1362       LOG.info("BalanceSwitch=" + newValue);
1363       if (this.cpHost != null) {
1364         this.cpHost.postBalanceSwitch(oldValue, newValue);
1365       }
1366     } catch (IOException ioe) {
1367       LOG.warn("Error flipping balance switch", ioe);
1368     }
1369     return oldValue;
1370   }
1371 
1372   public boolean synchronousBalanceSwitch(final boolean b) throws IOException {
1373     return switchBalancer(b, BalanceSwitchMode.SYNC);
1374   }
1375 
1376   public boolean balanceSwitch(final boolean b) throws IOException {
1377     return switchBalancer(b, BalanceSwitchMode.ASYNC);
1378   }
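
       /*
        * A client-side sketch of flipping this switch, assuming the
        * HBaseAdmin#setBalancerRunning(boolean, boolean) wrapper of this era,
        * which lands in the setBalancerRunning RPC below:
        *
        *   HBaseAdmin admin = new HBaseAdmin(conf);
        *   try {
        *     boolean previous = admin.setBalancerRunning(false, true); // off, synchronous
        *     LOG.info("Balancer was previously " + (previous ? "on" : "off"));
        *   } finally {
        *     admin.close();
        *   }
        */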
1379 
1380   @Override
1381   public SetBalancerRunningResponse setBalancerRunning(
1382       RpcController controller, SetBalancerRunningRequest req) throws ServiceException {
1383     try {
1384       boolean prevValue = (req.getSynchronous())?
1385         synchronousBalanceSwitch(req.getOn()):balanceSwitch(req.getOn());
1386       return SetBalancerRunningResponse.newBuilder().setPrevBalanceValue(prevValue).build();
1387     } catch (IOException ioe) {
1388       throw new ServiceException(ioe);
1389     }
1390   }
1391 
1392   /**
1393    * Switch for the background CatalogJanitor thread.
1394    * Used for testing. The chore thread will continue to run, but becomes a
1395    * no-op while disabled.
1396    * @param b If false, the catalog janitor won't do anything.
1397    */
1398   public void setCatalogJanitorEnabled(final boolean b) {
1399     this.catalogJanitorChore.setEnabled(b);
1400   }
1401 
1402   @Override
1403   public DispatchMergingRegionsResponse dispatchMergingRegions(
1404       RpcController controller, DispatchMergingRegionsRequest request)
1405       throws ServiceException {
1406     final byte[] encodedNameOfRegionA = request.getRegionA().getValue()
1407         .toByteArray();
1408     final byte[] encodedNameOfRegionB = request.getRegionB().getValue()
1409         .toByteArray();
1410     final boolean forcible = request.getForcible();
1411     if (request.getRegionA().getType() != RegionSpecifierType.ENCODED_REGION_NAME
1412         || request.getRegionB().getType() != RegionSpecifierType.ENCODED_REGION_NAME) {
1413       LOG.warn("mergeRegions specifier type: expected: "
1414           + RegionSpecifierType.ENCODED_REGION_NAME + " actual: region_a="
1415           + request.getRegionA().getType() + ", region_b="
1416           + request.getRegionB().getType());
1417     }
1418     RegionState regionStateA = assignmentManager.getRegionStates()
1419         .getRegionState(Bytes.toString(encodedNameOfRegionA));
1420     RegionState regionStateB = assignmentManager.getRegionStates()
1421         .getRegionState(Bytes.toString(encodedNameOfRegionB));
1422     if (regionStateA == null || regionStateB == null) {
1423       throw new ServiceException(new UnknownRegionException(
1424           Bytes.toStringBinary(regionStateA == null ? encodedNameOfRegionA
1425               : encodedNameOfRegionB)));
1426     }
1427 
1428     if (!forcible && !HRegionInfo.areAdjacent(regionStateA.getRegion(),
1429             regionStateB.getRegion())) {
1430       throw new ServiceException("Unable to merge non-adjacent regions "
1431           + regionStateA.getRegion().getRegionNameAsString() + ", "
1432           + regionStateB.getRegion().getRegionNameAsString()
1433           + " where forcible = " + forcible);
1434     }
1435 
1436     try {
1437       dispatchMergingRegions(regionStateA.getRegion(), regionStateB.getRegion(), forcible);
1438     } catch (IOException ioe) {
1439       throw new ServiceException(ioe);
1440     }
1441 
1442     return DispatchMergingRegionsResponse.newBuilder().build();
1443   }
1444 
1445   @Override
1446   public void dispatchMergingRegions(final HRegionInfo region_a,
1447       final HRegionInfo region_b, final boolean forcible) throws IOException {
1448     checkInitialized();
1449     this.executorService.submit(new DispatchMergingRegionHandler(this,
1450         this.catalogJanitorChore, region_a, region_b, forcible));
1451   }
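
       /*
        * Adjacency, as checked above via HRegionInfo.areAdjacent, means one
        * region's end key equals the other's start key. A sketch with
        * hypothetical split keys "b" and "d":
        *
        *   region A: [ ""  -> "b" )   // first region, empty start key
        *   region B: [ "b" -> "d" )
        *   region C: [ "d" -> ""  )   // last region, empty end key
        *
        * A+B or B+C may be merged without forcible = true; A+C may not,
        * since the merge would leave a hole over [ "b" -> "d" ).
        */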
1452 
1453   @Override
1454   public MoveRegionResponse moveRegion(RpcController controller, MoveRegionRequest req)
1455   throws ServiceException {
1456     final byte [] encodedRegionName = req.getRegion().getValue().toByteArray();
1457     RegionSpecifierType type = req.getRegion().getType();
1458     final byte [] destServerName = (req.hasDestServerName())?
1459       Bytes.toBytes(ProtobufUtil.toServerName(req.getDestServerName()).getServerName()):null;
1460     MoveRegionResponse mrr = MoveRegionResponse.newBuilder().build();
1461 
1462     if (type != RegionSpecifierType.ENCODED_REGION_NAME) {
1463       LOG.warn("moveRegion specifier type: expected: " + RegionSpecifierType.ENCODED_REGION_NAME
1464         + " actual: " + type);
1465     }
1466 
1467     try {
1468       move(encodedRegionName, destServerName);
1469     } catch (HBaseIOException ioe) {
1470       throw new ServiceException(ioe);
1471     }
1472     return mrr;
1473   }
1474 
1475   void move(final byte[] encodedRegionName,
1476       final byte[] destServerName) throws HBaseIOException {
1477     RegionState regionState = assignmentManager.getRegionStates().
1478       getRegionState(Bytes.toString(encodedRegionName));
1479     if (regionState == null) {
1480       throw new UnknownRegionException(Bytes.toStringBinary(encodedRegionName));
1481     }
1482 
1483     HRegionInfo hri = regionState.getRegion();
1484     ServerName dest;
1485     if (destServerName == null || destServerName.length == 0) {
1486       LOG.info("Passed destination servername is null/empty so " +
1487         "choosing a server at random");
1488       final List<ServerName> destServers = this.serverManager.createDestinationServersList(
1489         regionState.getServerName());
1490       dest = balancer.randomAssignment(hri, destServers);
1491     } else {
1492       dest = new ServerName(Bytes.toString(destServerName));
1493       if (dest.equals(regionState.getServerName())) {
1494         LOG.debug("Skipping move of region " + hri.getRegionNameAsString()
1495           + " because region already assigned to the same server " + dest + ".");
1496         return;
1497       }
1498     }
1499 
1500     // Now we can do the move
1501     RegionPlan rp = new RegionPlan(hri, regionState.getServerName(), dest);
1502 
1503     try {
1504       checkInitialized();
1505       if (this.cpHost != null) {
1506         if (this.cpHost.preMove(hri, rp.getSource(), rp.getDestination())) {
1507           return;
1508         }
1509       }
1510       LOG.info("Added move plan " + rp + ", running balancer");
1511       this.assignmentManager.balance(rp);
1512       if (this.cpHost != null) {
1513         this.cpHost.postMove(hri, rp.getSource(), rp.getDestination());
1514       }
1515     } catch (IOException ioe) {
1516       if (ioe instanceof HBaseIOException) {
1517         throw (HBaseIOException)ioe;
1518       }
1519       throw new HBaseIOException(ioe);
1520     }
1521   }
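
       /*
        * A sketch of driving this from a client, assuming the
        * HBaseAdmin#move(byte[], byte[]) wrapper of this era; the encoded
        * region name below is hypothetical. A null destination lets the
        * balancer pick a server at random, as implemented above:
        *
        *   HBaseAdmin admin = new HBaseAdmin(conf);
        *   byte[] encodedName = Bytes.toBytes("5f2a896e2b55a43a85939f3c549dba76");
        *   admin.move(encodedName, null);
        */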
1522 
1523   @Override
1524   public void createTable(HTableDescriptor hTableDescriptor,
1525     byte [][] splitKeys)
1526   throws IOException {
1527     if (!isMasterRunning()) {
1528       throw new MasterNotRunningException();
1529     }
1530 
1531     HRegionInfo [] newRegions = getHRegionInfos(hTableDescriptor, splitKeys);
1532     checkInitialized();
1533     checkCompression(hTableDescriptor);
1534     if (cpHost != null) {
1535       cpHost.preCreateTable(hTableDescriptor, newRegions);
1536     }
1537 
1538     this.executorService.submit(new CreateTableHandler(this,
1539       this.fileSystemManager, hTableDescriptor, conf,
1540       newRegions, this).prepare());
1541     if (cpHost != null) {
1542       cpHost.postCreateTable(hTableDescriptor, newRegions);
1543     }
1544 
1545   }
1546 
1547   private void checkCompression(final HTableDescriptor htd)
1548   throws IOException {
1549     if (!this.masterCheckCompression) return;
1550     for (HColumnDescriptor hcd : htd.getColumnFamilies()) {
1551       checkCompression(hcd);
1552     }
1553   }
1554 
1555   private void checkCompression(final HColumnDescriptor hcd)
1556   throws IOException {
1557     if (!this.masterCheckCompression) return;
1558     CompressionTest.testCompression(hcd.getCompression());
1559     CompressionTest.testCompression(hcd.getCompactionCompression());
1560   }
1561 
1562   @Override
1563   public CreateTableResponse createTable(RpcController controller, CreateTableRequest req)
1564   throws ServiceException {
1565     HTableDescriptor hTableDescriptor = HTableDescriptor.convert(req.getTableSchema());
1566     byte [][] splitKeys = ProtobufUtil.getSplitKeysArray(req);
1567     try {
1568       createTable(hTableDescriptor, splitKeys);
1569     } catch (IOException ioe) {
1570       throw new ServiceException(ioe);
1571     }
1572     return CreateTableResponse.newBuilder().build();
1573   }
1574 
1575   private HRegionInfo[] getHRegionInfos(HTableDescriptor hTableDescriptor,
1576     byte[][] splitKeys) {
1577     HRegionInfo[] hRegionInfos = null;
1578     if (splitKeys == null || splitKeys.length == 0) {
1579       hRegionInfos = new HRegionInfo[]{
1580           new HRegionInfo(hTableDescriptor.getName(), null, null)};
1581     } else {
1582       int numRegions = splitKeys.length + 1;
1583       hRegionInfos = new HRegionInfo[numRegions];
1584       byte[] startKey = null;
1585       byte[] endKey = null;
1586       for (int i = 0; i < numRegions; i++) {
1587         endKey = (i == splitKeys.length) ? null : splitKeys[i];
1588         hRegionInfos[i] =
1589             new HRegionInfo(hTableDescriptor.getName(), startKey, endKey);
1590         startKey = endKey;
1591       }
1592     }
1593     return hRegionInfos;
1594   }
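
       /*
        * A worked example of the layout computed above, assuming two
        * hypothetical split keys "b" and "d" (numRegions = 3):
        *
        *   i = 0: [ null, "b" )
        *   i = 1: [ "b" , "d" )
        *   i = 2: [ "d" , null )
        *
        * The first start key and the last end key are null, so the regions
        * cover the whole key space with no gaps or overlaps.
        */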
1595 
1596   private static boolean isCatalogTable(final byte [] tableName) {
1597     return Bytes.equals(tableName, HConstants.META_TABLE_NAME);
1598   }
1599 
1600   @Override
1601   public void deleteTable(final byte[] tableName) throws IOException {
1602     checkInitialized();
1603     if (cpHost != null) {
1604       cpHost.preDeleteTable(tableName);
1605     }
1606     this.executorService.submit(new DeleteTableHandler(tableName, this, this).prepare());
1607     if (cpHost != null) {
1608       cpHost.postDeleteTable(tableName);
1609     }
1610   }
1611 
1612   @Override
1613   public DeleteTableResponse deleteTable(RpcController controller, DeleteTableRequest request)
1614   throws ServiceException {
1615     try {
1616       deleteTable(request.getTableName().toByteArray());
1617     } catch (IOException ioe) {
1618       throw new ServiceException(ioe);
1619     }
1620     return DeleteTableResponse.newBuilder().build();
1621   }
1622 
1623   /**
1624    * Get the number of regions of the table that have been updated by the alter.
1625    *
1626    * @return Pair describing the progress of the alter: Pair.getFirst is the
1627    *         number of regions yet to be updated; Pair.getSecond is the total
1628    *         number of regions of the table
1629    * @throws IOException
1630    */
1631   @Override
1632   public GetSchemaAlterStatusResponse getSchemaAlterStatus(
1633       RpcController controller, GetSchemaAlterStatusRequest req) throws ServiceException {
1634     // TODO: currently, we query using the table name on the client side. this
1635     // may overlap with other table operations or the table operation may
1636     // have completed before querying this API. We need to refactor to a
1637     // transaction system in the future to avoid these ambiguities.
1638     byte [] tableName = req.getTableName().toByteArray();
1639 
1640     try {
1641       Pair<Integer,Integer> pair = this.assignmentManager.getReopenStatus(tableName);
1642       GetSchemaAlterStatusResponse.Builder ret = GetSchemaAlterStatusResponse.newBuilder();
1643       ret.setYetToUpdateRegions(pair.getFirst());
1644       ret.setTotalRegions(pair.getSecond());
1645       return ret.build();
1646     } catch (IOException ioe) {
1647       throw new ServiceException(ioe);
1648     }
1649   }
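
       /*
        * A sketch of polling this status until an alter has been applied
        * everywhere, assuming the HBaseAdmin#getAlterStatus(byte[]) wrapper
        * of this era, which surfaces the same (yetToUpdate, total) pair:
        *
        *   Pair<Integer, Integer> status;
        *   do {
        *     Thread.sleep(1000);
        *     status = admin.getAlterStatus(tableName);
        *   } while (status.getFirst() > 0); // regions still on the old schema
        */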
1650 
1651   @Override
1652   public void addColumn(final byte[] tableName, final HColumnDescriptor column)
1653       throws IOException {
1654     checkInitialized();
1655     if (cpHost != null) {
1656       if (cpHost.preAddColumn(tableName, column)) {
1657         return;
1658       }
1659     }
1660     //TODO: we should process this (and some others) in an executor
1661     new TableAddFamilyHandler(tableName, column, this, this)
1662       .prepare().process();
1663     if (cpHost != null) {
1664       cpHost.postAddColumn(tableName, column);
1665     }
1666   }
1667 
1668   @Override
1669   public AddColumnResponse addColumn(RpcController controller, AddColumnRequest req)
1670   throws ServiceException {
1671     try {
1672       addColumn(req.getTableName().toByteArray(),
1673         HColumnDescriptor.convert(req.getColumnFamilies()));
1674     } catch (IOException ioe) {
1675       throw new ServiceException(ioe);
1676     }
1677     return AddColumnResponse.newBuilder().build();
1678   }
1679 
1680   @Override
1681   public void modifyColumn(byte[] tableName, HColumnDescriptor descriptor)
1682       throws IOException {
1683     checkInitialized();
1684     checkCompression(descriptor);
1685     if (cpHost != null) {
1686       if (cpHost.preModifyColumn(tableName, descriptor)) {
1687         return;
1688       }
1689     }
1690     new TableModifyFamilyHandler(tableName, descriptor, this, this)
1691       .prepare().process();
1692     if (cpHost != null) {
1693       cpHost.postModifyColumn(tableName, descriptor);
1694     }
1695   }
1696 
1697   @Override
1698   public ModifyColumnResponse modifyColumn(RpcController controller, ModifyColumnRequest req)
1699   throws ServiceException {
1700     try {
1701       modifyColumn(req.getTableName().toByteArray(),
1702         HColumnDescriptor.convert(req.getColumnFamilies()));
1703     } catch (IOException ioe) {
1704       throw new ServiceException(ioe);
1705     }
1706     return ModifyColumnResponse.newBuilder().build();
1707   }
1708 
1709   @Override
1710   public void deleteColumn(final byte[] tableName, final byte[] columnName)
1711       throws IOException {
1712     checkInitialized();
1713     if (cpHost != null) {
1714       if (cpHost.preDeleteColumn(tableName, columnName)) {
1715         return;
1716       }
1717     }
1718     new TableDeleteFamilyHandler(tableName, columnName, this, this).prepare().process();
1719     if (cpHost != null) {
1720       cpHost.postDeleteColumn(tableName, columnName);
1721     }
1722   }
1723 
1724   @Override
1725   public DeleteColumnResponse deleteColumn(RpcController controller, DeleteColumnRequest req)
1726   throws ServiceException {
1727     try {
1728       deleteColumn(req.getTableName().toByteArray(), req.getColumnName().toByteArray());
1729     } catch (IOException ioe) {
1730       throw new ServiceException(ioe);
1731     }
1732     return DeleteColumnResponse.newBuilder().build();
1733   }
1734 
1735   @Override
1736   public void enableTable(final byte[] tableName) throws IOException {
1737     checkInitialized();
1738     if (cpHost != null) {
1739       cpHost.preEnableTable(tableName);
1740     }
1741     this.executorService.submit(new EnableTableHandler(this, tableName,
1742       catalogTracker, assignmentManager, tableLockManager, false).prepare());
1743     if (cpHost != null) {
1744       cpHost.postEnableTable(tableName);
1745     }
1746   }
1747 
1748   @Override
1749   public EnableTableResponse enableTable(RpcController controller, EnableTableRequest request)
1750   throws ServiceException {
1751     try {
1752       enableTable(request.getTableName().toByteArray());
1753     } catch (IOException ioe) {
1754       throw new ServiceException(ioe);
1755     }
1756     return EnableTableResponse.newBuilder().build();
1757   }
1758 
1759   @Override
1760   public void disableTable(final byte[] tableName) throws IOException {
1761     checkInitialized();
1762     if (cpHost != null) {
1763       cpHost.preDisableTable(tableName);
1764     }
1765     this.executorService.submit(new DisableTableHandler(this, tableName,
1766       catalogTracker, assignmentManager, tableLockManager, false).prepare());
1767     if (cpHost != null) {
1768       cpHost.postDisableTable(tableName);
1769     }
1770   }
1771 
1772   @Override
1773   public DisableTableResponse disableTable(RpcController controller, DisableTableRequest request)
1774   throws ServiceException {
1775     try {
1776       disableTable(request.getTableName().toByteArray());
1777     } catch (IOException ioe) {
1778       throw new ServiceException(ioe);
1779     }
1780     return DisableTableResponse.newBuilder().build();
1781   }
1782 
1783   /**
1784    * Return the HRegionInfo and current deployment for the region containing
1785    * the given row. If the region cannot be found, returns null. If it
1786    * is found, but not currently deployed, the second element of the pair
1787    * may be null.
1788    */
1789   Pair<HRegionInfo, ServerName> getTableRegionForRow(
1790       final byte [] tableName, final byte [] rowKey)
1791   throws IOException {
1792     final AtomicReference<Pair<HRegionInfo, ServerName>> result =
1793       new AtomicReference<Pair<HRegionInfo, ServerName>>(null);
1794 
1795     MetaScannerVisitor visitor =
1796       new MetaScannerVisitorBase() {
1797         @Override
1798         public boolean processRow(Result data) throws IOException {
1799           if (data == null || data.size() <= 0) {
1800             return true;
1801           }
1802           Pair<HRegionInfo, ServerName> pair = HRegionInfo.getHRegionInfoAndServerName(data);
1803           if (pair == null) {
1804             return false;
1805           }
1806           if (!Bytes.equals(pair.getFirst().getTableName(), tableName)) {
1807             return false;
1808           }
1809           result.set(pair);
1810           return true;
1811         }
1812     };
1813 
1814     MetaScanner.metaScan(conf, visitor, tableName, rowKey, 1);
1815     return result.get();
1816   }
1817 
1818   @Override
1819   public void modifyTable(final byte[] tableName, final HTableDescriptor descriptor)
1820       throws IOException {
1821     checkInitialized();
1822     checkCompression(descriptor);
1823     if (cpHost != null) {
1824       cpHost.preModifyTable(tableName, descriptor);
1825     }
1826     new ModifyTableHandler(tableName, descriptor, this, this).prepare().process();
1827     if (cpHost != null) {
1828       cpHost.postModifyTable(tableName, descriptor);
1829     }
1830   }
1831 
1832   @Override
1833   public ModifyTableResponse modifyTable(RpcController controller, ModifyTableRequest req)
1834   throws ServiceException {
1835     try {
1836       modifyTable(req.getTableName().toByteArray(),
1837         HTableDescriptor.convert(req.getTableSchema()));
1838     } catch (IOException ioe) {
1839       throw new ServiceException(ioe);
1840     }
1841     return ModifyTableResponse.newBuilder().build();
1842   }
1843 
1844   @Override
1845   public void checkTableModifiable(final byte [] tableName)
1846       throws IOException, TableNotFoundException, TableNotDisabledException {
1847     String tableNameStr = Bytes.toString(tableName);
1848     if (isCatalogTable(tableName)) {
1849       throw new IOException("Can't modify catalog tables");
1850     }
1851     if (!MetaReader.tableExists(getCatalogTracker(), tableNameStr)) {
1852       throw new TableNotFoundException(tableNameStr);
1853     }
1854     if (!getAssignmentManager().getZKTable().
1855         isDisabledTable(Bytes.toString(tableName))) {
1856       throw new TableNotDisabledException(tableName);
1857     }
1858   }
1859 
1860   @Override
1861   public GetClusterStatusResponse getClusterStatus(RpcController controller,
1862       GetClusterStatusRequest req)
1863   throws ServiceException {
1864     GetClusterStatusResponse.Builder response = GetClusterStatusResponse.newBuilder();
1865     response.setClusterStatus(getClusterStatus().convert());
1866     return response.build();
1867   }
1868 
1869   /**
1870    * @return cluster status
1871    */
1872   public ClusterStatus getClusterStatus() {
1873     // Build Set of backup masters from ZK nodes
1874     List<String> backupMasterStrings;
1875     try {
1876       backupMasterStrings = ZKUtil.listChildrenNoWatch(this.zooKeeper,
1877         this.zooKeeper.backupMasterAddressesZNode);
1878     } catch (KeeperException e) {
1879       LOG.warn(this.zooKeeper.prefix("Unable to list backup servers"), e);
1880       backupMasterStrings = new ArrayList<String>(0);
1881     }
1882     List<ServerName> backupMasters = new ArrayList<ServerName>(
1883                                           backupMasterStrings.size());
1884     for (String s: backupMasterStrings) {
1885       try {
1886         byte [] bytes =
1887             ZKUtil.getData(this.zooKeeper, ZKUtil.joinZNode(
1888                 this.zooKeeper.backupMasterAddressesZNode, s));
1889         if (bytes != null) {
1890           ServerName sn;
1891           try {
1892             sn = ServerName.parseFrom(bytes);
1893           } catch (DeserializationException e) {
1894             LOG.warn("Failed to parse, skipping registration of backup server", e);
1895             continue;
1896           }
1897           backupMasters.add(sn);
1898         }
1899       } catch (KeeperException e) {
1900         LOG.warn(this.zooKeeper.prefix("Unable to get information about " +
1901                  "backup servers"), e);
1902       }
1903     }
1904     Collections.sort(backupMasters, new Comparator<ServerName>() {
1905       public int compare(ServerName s1, ServerName s2) {
1906         return s1.getServerName().compareTo(s2.getServerName());
1907       }});
1908 
1909     return new ClusterStatus(VersionInfo.getVersion(),
1910       this.fileSystemManager.getClusterId().toString(),
1911       this.serverManager.getOnlineServers(),
1912       this.serverManager.getDeadServers().copyServerNames(),
1913       this.serverName,
1914       backupMasters,
1915       this.assignmentManager.getRegionStates().getRegionsInTransition(),
1916       this.getCoprocessors(), this.loadBalancerTracker.isBalancerOn());
1917   }
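
       /*
        * A sketch of reading the status assembled above, assuming the plain
        * ClusterStatus accessors for the fields passed to its constructor:
        *
        *   ClusterStatus status = master.getClusterStatus();
        *   LOG.info("version:        " + status.getHBaseVersion());
        *   LOG.info("live servers:   " + status.getServersSize());
        *   LOG.info("dead servers:   " + status.getDeadServers());
        *   LOG.info("backup masters: " + status.getBackupMastersSize());
        *   LOG.info("balancer on:    " + status.isBalancerOn());
        */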
1918 
1919   public String getClusterId() {
1920     if (fileSystemManager == null) {
1921       return "";
1922     }
1923     ClusterId id = fileSystemManager.getClusterId();
1924     if (id == null) {
1925       return "";
1926     }
1927     return id.toString();
1928   }
1929 
1930   /**
1931    * The set of loaded coprocessors is stored in a static set. Since it's
1932    * statically allocated, it does not require that HMaster's cpHost be
1933    * initialized prior to accessing it.
1934    * @return a String representation of the set of names of the loaded
1935    * coprocessors.
1936    */
1937   public static String getLoadedCoprocessors() {
1938     return CoprocessorHost.getLoadedCoprocessors().toString();
1939   }
1940 
1941   /**
1942    * @return timestamp in millis when HMaster was started.
1943    */
1944   public long getMasterStartTime() {
1945     return masterStartTime;
1946   }
1947 
1948   /**
1949    * @return timestamp in millis when HMaster became the active master.
1950    */
1951   public long getMasterActiveTime() {
1952     return masterActiveTime;
1953   }
1954 
1955   /**
1956    * @return array of coprocessor SimpleNames.
1957    */
1958   public String[] getCoprocessors() {
1959     Set<String> masterCoprocessors =
1960         getCoprocessorHost().getCoprocessors();
1961     return masterCoprocessors.toArray(new String[masterCoprocessors.size()]);
1962   }
1963 
1964   @Override
1965   public void abort(final String msg, final Throwable t) {
1966     if (cpHost != null) {
1967       // HBASE-4014: dump a list of loaded coprocessors.
1968       LOG.fatal("Master server abort: loaded coprocessors are: " +
1969           getLoadedCoprocessors());
1970     }
1971 
1972     if (abortNow(msg, t)) {
1973       if (t != null) LOG.fatal(msg, t);
1974       else LOG.fatal(msg);
1975       this.abort = true;
1976       stop("Aborting");
1977     }
1978   }
1979 
1980   /**
1981    * We do the following in a different thread.  If it is not completed
1982    * in time, we will time it out and assume it is not easy to recover.
1983    *
1984    * 1. Create a new ZK session (since our current one is expired).
1985    * 2. Try to become the primary master again.
1986    * 3. Initialize all ZK based system trackers.
1987    * 4. Assign meta (it is already assigned, but we need to update our
1988    * internal memory state to reflect this).
1989    * 5. Process any regions in transition (RIT) found during recovery.
1990    *
1991    * @return True if we could successfully recover from ZK session expiry.
1992    * @throws InterruptedException
1993    * @throws IOException
1994    * @throws KeeperException
1995    * @throws ExecutionException
1996    */
1997   private boolean tryRecoveringExpiredZKSession() throws InterruptedException,
1998       IOException, KeeperException, ExecutionException {
1999 
2000     this.zooKeeper.reconnectAfterExpiration();
2001 
2002     Callable<Boolean> callable = new Callable<Boolean> () {
2003       public Boolean call() throws InterruptedException,
2004           IOException, KeeperException {
2005         MonitoredTask status =
2006           TaskMonitor.get().createStatus("Recovering expired ZK session");
2007         try {
2008           if (!becomeActiveMaster(status)) {
2009             return Boolean.FALSE;
2010           }
2011           serverShutdownHandlerEnabled = false;
2012           initialized = false;
2013           finishInitialization(status, true);
2014           return Boolean.TRUE;
2015         } finally {
2016           status.cleanup();
2017         }
2018       }
2019     };
2020 
2021     long timeout =
2022       conf.getLong("hbase.master.zksession.recover.timeout", 300000);
2023     java.util.concurrent.ExecutorService executor =
2024       Executors.newSingleThreadExecutor();
2025     Future<Boolean> result = executor.submit(callable);
2026     executor.shutdown();
2027     if (executor.awaitTermination(timeout, TimeUnit.MILLISECONDS)
2028         && result.isDone()) {
2029       Boolean recovered = result.get();
2030       if (recovered != null) {
2031         return recovered.booleanValue();
2032       }
2033     }
2034     executor.shutdownNow();
2035     return false;
2036   }
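
       /*
        * The timeout wrapper above is a general pattern: run a risky task on
        * a single-thread executor and give up if it does not finish in time.
        * A self-contained, JDK-only sketch of the same shape (doRecovery is
        * hypothetical):
        *
        *   ExecutorService executor = Executors.newSingleThreadExecutor();
        *   Future<Boolean> result = executor.submit(new Callable<Boolean>() {
        *     public Boolean call() throws Exception {
        *       return doRecovery();
        *     }
        *   });
        *   executor.shutdown(); // no new tasks; the submitted one keeps running
        *   boolean ok = false;
        *   if (executor.awaitTermination(300000, TimeUnit.MILLISECONDS)
        *       && result.isDone()) {
        *     Boolean recovered = result.get();
        *     ok = recovered != null && recovered.booleanValue();
        *   }
        *   executor.shutdownNow(); // interrupt the task if it is still running
        */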
2037 
2038   /**
2039    * Check to see if the current trigger for abort is due to ZooKeeper session
2040    * expiry and, if so, whether we can recover from it.
2041    *
2042    * @param msg Original abort message
2043    * @param t   The cause for current abort request
2044    * @return true if we should proceed with the abort operation, false otherwise.
2045    */
2046   private boolean abortNow(final String msg, final Throwable t) {
2047     if (!this.isActiveMaster) {
2048       return true;
2049     }
2050     if (t != null && t instanceof KeeperException.SessionExpiredException) {
2051       try {
2052         LOG.info("Primary Master trying to recover from ZooKeeper session " +
2053             "expiry.");
2054         return !tryRecoveringExpiredZKSession();
2055       } catch (Throwable newT) {
2056         LOG.error("Primary master encountered unexpected exception while " +
2057             "trying to recover from ZooKeeper session" +
2058             " expiry. Proceeding with server abort.", newT);
2059       }
2060     }
2061     return true;
2062   }
2063 
2064   @Override
2065   public ZooKeeperWatcher getZooKeeper() {
2066     return zooKeeper;
2067   }
2068 
2069   @Override
2070   public MasterCoprocessorHost getCoprocessorHost() {
2071     return cpHost;
2072   }
2073 
2074   @Override
2075   public ServerName getServerName() {
2076     return this.serverName;
2077   }
2078 
2079   @Override
2080   public CatalogTracker getCatalogTracker() {
2081     return catalogTracker;
2082   }
2083 
2084   @Override
2085   public AssignmentManager getAssignmentManager() {
2086     return this.assignmentManager;
2087   }
2088 
2089   @Override
2090   public TableLockManager getTableLockManager() {
2091     return this.tableLockManager;
2092   }
2093 
2094   public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
2095     return rsFatals;
2096   }
2097 
2098   public void shutdown() throws IOException {
2099     if (spanReceiverHost != null) {
2100       spanReceiverHost.closeReceivers();
2101     }
2102     if (cpHost != null) {
2103       cpHost.preShutdown();
2104     }
2105     if (mxBean != null) {
2106       MBeanUtil.unregisterMBean(mxBean);
2107       mxBean = null;
2108     }
2109     if (this.assignmentManager != null) this.assignmentManager.shutdown();
2110     if (this.serverManager != null) this.serverManager.shutdownCluster();
2111     try {
2112       if (this.clusterStatusTracker != null){
2113         this.clusterStatusTracker.setClusterDown();
2114       }
2115     } catch (KeeperException e) {
2116       LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
2117     }
2118   }
2119 
2120   @Override
2121   public ShutdownResponse shutdown(RpcController controller, ShutdownRequest request)
2122   throws ServiceException {
2123     try {
2124       shutdown();
2125     } catch (IOException e) {
2126       throw new ServiceException(e);
2127     }
2128     return ShutdownResponse.newBuilder().build();
2129   }
2130 
2131   public void stopMaster() throws IOException {
2132     if (cpHost != null) {
2133       cpHost.preStopMaster();
2134     }
2135     stop("Stopped by " + Thread.currentThread().getName());
2136   }
2137 
2138   @Override
2139   public StopMasterResponse stopMaster(RpcController controller, StopMasterRequest request)
2140   throws ServiceException {
2141     try {
2142       stopMaster();
2143     } catch (IOException e) {
2144       throw new ServiceException(e);
2145     }
2146     return StopMasterResponse.newBuilder().build();
2147   }
2148 
2149   @Override
2150   public void stop(final String why) {
2151     LOG.info(why);
2152     this.stopped = true;
2153     // Wake up the stopSleeper so we stop immediately
2154     stopSleeper.skipSleepCycle();
2155     // If we are a backup master, we need to interrupt the wait
2156     if (this.activeMasterManager != null) {
2157       synchronized (this.activeMasterManager.clusterHasActiveMaster) {
2158         this.activeMasterManager.clusterHasActiveMaster.notifyAll();
2159       }
2160     }
2161   }
2162 
2163   @Override
2164   public boolean isStopped() {
2165     return this.stopped;
2166   }
2167 
2168   public boolean isAborted() {
2169     return this.abort;
2170   }
2171 
2172   void checkInitialized() throws PleaseHoldException {
2173     if (!this.initialized) {
2174       throw new PleaseHoldException("Master is initializing");
2175     }
2176   }
2177 
2178   /**
2179    * Report whether this master is currently the active master or not.
2180    * If not active master, we are parked on ZK waiting to become active.
2181    *
2182    * This method is used for testing.
2183    *
2184    * @return true if active master, false if not.
2185    */
2186   public boolean isActiveMaster() {
2187     return isActiveMaster;
2188   }
2189 
2190   /**
2191    * Report whether this master has completed with its initialization and is
2192    * ready.  If ready, the master is also the active master.  A standby master
2193    * is never ready.
2194    *
2195    * This method is used for testing.
2196    *
2197    * @return true if master is ready to go, false if not.
2198    */
2199   public boolean isInitialized() {
2200     return initialized;
2201   }
2202 
2203   /**
2204    * serverShutdownHandlerEnabled is set to false before assignMeta completes,
2205    * to prevent the ServerShutdownHandler from running prematurely.
2206    * @return true if assignMeta has completed.
2207    */
2208   public boolean isServerShutdownHandlerEnabled() {
2209     return this.serverShutdownHandlerEnabled;
2210   }
2211 
2212   @Override
2213   public AssignRegionResponse assignRegion(RpcController controller, AssignRegionRequest req)
2214   throws ServiceException {
2215     try {
2216       final byte [] regionName = req.getRegion().getValue().toByteArray();
2217       RegionSpecifierType type = req.getRegion().getType();
2218       AssignRegionResponse arr = AssignRegionResponse.newBuilder().build();
2219 
2220       checkInitialized();
2221       if (type != RegionSpecifierType.REGION_NAME) {
2222         LOG.warn("assignRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2223           + " actual: " + type);
2224       }
2225       HRegionInfo regionInfo = assignmentManager.getRegionStates().getRegionInfo(regionName);
2226       if (regionInfo == null) throw new UnknownRegionException(Bytes.toString(regionName));
2227       if (cpHost != null) {
2228         if (cpHost.preAssign(regionInfo)) {
2229           return arr;
2230         }
2231       }
2232       assignmentManager.assign(regionInfo, true, true);
2233       if (cpHost != null) {
2234         cpHost.postAssign(regionInfo);
2235       }
2236 
2237       return arr;
2238     } catch (IOException ioe) {
2239       throw new ServiceException(ioe);
2240     }
2241   }
2242 
2243   public void assignRegion(HRegionInfo hri) {
2244     assignmentManager.assign(hri, true);
2245   }
2246 
2247   @Override
2248   public UnassignRegionResponse unassignRegion(RpcController controller, UnassignRegionRequest req)
2249   throws ServiceException {
2250     try {
2251       final byte [] regionName = req.getRegion().getValue().toByteArray();
2252       RegionSpecifierType type = req.getRegion().getType();
2253       final boolean force = req.getForce();
2254       UnassignRegionResponse urr = UnassignRegionResponse.newBuilder().build();
2255 
2256       checkInitialized();
2257       if (type != RegionSpecifierType.REGION_NAME) {
2258         LOG.warn("unassignRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2259           + " actual: " + type);
2260       }
2261       Pair<HRegionInfo, ServerName> pair =
2262         MetaReader.getRegion(this.catalogTracker, regionName);
2263       if (pair == null) throw new UnknownRegionException(Bytes.toString(regionName));
2264       HRegionInfo hri = pair.getFirst();
2265       if (cpHost != null) {
2266         if (cpHost.preUnassign(hri, force)) {
2267           return urr;
2268         }
2269       }
2270       if (force) {
2271         this.assignmentManager.regionOffline(hri);
2272         assignRegion(hri);
2273       } else {
2274         this.assignmentManager.unassign(hri, force);
2275       }
2276       if (cpHost != null) {
2277         cpHost.postUnassign(hri, force);
2278       }
2279 
2280       return urr;
2281     } catch (IOException ioe) {
2282       throw new ServiceException(ioe);
2283     }
2284   }
2285 
2286   /**
2287    * Get list of TableDescriptors for requested tables.
2288    * @param controller Unused (set to null).
2289    * @param req GetTableDescriptorsRequest that contains:
2290    * - tableNames: requested tables, or if empty, all are requested
2291    * @return GetTableDescriptorsResponse
2292    * @throws ServiceException
2293    */
2294   public GetTableDescriptorsResponse getTableDescriptors(
2295       RpcController controller, GetTableDescriptorsRequest req) throws ServiceException {
2296     GetTableDescriptorsResponse.Builder builder = GetTableDescriptorsResponse.newBuilder();
2297     if (req.getTableNamesCount() == 0) {
2298       // request for all TableDescriptors
2299       Map<String, HTableDescriptor> descriptors = null;
2300       try {
2301         descriptors = this.tableDescriptors.getAll();
2302       } catch (IOException e) {
2303         LOG.warn("Failed getting all descriptors", e);
2304       }
2305       if (descriptors != null) {
2306         for (HTableDescriptor htd : descriptors.values()) {
2307           builder.addTableSchema(htd.convert());
2308         }
2309       }
2310     } else {
2312       for (String s: req.getTableNamesList()) {
2313         HTableDescriptor htd = null;
2314         try {
2315           htd = this.tableDescriptors.get(s);
2316         } catch (IOException e) {
2317           LOG.warn("Failed getting descriptor for " + s, e);
2318         }
2319         if (htd == null) continue;
2320         builder.addTableSchema(htd.convert());
2321       }
2322     }
2323     return builder.build();
2324   }
2325 
2326   /**
2327    * Compute the average load across all region servers.
2328    * Currently, this uses a very naive computation - just uses the number of
2329    * regions being served, ignoring stats about number of requests.
2330    * @return the average load
2331    */
2332   public double getAverageLoad() {
2333     if (this.assignmentManager == null) {
2334       return 0;
2335     }
2336 
2337     RegionStates regionStates = this.assignmentManager.getRegionStates();
2338     if (regionStates == null) {
2339       return 0;
2340     }
2341     return regionStates.getAverageLoad();
2342   }
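
       /*
        * "Average load" is simply regions per live server: with 300 regions
        * deployed across 3 regionservers, getAverageLoad() returns 100.0,
        * regardless of how many requests each region actually serves.
        */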
2343 
2344   /**
2345    * Offline specified region from master's in-memory state. It will not attempt
2346    * to reassign the region as unassign does.
2347    *
2348    * This is a special method that should be used by experts or hbck.
2349    *
2350    */
2351   @Override
2352   public OfflineRegionResponse offlineRegion(RpcController controller, OfflineRegionRequest request)
2353   throws ServiceException {
2354     final byte [] regionName = request.getRegion().getValue().toByteArray();
2355     RegionSpecifierType type = request.getRegion().getType();
2356     if (type != RegionSpecifierType.REGION_NAME) {
2357       LOG.warn("offlineRegion specifier type: expected: " + RegionSpecifierType.REGION_NAME
2358         + " actual: " + type);
2359     }
2360 
2361     try {
2362       Pair<HRegionInfo, ServerName> pair =
2363         MetaReader.getRegion(this.catalogTracker, regionName);
2364       if (pair == null) throw new UnknownRegionException(Bytes.toStringBinary(regionName));
2365       HRegionInfo hri = pair.getFirst();
2366       if (cpHost != null) {
2367         cpHost.preRegionOffline(hri);
2368       }
2369       this.assignmentManager.regionOffline(hri);
2370       if (cpHost != null) {
2371         cpHost.postRegionOffline(hri);
2372       }
2373     } catch (IOException ioe) {
2374       throw new ServiceException(ioe);
2375     }
2376     return OfflineRegionResponse.newBuilder().build();
2377   }
2378 
2379   @Override
2380   public boolean registerService(Service instance) {
2381     /*
2382      * No stacking of instances is allowed for a single service name
2383      */
2384     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
2385     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
2386       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
2387           " already registered, rejecting request from "+instance
2388       );
2389       return false;
2390     }
2391 
2392     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
2393     if (LOG.isDebugEnabled()) {
2394       LOG.debug("Registered master coprocessor service: service="+serviceDesc.getFullName());
2395     }
2396     return true;
2397   }
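
       /*
        * A sketch of registering a coprocessor endpoint, assuming a generated
        * protobuf service whose implementation class is hypothetical:
        *
        *   Service impl = new MyMasterServiceImpl(); // extends the generated stub
        *   boolean registered = master.registerService(impl);
        *   // a second registration under the same fully-qualified service
        *   // name returns false, as enforced above
        */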
2398 
2399   @Override
2400   public ClientProtos.CoprocessorServiceResponse execMasterService(final RpcController controller,
2401       final ClientProtos.CoprocessorServiceRequest request) throws ServiceException {
2402     try {
2403       ServerRpcController execController = new ServerRpcController();
2404 
2405       ClientProtos.CoprocessorServiceCall call = request.getCall();
2406       String serviceName = call.getServiceName();
2407       String methodName = call.getMethodName();
2408       if (!coprocessorServiceHandlers.containsKey(serviceName)) {
2409         throw new UnknownProtocolException(null,
2410             "No registered master coprocessor service found for name "+serviceName);
2411       }
2412 
2413       Service service = coprocessorServiceHandlers.get(serviceName);
2414       Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
2415       Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
2416       if (methodDesc == null) {
2417         throw new UnknownProtocolException(service.getClass(),
2418             "Unknown method "+methodName+" called on master service "+serviceName);
2419       }
2420 
2421       //invoke the method
2422       Message execRequest = service.getRequestPrototype(methodDesc).newBuilderForType()
2423           .mergeFrom(call.getRequest()).build();
2424       final Message.Builder responseBuilder =
2425           service.getResponsePrototype(methodDesc).newBuilderForType();
2426       service.callMethod(methodDesc, execController, execRequest, new RpcCallback<Message>() {
2427         @Override
2428         public void run(Message message) {
2429           if (message != null) {
2430             responseBuilder.mergeFrom(message);
2431           }
2432         }
2433       });
2434       Message execResult = responseBuilder.build();
2435 
2436       if (execController.getFailedOn() != null) {
2437         throw execController.getFailedOn();
2438       }
2439       ClientProtos.CoprocessorServiceResponse.Builder builder =
2440           ClientProtos.CoprocessorServiceResponse.newBuilder();
2441       builder.setRegion(RequestConverter.buildRegionSpecifier(
2442           RegionSpecifierType.REGION_NAME, HConstants.EMPTY_BYTE_ARRAY));
2443       builder.setValue(
2444           builder.getValueBuilder().setName(execResult.getClass().getName())
2445               .setValue(execResult.toByteString()));
2446       return builder.build();
2447     } catch (IOException ie) {
2448       throw new ServiceException(ie);
2449     }
2450   }
2451 
2452   /**
2453    * Utility for constructing an instance of the passed HMaster class.
2454    * @param masterClass HMaster class (or subclass) to instantiate
2455    * @param conf Configuration to pass to the constructor
2456    * @return HMaster instance.
2457    */
2458   public static HMaster constructMaster(Class<? extends HMaster> masterClass,
2459       final Configuration conf)  {
2460     try {
2461       Constructor<? extends HMaster> c =
2462         masterClass.getConstructor(Configuration.class);
2463       return c.newInstance(conf);
2464     } catch (InvocationTargetException ite) {
2465       Throwable target = ite.getTargetException() != null?
2466         ite.getTargetException(): ite;
2467       if (target.getCause() != null) target = target.getCause();
2468       throw new RuntimeException("Failed construction of Master: " +
2469         masterClass.toString(), target);
2470     } catch (Exception e) {
2471       throw new RuntimeException("Failed construction of Master: " +
2472         masterClass.toString() + ((e.getCause() != null)?
2473           e.getCause().getMessage(): ""), e);
2474     }
2475   }
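
       /*
        * A sketch of the reflective construction above:
        *
        *   Configuration conf = HBaseConfiguration.create();
        *   HMaster master = HMaster.constructMaster(HMaster.class, conf);
        *
        * Any subclass exposing a (Configuration) constructor may be passed in
        * place of HMaster.class, which is how tests plug in custom masters.
        */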
2476 
2477   /**
2478    * @see org.apache.hadoop.hbase.master.HMasterCommandLine
2479    */
2480   public static void main(String [] args) {
2481     VersionInfo.logVersion();
2482     new HMasterCommandLine(HMaster.class).doMain(args);
2483   }
2484 
2485   public HFileCleaner getHFileCleaner() {
2486     return this.hfileCleaner;
2487   }
2488 
2489   /**
2490    * Exposed for TESTING!
2491    * @return the underlying snapshot manager
2492    */
2493   public SnapshotManager getSnapshotManagerForTesting() {
2494     return this.snapshotManager;
2495   }
2496 
2497   /**
2498    * Triggers an asynchronous attempt to take a snapshot.
2499    * {@inheritDoc}
2500    */
2501   @Override
2502   public TakeSnapshotResponse snapshot(RpcController controller, TakeSnapshotRequest request)
2503       throws ServiceException {
2504     try {
2505       this.snapshotManager.checkSnapshotSupport();
2506     } catch (UnsupportedOperationException e) {
2507       throw new ServiceException(e);
2508     }
2509 
2510     LOG.debug("Submitting snapshot request for: " +
2511         ClientSnapshotDescriptionUtils.toString(request.getSnapshot()));
2512     // get the snapshot information
2513     SnapshotDescription snapshot = SnapshotDescriptionUtils.validate(request.getSnapshot(),
2514       this.conf);
2515     try {
2516       snapshotManager.takeSnapshot(snapshot);
2517     } catch (IOException e) {
2518       throw new ServiceException(e);
2519     }
2520 
2521     // send back the max amount of time the client should wait for the snapshot to complete
2522     long waitTime = SnapshotDescriptionUtils.getMaxMasterTimeout(conf, snapshot.getType(),
2523       SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME);
2524     return TakeSnapshotResponse.newBuilder().setExpectedTimeout(waitTime).build();
2525   }
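
       /*
        * A client-side sketch of requesting a snapshot, assuming the
        * HBaseAdmin#snapshot(String, String) wrapper of this era, which
        * issues this RPC and then polls the isSnapshotDone RPC below until
        * completion; the snapshot and table names are hypothetical:
        *
        *   HBaseAdmin admin = new HBaseAdmin(conf);
        *   admin.snapshot("mySnapshot", "myTable");
        */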
2526 
2527   /**
2528    * List the currently available/stored snapshots. Any in-progress snapshots are ignored.
2529    */
2530   @Override
2531   public ListSnapshotResponse getCompletedSnapshots(RpcController controller,
2532       ListSnapshotRequest request) throws ServiceException {
2533     try {
2534       ListSnapshotResponse.Builder builder = ListSnapshotResponse.newBuilder();
2535       List<SnapshotDescription> snapshots = snapshotManager.getCompletedSnapshots();
2536 
2537       // convert to protobuf
2538       for (SnapshotDescription snapshot : snapshots) {
2539         builder.addSnapshots(snapshot);
2540       }
2541       return builder.build();
2542     } catch (IOException e) {
2543       throw new ServiceException(e);
2544     }
2545   }
2546 
2547   /**
2548    * Execute Delete Snapshot operation.
2549    * @return DeleteSnapshotResponse (a protobuf wrapped void) if the snapshot existed and was
2550    *    deleted properly.
2551    * @throws ServiceException wrapping SnapshotDoesNotExistException if specified snapshot did not
2552    *    exist.
2553    */
2554   @Override
2555   public DeleteSnapshotResponse deleteSnapshot(RpcController controller,
2556       DeleteSnapshotRequest request) throws ServiceException {
2557     try {
2558       this.snapshotManager.checkSnapshotSupport();
2559     } catch (UnsupportedOperationException e) {
2560       throw new ServiceException(e);
2561     }
2562 
2563     try {
2564       snapshotManager.deleteSnapshot(request.getSnapshot());
2565       return DeleteSnapshotResponse.newBuilder().build();
2566     } catch (IOException e) {
2567       throw new ServiceException(e);
2568     }
2569   }
2570 
2571   /**
2572    * Checks if the specified snapshot is done.
2573    * @return true if the snapshot is in the file system, ready to use;
2574    *   false if the snapshot is still in the process of completing
2575    * @throws ServiceException wrapping UnknownSnapshotException if invalid snapshot, or
2576    *  a wrapped HBaseSnapshotException with progress failure reason.
2577    */
2578   @Override
2579   public IsSnapshotDoneResponse isSnapshotDone(RpcController controller,
2580       IsSnapshotDoneRequest request) throws ServiceException {
2581     LOG.debug("Checking to see if snapshot from request: " +
2582         ClientSnapshotDescriptionUtils.toString(request.getSnapshot()) + " is done");
2583     try {
2584       IsSnapshotDoneResponse.Builder builder = IsSnapshotDoneResponse.newBuilder();
2585       boolean done = snapshotManager.isSnapshotDone(request.getSnapshot());
2586       builder.setDone(done);
2587       return builder.build();
2588     } catch (IOException e) {
2589       throw new ServiceException(e);
2590     }
2591   }
2592 
2593   /**
2594    * Execute Restore/Clone snapshot operation.
2595    *
2596    * <p>If the specified table exists, a "Restore" is executed, replacing the table
2597    * schema and directory data with the content of the snapshot.
2598    * The table must be disabled, or an UnsupportedOperationException will be thrown.
2599    *
2600    * <p>If the table doesn't exist, a "Clone" is executed: a new table is created
2601    * using the schema at the time of the snapshot, and the content of the snapshot.
2602    *
2603    * <p>The restore/clone operation does not require copying HFiles. Since HFiles
2604    * are immutable the table can point to and use the same files as the original one.
2605    */
2606   @Override
2607   public RestoreSnapshotResponse restoreSnapshot(RpcController controller,
2608       RestoreSnapshotRequest request) throws ServiceException {
2609     try {
2610       this.snapshotManager.checkSnapshotSupport();
2611     } catch (UnsupportedOperationException e) {
2612       throw new ServiceException(e);
2613     }
2614 
2615     try {
2616       SnapshotDescription reqSnapshot = request.getSnapshot();
2617       snapshotManager.restoreSnapshot(reqSnapshot);
2618       return RestoreSnapshotResponse.newBuilder().build();
2619     } catch (IOException e) {
2620       throw new ServiceException(e);
2621     }
2622   }
2623 
2624   /**
2625    * This method is not exposed to the user; it is used internally by HBaseAdmin
2626    * to verify whether the restore is complete.
2627    *
2628    * No exceptions are thrown if the restore is not running; the result will be "done".
2629    * No exceptions are thrown if the restore is not running, the result will be "done".
2630    *
2631    * @return done: <tt>true</tt> if the restore/clone operation is completed.
2632    * @throws ServiceException if the operation failed.
2633    */
2634   @Override
2635   public IsRestoreSnapshotDoneResponse isRestoreSnapshotDone(RpcController controller,
2636       IsRestoreSnapshotDoneRequest request) throws ServiceException {
2637     try {
2638       SnapshotDescription snapshot = request.getSnapshot();
2639       IsRestoreSnapshotDoneResponse.Builder builder = IsRestoreSnapshotDoneResponse.newBuilder();
2640       boolean isRestoring = snapshotManager.isRestoringTable(snapshot);
2641       builder.setDone(!isRestoring);
2642       return builder.build();
2643     } catch (IOException e) {
2644       throw new ServiceException(e);
2645     }
2646   }
2647 
2648   private boolean isHealthCheckerConfigured() {
2649     String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
2650     return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
2651   }
2652 }