1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.IOException;
22  import java.io.InterruptedIOException;
23  import java.lang.Thread.UncaughtExceptionHandler;
24  import java.lang.management.ManagementFactory;
25  import java.lang.management.MemoryUsage;
26  import java.lang.reflect.Constructor;
27  import java.net.BindException;
28  import java.net.InetAddress;
29  import java.net.InetSocketAddress;
30  import java.util.ArrayList;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.Comparator;
34  import java.util.HashMap;
35  import java.util.HashSet;
36  import java.util.Iterator;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Map.Entry;
40  import java.util.Set;
41  import java.util.SortedMap;
42  import java.util.TreeMap;
43  import java.util.TreeSet;
44  import java.util.concurrent.ConcurrentHashMap;
45  import java.util.concurrent.ConcurrentMap;
46  import java.util.concurrent.ConcurrentSkipListMap;
47  import java.util.concurrent.atomic.AtomicBoolean;
48  import java.util.concurrent.atomic.AtomicReference;
49  import java.util.concurrent.locks.ReentrantReadWriteLock;
50  
51  import javax.management.MalformedObjectNameException;
52  import javax.management.ObjectName;
53  import javax.servlet.http.HttpServlet;
54  
55  import org.apache.commons.lang.math.RandomUtils;
56  import org.apache.commons.logging.Log;
57  import org.apache.commons.logging.LogFactory;
58  import org.apache.hadoop.conf.Configuration;
59  import org.apache.hadoop.fs.FileSystem;
60  import org.apache.hadoop.fs.Path;
61  import org.apache.hadoop.hbase.Chore;
62  import org.apache.hadoop.hbase.ClockOutOfSyncException;
63  import org.apache.hadoop.hbase.CoordinatedStateManager;
64  import org.apache.hadoop.hbase.CoordinatedStateManagerFactory;
65  import org.apache.hadoop.hbase.HBaseConfiguration;
66  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
67  import org.apache.hadoop.hbase.HConstants;
68  import org.apache.hadoop.hbase.HRegionInfo;
69  import org.apache.hadoop.hbase.HealthCheckChore;
70  import org.apache.hadoop.hbase.MetaTableAccessor;
71  import org.apache.hadoop.hbase.NotServingRegionException;
72  import org.apache.hadoop.hbase.RemoteExceptionHandler;
73  import org.apache.hadoop.hbase.ServerName;
74  import org.apache.hadoop.hbase.Stoppable;
75  import org.apache.hadoop.hbase.TableDescriptors;
76  import org.apache.hadoop.hbase.TableName;
77  import org.apache.hadoop.hbase.YouAreDeadException;
78  import org.apache.hadoop.hbase.ZNodeClearer;
79  import org.apache.hadoop.hbase.classification.InterfaceAudience;
80  import org.apache.hadoop.hbase.client.ClusterConnection;
81  import org.apache.hadoop.hbase.client.ConnectionFactory;
82  import org.apache.hadoop.hbase.client.ConnectionUtils;
83  import org.apache.hadoop.hbase.conf.ConfigurationManager;
84  import org.apache.hadoop.hbase.coordination.BaseCoordinatedStateManager;
85  import org.apache.hadoop.hbase.coordination.CloseRegionCoordination;
86  import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
87  import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
88  import org.apache.hadoop.hbase.exceptions.RegionMovedException;
89  import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
90  import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
91  import org.apache.hadoop.hbase.executor.ExecutorService;
92  import org.apache.hadoop.hbase.executor.ExecutorType;
93  import org.apache.hadoop.hbase.fs.HFileSystem;
94  import org.apache.hadoop.hbase.http.InfoServer;
95  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
96  import org.apache.hadoop.hbase.ipc.RpcClient;
97  import org.apache.hadoop.hbase.ipc.RpcClientFactory;
98  import org.apache.hadoop.hbase.ipc.RpcServerInterface;
99  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
100 import org.apache.hadoop.hbase.ipc.ServerRpcController;
101 import org.apache.hadoop.hbase.master.HMaster;
102 import org.apache.hadoop.hbase.master.RegionState.State;
103 import org.apache.hadoop.hbase.master.TableLockManager;
104 import org.apache.hadoop.hbase.procedure.RegionServerProcedureManagerHost;
105 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
106 import org.apache.hadoop.hbase.protobuf.RequestConverter;
107 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
108 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
109 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
110 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
111 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos;
112 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
113 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.Coprocessor;
114 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
115 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionServerInfo;
116 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier;
117 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier.RegionSpecifierType;
118 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
119 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
120 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
121 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
122 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStatusService;
123 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
124 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
125 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorRequest;
126 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
127 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
128 import org.apache.hadoop.hbase.regionserver.compactions.CompactionProgress;
129 import org.apache.hadoop.hbase.regionserver.handler.CloseMetaHandler;
130 import org.apache.hadoop.hbase.regionserver.handler.CloseRegionHandler;
131 import org.apache.hadoop.hbase.wal.DefaultWALProvider;
132 import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL;
133 import org.apache.hadoop.hbase.wal.WAL;
134 import org.apache.hadoop.hbase.wal.WALFactory;
135 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
136 import org.apache.hadoop.hbase.replication.regionserver.ReplicationLoad;
137 import org.apache.hadoop.hbase.security.UserProvider;
138 import org.apache.hadoop.hbase.trace.SpanReceiverHost;
139 import org.apache.hadoop.hbase.util.Addressing;
140 import org.apache.hadoop.hbase.util.ByteStringer;
141 import org.apache.hadoop.hbase.util.Bytes;
142 import org.apache.hadoop.hbase.util.CompressionTest;
143 import org.apache.hadoop.hbase.util.ConfigUtil;
144 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
145 import org.apache.hadoop.hbase.util.FSTableDescriptors;
146 import org.apache.hadoop.hbase.util.FSUtils;
147 import org.apache.hadoop.hbase.util.HasThread;
148 import org.apache.hadoop.hbase.util.JSONBean;
149 import org.apache.hadoop.hbase.util.JvmPauseMonitor;
150 import org.apache.hadoop.hbase.util.Sleeper;
151 import org.apache.hadoop.hbase.util.Threads;
152 import org.apache.hadoop.hbase.util.VersionInfo;
153 import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
154 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
155 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
156 import org.apache.hadoop.hbase.zookeeper.RecoveringRegionWatcher;
157 import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
158 import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
159 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
160 import org.apache.hadoop.hbase.zookeeper.ZooKeeperNodeTracker;
161 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
162 import org.apache.hadoop.ipc.RemoteException;
163 import org.apache.hadoop.metrics.util.MBeanUtil;
164 import org.apache.hadoop.util.ReflectionUtils;
165 import org.apache.hadoop.util.StringUtils;
166 import org.apache.zookeeper.KeeperException;
167 import org.apache.zookeeper.KeeperException.NoNodeException;
168 import org.apache.zookeeper.data.Stat;
169 
170 import com.google.common.annotations.VisibleForTesting;
171 import com.google.common.collect.Maps;
172 import com.google.protobuf.BlockingRpcChannel;
173 import com.google.protobuf.Descriptors;
174 import com.google.protobuf.Message;
175 import com.google.protobuf.RpcCallback;
176 import com.google.protobuf.RpcController;
177 import com.google.protobuf.Service;
178 import com.google.protobuf.ServiceException;
179 
180 /**
181  * HRegionServer makes a set of HRegions available to clients. It checks in with
182  * the HMaster. There are many HRegionServers in a single HBase deployment.
183  */
184 @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
185 @SuppressWarnings("deprecation")
186 public class HRegionServer extends HasThread implements
187     RegionServerServices, LastSequenceId {
188 
189   public static final Log LOG = LogFactory.getLog(HRegionServer.class);
190 
191   /*
192    * Strings to be used in forming the exception message for
193    * RegionsAlreadyInTransitionException.
194    */
195   protected static final String OPEN = "OPEN";
196   protected static final String CLOSE = "CLOSE";
197 
198   //RegionName vs current action in progress
199   //true - if open region action in progress
200   //false - if close region action in progress
201   protected final ConcurrentMap<byte[], Boolean> regionsInTransitionInRS =
202     new ConcurrentSkipListMap<byte[], Boolean>(Bytes.BYTES_COMPARATOR);
203 
204   // Cache flushing
205   protected MemStoreFlusher cacheFlusher;
206 
207   protected HeapMemoryManager hMemManager;
208 
209   /**
210    * Cluster connection to be shared by services.
211    * Initialized at server startup and closed when server shuts down.
212    * Clients must never close it explicitly.
213    */
214   protected ClusterConnection clusterConnection;
215 
216   /*
217    * Long-lived meta table locator, created when the server starts and stopped when the
218    * server shuts down. EventHandlers should use this locator to perform meta-related
219    * operations. The primary reason for keeping it in a field is to make it mockable
220    * for tests.
221    */
222   protected MetaTableLocator metaTableLocator;
223 
224   // Watch if a region is out of recovering state from ZooKeeper
225   @SuppressWarnings("unused")
226   private RecoveringRegionWatcher recoveringRegionWatcher;
227 
228   /**
229    * Go here to get table descriptors.
230    */
231   protected TableDescriptors tableDescriptors;
232 
233   // Replication services. If no replication, this handler will be null.
234   protected ReplicationSourceService replicationSourceHandler;
235   protected ReplicationSinkService replicationSinkHandler;
236 
237   // Compactions
238   public CompactSplitThread compactSplitThread;
239 
240   /**
241    * Map of regions currently being served by this region server. Key is the
242    * encoded region name.  All access should be synchronized.
243    */
244   protected final Map<String, HRegion> onlineRegions =
245     new ConcurrentHashMap<String, HRegion>();
246 
247   /**
248    * Map of encoded region names to the DataNode locations they should be hosted on.
249    * We store the value as InetSocketAddress since this is used only in the HDFS
250    * API (create() that takes favored nodes as hints for placing file blocks).
251    * We could have used ServerName here as the value class, but we'd need to
252    * convert it to InetSocketAddress at some point before the HDFS API call, and
253    * it seems a bit weird to store ServerName since ServerName refers to RegionServers
254    * and here we really mean DataNode locations.
255    */
256   protected final Map<String, InetSocketAddress[]> regionFavoredNodesMap =
257       new ConcurrentHashMap<String, InetSocketAddress[]>();
258 
259   /**
260    * Set of regions currently in recovering state, which means they can accept writes (edits from
261    * a previously failed region server) but not reads. A recovering region is also an online region.
262    */
263   protected final Map<String, HRegion> recoveringRegions = Collections
264       .synchronizedMap(new HashMap<String, HRegion>());
265 
266   // Leases
267   protected Leases leases;
268 
269   // Instance of the hbase executor service.
270   protected ExecutorService service;
271 
272   // If false, the file system has become unavailable
273   protected volatile boolean fsOk;
274   protected HFileSystem fs;
275 
276   // Set when a report to the master comes back with a message asking us to
277   // shut down. Also set by a call to stop() when debugging or running unit tests
278   // of HRegionServer in isolation.
279   private volatile boolean stopped = false;
280 
281   // Go down hard. Used if file system becomes unavailable and also in
282   // debugging and unit tests.
283   private volatile boolean abortRequested;
284 
285   ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<String, Integer>();
286 
287   // A state before we go into stopped state.  At this stage we're closing user
288   // space regions.
289   private boolean stopping = false;
290 
291   private volatile boolean killed = false;
292 
293   protected final Configuration conf;
294 
295   private Path rootDir;
296 
297   protected final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
298 
299   final int numRetries;
300   protected final int threadWakeFrequency;
301   protected final int msgInterval;
302 
303   protected final int numRegionsToReport;
304 
305   // Stub to do region server status calls against the master.
306   private volatile RegionServerStatusService.BlockingInterface rssStub;
307   // RPC client. Used to make the stub above that does region server status checking.
308   RpcClient rpcClient;
309 
310   private UncaughtExceptionHandler uncaughtExceptionHandler;
311 
312   // Info server. Default access so it can be used by unit tests. REGIONSERVER
313   // is the name of the webapp and the attribute name used when stuffing this
314   // instance into the web context.
315   protected InfoServer infoServer;
316   private JvmPauseMonitor pauseMonitor;
317 
318   /** region server process name */
319   public static final String REGIONSERVER = "regionserver";
320 
321   MetricsRegionServer metricsRegionServer;
322   private SpanReceiverHost spanReceiverHost;
323 
324   /*
325    * Check for compaction requests.
326    */
327   Chore compactionChecker;
328 
329   /*
330    * Check for flushes
331    */
332   Chore periodicFlusher;
333 
334   protected volatile WALFactory walFactory;
335 
336   // WAL roller. log is protected rather than private to avoid
337   // eclipse warning when accessed by inner classes
338   final LogRoller walRoller;
339   // Lazily initialized if this RegionServer hosts a meta table.
340   final AtomicReference<LogRoller> metawalRoller = new AtomicReference<LogRoller>();
341 
342   // flag set after we're done setting up server threads
343   final AtomicBoolean online = new AtomicBoolean(false);
344 
345   // zookeeper connection and watcher
346   protected ZooKeeperWatcher zooKeeper;
347 
348   // master address tracker
349   private MasterAddressTracker masterAddressTracker;
350 
351   // Cluster Status Tracker
352   protected ClusterStatusTracker clusterStatusTracker;
353 
354   // Log Splitting Worker
355   private SplitLogWorker splitLogWorker;
356 
357   // A sleeper that sleeps for msgInterval.
358   protected final Sleeper sleeper;
359 
360   private final int operationTimeout;
361 
362   private final RegionServerAccounting regionServerAccounting;
363 
364   // Cache configuration and block cache reference
365   protected CacheConfig cacheConfig;
366 
367   /** The health check chore. */
368   private HealthCheckChore healthCheckChore;
369 
370   /** The nonce manager chore. */
371   private Chore nonceManagerChore;
372 
373   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
374 
375   /**
376    * The server name the Master sees us as.  It's made from the hostname the
377    * master passes us, the port, and the server startcode. Gets set after registration
378    * against the Master.
379    */
380   protected ServerName serverName;
381 
382   /**
383    * This server's startcode.
384    */
385   protected final long startcode;
386 
387   /**
388    * Unique identifier for the cluster we are a part of.
389    */
390   private String clusterId;
391 
392   /**
393    * MX Bean for RegionServerInfo
394    */
395   private ObjectName mxBean = null;
396 
397   /**
398    * Chore to periodically clean the moved region list
399    */
400   private MovedRegionsCleaner movedRegionsCleaner;
401 
402   // chore for refreshing store files for secondary regions
403   private StorefileRefresherChore storefileRefresher;
404 
405   private RegionServerCoprocessorHost rsHost;
406 
407   private RegionServerProcedureManagerHost rspmHost;
408 
409   // Table-level lock manager for locking tables during region operations
410   protected TableLockManager tableLockManager;
411 
412   /**
413    * Nonce manager. Nonces are used to make operations like increment and append idempotent
414    * in the case where client doesn't receive the response from a successful operation and
415    * retries. We track the successful ops for some time via a nonce sent by client and handle
416    * duplicate operations (currently, by failing them; in future we might use MVCC to return
417    * result). Nonces are also recovered from WAL during recovery; however, the caveats (from
418    * HBASE-3787) are:
419    * - WAL recovery is optimized, and under high load we won't read nearly nonce-timeout worth
420    *   of past records. If we don't read the records, we don't read and recover the nonces.
421    *   Some WALs within nonce-timeout at recovery may not even be present due to rolling/cleanup.
422    * - There's no WAL recovery during normal region move, so nonces will not be transferred.
423    * We can have separate additional "Nonce WAL". It will just contain bunch of numbers and
424    * won't be flushed on main path - because WAL itself also contains nonces, if we only flush
425    * it before memstore flush, for a given nonce we will either see it in the WAL (if it was
426    * never flushed to disk, it will be part of recovery), or we'll see it as part of the nonce
427    * log (or both occasionally, which doesn't matter). Nonce log file can be deleted after the
428    * latest nonce in it expired. It can also be recovered during move.
429    */
430   final ServerNonceManager nonceManager;
431 
432   private UserProvider userProvider;
433 
434   protected final RSRpcServices rpcServices;
435 
436   protected BaseCoordinatedStateManager csm;
437 
438   private final boolean useZKForAssignment;
439 
440   /**
441    * Configuration manager is used to register/deregister and notify the configuration observers
442    * when the regionserver is notified that there was a change in the on disk configs.
443    */
444   protected final ConfigurationManager configurationManager;
445 
446   /**
447    * Starts an HRegionServer at the default location.
448    * @param conf
449    * @throws IOException
450    * @throws InterruptedException
451    */
452   public HRegionServer(Configuration conf) throws IOException, InterruptedException {
453     this(conf, CoordinatedStateManagerFactory.getCoordinatedStateManager(conf));
454   }
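
  // Illustrative usage sketch (not part of the original source): a region server is normally
  // launched via this class's main() and HRegionServerCommandLine, but tests sometimes construct
  // and drive one directly, roughly:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   HRegionServer rs = new HRegionServer(conf);   // may throw IOException/InterruptedException
  //   rs.start();                 // HasThread.start(); run() then registers with the Master
  //   // ... exercise the server ...
  //   rs.stop("test finished");   // Stoppable.stop(String); the run() loop exits and cleans up
  //   rs.join();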
455 
456   /**
457    * Starts an HRegionServer at the default location.
458    * @param conf
459    * @param csm implementation of CoordinatedStateManager to be used
460    * @throws IOException
461    * @throws InterruptedException
462    */
463   public HRegionServer(Configuration conf, CoordinatedStateManager csm)
464       throws IOException, InterruptedException {
465     this.fsOk = true;
466     this.conf = conf;
467     checkCodecs(this.conf);
468     this.userProvider = UserProvider.instantiate(conf);
469     FSUtils.setupShortCircuitRead(this.conf);
470 
471     // Config'ed params
472     this.numRetries = this.conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
473         HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
474     this.threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
475     this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
476 
477     this.sleeper = new Sleeper(this.msgInterval, this);
478 
479     boolean isNoncesEnabled = conf.getBoolean(HConstants.HBASE_RS_NONCES_ENABLED, true);
480     this.nonceManager = isNoncesEnabled ? new ServerNonceManager(this.conf) : null;
481 
482     this.numRegionsToReport = conf.getInt(
483       "hbase.regionserver.numregionstoreport", 10);
484 
485     this.operationTimeout = conf.getInt(
486       HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY,
487       HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT);
488 
489     this.abortRequested = false;
490     this.stopped = false;
491 
492     rpcServices = createRpcServices();
493     this.startcode = System.currentTimeMillis();
494     String hostName = rpcServices.isa.getHostName();
495     serverName = ServerName.valueOf(hostName, rpcServices.isa.getPort(), startcode);
496 
497     // login the zookeeper client principal (if using security)
498     ZKUtil.loginClient(this.conf, "hbase.zookeeper.client.keytab.file",
499       "hbase.zookeeper.client.kerberos.principal", hostName);
500     // login the server principal (if using secure Hadoop)
501     login(userProvider, hostName);
502 
503     regionServerAccounting = new RegionServerAccounting();
504     uncaughtExceptionHandler = new UncaughtExceptionHandler() {
505       @Override
506       public void uncaughtException(Thread t, Throwable e) {
507         abort("Uncaught exception in service thread " + t.getName(), e);
508       }
509     };
510 
511     useZKForAssignment = ConfigUtil.useZKForAssignment(conf);
512 
513     // Set 'fs.defaultFS' to match the filesystem on hbase.rootdir else
514     // underlying hadoop hdfs accessors will be going against wrong filesystem
515     // (unless all is set to defaults).
516     FSUtils.setFsDefault(this.conf, FSUtils.getRootDir(this.conf));
517     // Get the fs instance used by this RS.  Is HBase checksum verification enabled? If so,
518     // automatically switch off hdfs checksum verification.
519     boolean useHBaseChecksum = conf.getBoolean(HConstants.HBASE_CHECKSUM_VERIFICATION, true);
520     this.fs = new HFileSystem(this.conf, useHBaseChecksum);
521     this.rootDir = FSUtils.getRootDir(this.conf);
522     this.tableDescriptors = new FSTableDescriptors(
523       this.conf, this.fs, this.rootDir, !canUpdateTableDescriptor(), false);
524 
525     service = new ExecutorService(getServerName().toShortString());
526     spanReceiverHost = SpanReceiverHost.getInstance(getConfiguration());
527 
528     // Some unit tests don't need a cluster, so no zookeeper at all
529     if (!conf.getBoolean("hbase.testing.nocluster", false)) {
530       // Open connection to zookeeper and set primary watcher
531       zooKeeper = new ZooKeeperWatcher(conf, getProcessName() + ":" +
532         rpcServices.isa.getPort(), this, canCreateBaseZNode());
533 
534       this.csm = (BaseCoordinatedStateManager) csm;
535       this.csm.initialize(this);
536       this.csm.start();
537 
538       tableLockManager = TableLockManager.createTableLockManager(
539         conf, zooKeeper, serverName);
540 
541       masterAddressTracker = new MasterAddressTracker(getZooKeeper(), this);
542       masterAddressTracker.start();
543 
544       clusterStatusTracker = new ClusterStatusTracker(zooKeeper, this);
545       clusterStatusTracker.start();
546     }
547     this.configurationManager = new ConfigurationManager();
548 
549     rpcServices.start();
550     putUpWebUI();
551     this.walRoller = new LogRoller(this, this);
552   }
553 
554   protected void login(UserProvider user, String host) throws IOException {
555     user.login("hbase.regionserver.keytab.file",
556       "hbase.regionserver.kerberos.principal", host);
557   }
558 
559   protected void waitForMasterActive(){
560   }
561 
562   protected String getProcessName() {
563     return REGIONSERVER;
564   }
565 
566   protected boolean canCreateBaseZNode() {
567     return false;
568   }
569 
570   protected boolean canUpdateTableDescriptor() {
571     return false;
572   }
573 
574   protected RSRpcServices createRpcServices() throws IOException {
575     return new RSRpcServices(this);
576   }
577 
578   protected void configureInfoServer() {
579     infoServer.addServlet("rs-status", "/rs-status", RSStatusServlet.class);
580     infoServer.setAttribute(REGIONSERVER, this);
581   }
582 
583   protected Class<? extends HttpServlet> getDumpServlet() {
584     return RSDumpServlet.class;
585   }
586 
587   protected void doMetrics() {
588   }
589 
590   @Override
591   public boolean registerService(Service instance) {
592     /*
593      * No stacking of instances is allowed for a single service name
594      */
595     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
596     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
597       LOG.error("Coprocessor service " + serviceDesc.getFullName()
598           + " already registered, rejecting request from " + instance);
599       return false;
600     }
601 
602     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
603     if (LOG.isDebugEnabled()) {
604       LOG.debug("Registered regionserver coprocessor service: service=" + serviceDesc.getFullName());
605     }
606     return true;
607   }
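
  // Illustrative (not from the original source): a region-server coprocessor exposing a protobuf
  // Service would typically register it through this method from its startup hook. MyProtos,
  // MyService and MyServiceImpl are hypothetical names used only to show the call pattern:
  //
  //   Service service = MyProtos.MyService.newReflectiveService(new MyServiceImpl());
  //   if (!regionServerServices.registerService(service)) {
  //     LOG.warn("Already registered: " + service.getDescriptorForType().getFullName());
  //   }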
608 
609   /**
610    * Create a 'smarter' HConnection, one that is capable of bypassing RPC if the request is to
611    * the local server.  Safe to use for both local and remote invocations.
612    * Creating this instance in a method makes it possible to intercept and mock it in tests.
613    * @throws IOException
614    */
615   @VisibleForTesting
616   protected ClusterConnection createClusterConnection() throws IOException {
617     // Create a cluster connection that when appropriate, can short-circuit and go directly to the
618     // local server if the request is to the local server bypassing RPC. Can be used for both local
619     // and remote invocations.
620     return ConnectionUtils.createShortCircuitHConnection(
621       ConnectionFactory.createConnection(conf), serverName, rpcServices, rpcServices);
622   }
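
  // Illustrative sketch (not from the original source): because the connection is created through
  // this factory method, a test can swap in a stub by subclassing. MockClusterConnection is a
  // hypothetical test helper, not a real HBase class:
  //
  //   class StubbedRegionServer extends HRegionServer {
  //     StubbedRegionServer(Configuration conf) throws IOException, InterruptedException {
  //       super(conf);
  //     }
  //     @Override
  //     protected ClusterConnection createClusterConnection() {
  //       return new MockClusterConnection();
  //     }
  //   }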
623 
624   /**
625    * Run a test on each configured codec to make sure its supporting libs are in place.
626    * @param c
627    * @throws IOException
628    */
629   private static void checkCodecs(final Configuration c) throws IOException {
630     // check to see if the codec list is available:
631     String [] codecs = c.getStrings("hbase.regionserver.codecs", (String[])null);
632     if (codecs == null) return;
633     for (String codec : codecs) {
634       if (!CompressionTest.testCompression(codec)) {
635         throw new IOException("Compression codec " + codec +
636           " not supported, aborting RS construction");
637       }
638     }
639   }
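
  // Illustrative (not part of the original source): the check above is driven by the
  // "hbase.regionserver.codecs" property read a few lines up. A site that requires snappy and gz
  // could fail fast at construction time with something like (values are examples, not defaults):
  //
  //   Configuration c = HBaseConfiguration.create();
  //   c.setStrings("hbase.regionserver.codecs", "snappy", "gz");
  //   checkCodecs(c);   // throws IOException if either codec fails CompressionTest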
640 
641   public String getClusterId() {
642     return this.clusterId;
643   }
644 
645   /**
646    * Setup our cluster connection if not already initialized.
647    * @throws IOException
648    */
649   protected synchronized void setupClusterConnection() throws IOException {
650     if (clusterConnection == null) {
651       clusterConnection = createClusterConnection();
652       metaTableLocator = new MetaTableLocator();
653     }
654   }
655 
656   /**
657    * All initialization needed before we go register with Master.
658    *
659    * @throws IOException
660    * @throws InterruptedException
661    */
662   private void preRegistrationInitialization(){
663     try {
664       setupClusterConnection();
665 
666       // Health checker thread.
667       if (isHealthCheckerConfigured()) {
668         int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
669           HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
670         healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
671       }
672       this.pauseMonitor = new JvmPauseMonitor(conf);
673       pauseMonitor.start();
674 
675       initializeZooKeeper();
676       if (!isStopped() && !isAborted()) {
677         initializeThreads();
678       }
679     } catch (Throwable t) {
680       // Call stop if error or process will stick around for ever since server
681       // puts up non-daemon threads.
682       this.rpcServices.stop();
683       abort("Initialization of RS failed.  Hence aborting RS.", t);
684     }
685   }
686 
687   /**
688    * Bring up the connection to the zk ensemble, then wait until a master is available for this
689    * cluster, and after that wait until the cluster 'up' flag has been set.
690    * This is the order in which the master does things.
691    * Finally, open the long-living server short-circuit connection.
692    * @throws IOException
693    * @throws InterruptedException
694    */
695   private void initializeZooKeeper() throws IOException, InterruptedException {
696     // Create the master address tracker, register with zk, and start it.  Then
697     // block until a master is available.  No point in starting up if no master
698     // running.
699     blockAndCheckIfStopped(this.masterAddressTracker);
700 
701     // Wait on cluster being up.  Master will set this flag up in zookeeper
702     // when ready.
703     blockAndCheckIfStopped(this.clusterStatusTracker);
704 
705     // Retrieve clusterId
706     // Since cluster status is now up
707     // ID should have already been set by HMaster
708     try {
709       clusterId = ZKClusterId.readClusterIdZNode(this.zooKeeper);
710       if (clusterId == null) {
711         this.abort("Cluster ID has not been set");
712       }
713       LOG.info("ClusterId : "+clusterId);
714     } catch (KeeperException e) {
715       this.abort("Failed to retrieve Cluster ID",e);
716     }
717 
718     // In case colocated master, wait here till it's active.
719     // So backup masters won't start as regionservers.
720     // This is to avoid showing backup masters as regionservers
721     // in master web UI, or assigning any region to them.
722     waitForMasterActive();
723     if (isStopped() || isAborted()) {
724       return; // No need for further initialization
725     }
726 
727     // watch for snapshots and other procedures
728     try {
729       rspmHost = new RegionServerProcedureManagerHost();
730       rspmHost.loadProcedures(conf);
731       rspmHost.initialize(this);
732     } catch (KeeperException e) {
733       this.abort("Failed to reach zk cluster when creating procedure handler.", e);
734     }
735     // register watcher for recovering regions
736     this.recoveringRegionWatcher = new RecoveringRegionWatcher(this.zooKeeper, this);
737   }
738 
739   /**
740    * Utility method to wait indefinitely for a znode to become available while checking
741    * whether the region server has been shut down.
742    * @param tracker znode tracker to use
743    * @throws IOException any IO exception, plus if the RS is stopped
744    * @throws InterruptedException
745    */
746   private void blockAndCheckIfStopped(ZooKeeperNodeTracker tracker)
747       throws IOException, InterruptedException {
748     while (tracker.blockUntilAvailable(this.msgInterval, false) == null) {
749       if (this.stopped) {
750         throw new IOException("Received the shutdown message while waiting.");
751       }
752     }
753   }
754 
755   /**
756    * @return False if cluster shutdown in progress
757    */
758   private boolean isClusterUp() {
759     return clusterStatusTracker != null && clusterStatusTracker.isClusterUp();
760   }
761 
762   private void initializeThreads() throws IOException {
763     // Cache flushing thread.
764     this.cacheFlusher = new MemStoreFlusher(conf, this);
765 
766     // Compaction thread
767     this.compactSplitThread = new CompactSplitThread(this);
768 
769     // Background thread to check for compactions; needed if region has not gotten updates
770     // in a while. It will take care of not checking too frequently, on a store-by-store basis.
771     this.compactionChecker = new CompactionChecker(this, this.threadWakeFrequency, this);
772     this.periodicFlusher = new PeriodicMemstoreFlusher(this.threadWakeFrequency, this);
773     this.leases = new Leases(this.threadWakeFrequency);
774 
775     // Create the thread to clean the moved regions list
776     movedRegionsCleaner = MovedRegionsCleaner.createAndStart(this);
777 
778     if (this.nonceManager != null) {
779       // Create the chore that cleans up nonces.
780       nonceManagerChore = this.nonceManager.createCleanupChore(this);
781     }
782 
783     // Setup RPC client for master communication
784     rpcClient = RpcClientFactory.createClient(conf, clusterId, new InetSocketAddress(
785         rpcServices.isa.getAddress(), 0));
786 
787     int storefileRefreshPeriod = conf.getInt(
788         StorefileRefresherChore.REGIONSERVER_STOREFILE_REFRESH_PERIOD
789       , StorefileRefresherChore.DEFAULT_REGIONSERVER_STOREFILE_REFRESH_PERIOD);
790     if (storefileRefreshPeriod > 0) {
791       this.storefileRefresher = new StorefileRefresherChore(storefileRefreshPeriod, this, this);
792     }
793     registerConfigurationObservers();
794   }
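
  // Illustrative (not from the original source): the storefile refresher above stays disabled
  // unless its period is configured to a positive value, e.g. in a test:
  //
  //   conf.setInt(StorefileRefresherChore.REGIONSERVER_STOREFILE_REFRESH_PERIOD, 30 * 1000);
  //   // 30s is only an example value; the chore is created above only when the period is > 0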
795 
796   private void registerConfigurationObservers() {
797     // Registering the compactSplitThread object with the ConfigurationManager.
798     configurationManager.registerObserver(this.compactSplitThread);
799   }
800 
801   /**
802    * The HRegionServer sticks in this loop until closed.
803    */
804   @Override
805   public void run() {
806     try {
807       // Do pre-registration initializations; zookeeper, lease threads, etc.
808       preRegistrationInitialization();
809     } catch (Throwable e) {
810       abort("Fatal exception during initialization", e);
811     }
812 
813     try {
814       if (!isStopped() && !isAborted()) {
815         ShutdownHook.install(conf, fs, this, Thread.currentThread());
816         // Set our ephemeral znode up in zookeeper now we have a name.
817         createMyEphemeralNode();
818         // Initialize the RegionServerCoprocessorHost now that our ephemeral
819         // node was created, in case any coprocessors want to use ZooKeeper
820         this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
821       }
822 
823       // Try and register with the Master; tell it we are here.  Break if the
824       // server is stopped, the cluster-up flag is down, or hdfs went wacky.
825       while (keepLooping()) {
826         RegionServerStartupResponse w = reportForDuty();
827         if (w == null) {
828           LOG.warn("reportForDuty failed; sleeping and then retrying.");
829           this.sleeper.sleep();
830         } else {
831           handleReportForDutyResponse(w);
832           break;
833         }
834       }
835 
836       if (!isStopped() && isHealthy()){
837         // start the snapshot handler and other procedure handlers,
838         // since the server is ready to run
839         rspmHost.start();
840       }
841 
842       // We registered with the Master.  Go into run mode.
843       long lastMsg = System.currentTimeMillis();
844       long oldRequestCount = -1;
845       // The main run loop.
846       while (!isStopped() && isHealthy()) {
847         if (!isClusterUp()) {
848           if (isOnlineRegionsEmpty()) {
849             stop("Exiting; cluster shutdown set and not carrying any regions");
850           } else if (!this.stopping) {
851             this.stopping = true;
852             LOG.info("Closing user regions");
853             closeUserRegions(this.abortRequested);
854           } else if (this.stopping) {
855             boolean allUserRegionsOffline = areAllUserRegionsOffline();
856             if (allUserRegionsOffline) {
857               // Set stopped if no more write requests to meta tables
858               // since last time we went around the loop.  Any open
859               // meta regions will be closed on our way out.
860               if (oldRequestCount == getWriteRequestCount()) {
861                 stop("Stopped; only catalog regions remaining online");
862                 break;
863               }
864               oldRequestCount = getWriteRequestCount();
865             } else {
866               // Make sure all regions have been closed -- some regions may
867               // have not got it because we were splitting at the time of
868               // the call to closeUserRegions.
869               closeUserRegions(this.abortRequested);
870             }
871             LOG.debug("Waiting on " + getOnlineRegionsAsPrintableString());
872           }
873         }
874         long now = System.currentTimeMillis();
875         if ((now - lastMsg) >= msgInterval) {
876           tryRegionServerReport(lastMsg, now);
877           lastMsg = System.currentTimeMillis();
878           doMetrics();
879         }
880         if (!isStopped() && !isAborted()) {
881           this.sleeper.sleep();
882         }
883       } // while
884     } catch (Throwable t) {
885       if (!rpcServices.checkOOME(t)) {
886         String prefix = t instanceof YouAreDeadException? "": "Unhandled: ";
887         abort(prefix + t.getMessage(), t);
888       }
889     }
890     // Run shutdown.
891     if (mxBean != null) {
892       MBeanUtil.unregisterMBean(mxBean);
893       mxBean = null;
894     }
895     if (this.leases != null) this.leases.closeAfterLeasesExpire();
896     if (this.splitLogWorker != null) {
897       splitLogWorker.stop();
898     }
899     if (this.infoServer != null) {
900       LOG.info("Stopping infoServer");
901       try {
902         this.infoServer.stop();
903       } catch (Exception e) {
904         LOG.error("Failed to stop infoServer", e);
905       }
906     }
907     // Send cache a shutdown.
908     if (cacheConfig != null && cacheConfig.isBlockCacheEnabled()) {
909       cacheConfig.getBlockCache().shutdown();
910     }
911 
912     if (movedRegionsCleaner != null) {
913       movedRegionsCleaner.stop("Region Server stopping");
914     }
915 
916     // Send interrupts to wake up threads if sleeping so they notice shutdown.
917     // TODO: Should we check they are alive? If OOME could have exited already
918     if(this.hMemManager != null) this.hMemManager.stop();
919     if (this.cacheFlusher != null) this.cacheFlusher.interruptIfNecessary();
920     if (this.compactSplitThread != null) this.compactSplitThread.interruptIfNecessary();
921     if (this.compactionChecker != null)
922       this.compactionChecker.interrupt();
923     if (this.healthCheckChore != null) {
924       this.healthCheckChore.interrupt();
925     }
926     if (this.nonceManagerChore != null) {
927       this.nonceManagerChore.interrupt();
928     }
929     if (this.storefileRefresher != null) {
930       this.storefileRefresher.interrupt();
931     }
932 
933     // Stop the snapshot and other procedure handlers, forcefully killing all running tasks
934     if (rspmHost != null) {
935       rspmHost.stop(this.abortRequested || this.killed);
936     }
937 
938     if (this.killed) {
939       // Just skip out w/o closing regions.  Used when testing.
940     } else if (abortRequested) {
941       if (this.fsOk) {
942         closeUserRegions(abortRequested); // Don't leave any open file handles
943       }
944       LOG.info("aborting server " + this.serverName);
945     } else {
946       closeUserRegions(abortRequested);
947       LOG.info("stopping server " + this.serverName);
948     }
949 
950     // so callers waiting for meta without timeout can stop
951     if (this.metaTableLocator != null) this.metaTableLocator.stop();
952     if (this.clusterConnection != null && !clusterConnection.isClosed()) {
953       try {
954         this.clusterConnection.close();
955       } catch (IOException e) {
956         // Although the {@link Closeable} interface throws an {@link
957         // IOException}, in reality, the implementation would never do that.
958         LOG.warn("Attempt to close server's short circuit HConnection failed.", e);
959       }
960     }
961 
962     // Closing the compactSplit thread before closing meta regions
963     if (!this.killed && containsMetaTableRegions()) {
964       if (!abortRequested || this.fsOk) {
965         if (this.compactSplitThread != null) {
966           this.compactSplitThread.join();
967           this.compactSplitThread = null;
968         }
969         closeMetaTableRegions(abortRequested);
970       }
971     }
972 
973     if (!this.killed && this.fsOk) {
974       waitOnAllRegionsToClose(abortRequested);
975       LOG.info("stopping server " + this.serverName +
976         "; all regions closed.");
977     }
978 
979     //fsOk flag may be changed when closing regions throws exception.
980     if (this.fsOk) {
981       shutdownWAL(!abortRequested);
982     }
983 
984     // Make sure the proxy is down.
985     if (this.rssStub != null) {
986       this.rssStub = null;
987     }
988     if (this.rpcClient != null) {
989       this.rpcClient.close();
990     }
991     if (this.leases != null) {
992       this.leases.close();
993     }
994     if (this.pauseMonitor != null) {
995       this.pauseMonitor.stop();
996     }
997 
998     if (!killed) {
999       stopServiceThreads();
1000     }
1001 
1002     if (this.rpcServices != null) {
1003       this.rpcServices.stop();
1004     }
1005 
1006     try {
1007       deleteMyEphemeralNode();
1008     } catch (KeeperException.NoNodeException nn) {
1009     } catch (KeeperException e) {
1010       LOG.warn("Failed deleting my ephemeral node", e);
1011     }
1012     // We may have failed to delete the znode at the previous step, but
1013     //  we delete the file anyway: a second attempt to delete the znode is likely to fail again.
1014     ZNodeClearer.deleteMyEphemeralNodeOnDisk();
1015 
1016     if (this.zooKeeper != null) {
1017       this.zooKeeper.close();
1018     }
1019     LOG.info("stopping server " + this.serverName +
1020       "; zookeeper connection closed.");
1021 
1022     LOG.info(Thread.currentThread().getName() + " exiting");
1023   }
1024 
1025   private boolean containsMetaTableRegions() {
1026     return onlineRegions.containsKey(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
1027   }
1028 
1029   private boolean areAllUserRegionsOffline() {
1030     if (getNumberOfOnlineRegions() > 2) return false;
1031     boolean allUserRegionsOffline = true;
1032     for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
1033       if (!e.getValue().getRegionInfo().isMetaTable()) {
1034         allUserRegionsOffline = false;
1035         break;
1036       }
1037     }
1038     return allUserRegionsOffline;
1039   }
1040 
1041   /**
1042    * @return Current write count for all online regions.
1043    */
1044   private long getWriteRequestCount() {
1045     long writeCount = 0;
1046     for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
1047       writeCount += e.getValue().getWriteRequestsCount();
1048     }
1049     return writeCount;
1050   }
1051 
1052   @VisibleForTesting
1053   protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
1054   throws IOException {
1055     RegionServerStatusService.BlockingInterface rss = rssStub;
1056     if (rss == null) {
1057       // the current server could be stopping.
1058       return;
1059     }
1060     ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
1061     try {
1062       RegionServerReportRequest.Builder request = RegionServerReportRequest.newBuilder();
1063       ServerName sn = ServerName.parseVersionedServerName(
1064         this.serverName.getVersionedBytes());
1065       request.setServer(ProtobufUtil.toServerName(sn));
1066       request.setLoad(sl);
1067       rss.regionServerReport(null, request.build());
1068     } catch (ServiceException se) {
1069       IOException ioe = ProtobufUtil.getRemoteException(se);
1070       if (ioe instanceof YouAreDeadException) {
1071         // This will be caught and handled as a fatal error in run()
1072         throw ioe;
1073       }
1074       if (rssStub == rss) {
1075         rssStub = null;
1076       }
1077       // Couldn't connect to the master, get location from zk and reconnect
1078       // Method blocks until new master is found or we are stopped
1079       createRegionServerStatusStub();
1080     }
1081   }
1082 
1083   ClusterStatusProtos.ServerLoad buildServerLoad(long reportStartTime, long reportEndTime)
1084       throws IOException {
1085     // We're getting the MetricsRegionServerWrapper here because the wrapper computes requests
1086     // per second, and other metrics.  As long as metrics are part of ServerLoad it's best to use
1087     // the wrapper to compute those numbers in one place.
1088     // In the long term most of these should be moved off of ServerLoad and the heart beat.
1089     // Instead they should be stored in an HBase table so that external visibility into HBase is
1090     // improved; Additionally the load balancer will be able to take advantage of a more complete
1091     // history.
1092     MetricsRegionServerWrapper regionServerWrapper = this.metricsRegionServer.getRegionServerWrapper();
1093     Collection<HRegion> regions = getOnlineRegionsLocalContext();
1094     MemoryUsage memory =
1095       ManagementFactory.getMemoryMXBean().getHeapMemoryUsage();
1096 
1097     ClusterStatusProtos.ServerLoad.Builder serverLoad =
1098       ClusterStatusProtos.ServerLoad.newBuilder();
1099     serverLoad.setNumberOfRequests((int) regionServerWrapper.getRequestsPerSecond());
1100     serverLoad.setTotalNumberOfRequests((int) regionServerWrapper.getTotalRequestCount());
1101     serverLoad.setUsedHeapMB((int)(memory.getUsed() / 1024 / 1024));
1102     serverLoad.setMaxHeapMB((int) (memory.getMax() / 1024 / 1024));
1103     Set<String> coprocessors = getWAL(null).getCoprocessorHost().getCoprocessors();
1104     for (String coprocessor : coprocessors) {
1105       serverLoad.addCoprocessors(
1106         Coprocessor.newBuilder().setName(coprocessor).build());
1107     }
1108     RegionLoad.Builder regionLoadBldr = RegionLoad.newBuilder();
1109     RegionSpecifier.Builder regionSpecifier = RegionSpecifier.newBuilder();
1110     for (HRegion region : regions) {
1111       serverLoad.addRegionLoads(createRegionLoad(region, regionLoadBldr, regionSpecifier));
1112       for (String coprocessor :
1113           getWAL(region.getRegionInfo()).getCoprocessorHost().getCoprocessors()) {
1114         serverLoad.addCoprocessors(Coprocessor.newBuilder().setName(coprocessor).build());
1115       }
1116     }
1117     serverLoad.setReportStartTime(reportStartTime);
1118     serverLoad.setReportEndTime(reportEndTime);
1119     if (this.infoServer != null) {
1120       serverLoad.setInfoServerPort(this.infoServer.getPort());
1121     } else {
1122       serverLoad.setInfoServerPort(-1);
1123     }
1124 
1125     // For the replication load: we only need to get it from one service;
1126     // either source or sink will report the same info.
1127     ReplicationSourceService rsources = getReplicationSourceService();
1128 
1129     if (rsources != null) {
1130       // always refresh first to get the latest value
1131       ReplicationLoad rLoad = rsources.refreshAndGetReplicationLoad();
1132       if (rLoad != null) {
1133         serverLoad.setReplLoadSink(rLoad.getReplicationLoadSink());
1134         for (ClusterStatusProtos.ReplicationLoadSource rLS : rLoad.getReplicationLoadSourceList()) {
1135           serverLoad.addReplLoadSource(rLS);
1136         }
1137       }
1138     }
1139 
1140     return serverLoad.build();
1141   }
1142 
1143   String getOnlineRegionsAsPrintableString() {
1144     StringBuilder sb = new StringBuilder();
1145     for (HRegion r: this.onlineRegions.values()) {
1146       if (sb.length() > 0) sb.append(", ");
1147       sb.append(r.getRegionInfo().getEncodedName());
1148     }
1149     return sb.toString();
1150   }
1151 
1152   /**
1153    * Wait on regions close.
1154    */
1155   private void waitOnAllRegionsToClose(final boolean abort) {
1156     // Wait till all regions are closed before going out.
1157     int lastCount = -1;
1158     long previousLogTime = 0;
1159     Set<String> closedRegions = new HashSet<String>();
1160     boolean interrupted = false;
1161     try {
1162       while (!isOnlineRegionsEmpty()) {
1163         int count = getNumberOfOnlineRegions();
1164         // Only print a message if the count of regions has changed.
1165         if (count != lastCount) {
1166           // Log every second at most
1167           if (System.currentTimeMillis() > (previousLogTime + 1000)) {
1168             previousLogTime = System.currentTimeMillis();
1169             lastCount = count;
1170             LOG.info("Waiting on " + count + " regions to close");
1171             // Only print out the regions still closing if there are only a few,
1172             // else we will swamp the log.
1173             if (count < 10 && LOG.isDebugEnabled()) {
1174               LOG.debug(this.onlineRegions);
1175             }
1176           }
1177         }
1178         // Ensure all user regions have been sent a close. Use this to
1179         // protect against the case where an open comes in after we start the
1180         // iterator of onlineRegions to close all user regions.
1181         for (Map.Entry<String, HRegion> e : this.onlineRegions.entrySet()) {
1182           HRegionInfo hri = e.getValue().getRegionInfo();
1183           if (!this.regionsInTransitionInRS.containsKey(hri.getEncodedNameAsBytes())
1184               && !closedRegions.contains(hri.getEncodedName())) {
1185             closedRegions.add(hri.getEncodedName());
1186             // Don't update zk with this close transition; pass false.
1187             closeRegionIgnoreErrors(hri, abort);
1188           }
1189         }
1190         // No regions in RIT, we could stop waiting now.
1191         if (this.regionsInTransitionInRS.isEmpty()) {
1192           if (!isOnlineRegionsEmpty()) {
1193             LOG.info("We were exiting though online regions are not empty," +
1194                 " because some regions failed closing");
1195           }
1196           break;
1197         }
1198         if (sleep(200)) {
1199           interrupted = true;
1200         }
1201       }
1202     } finally {
1203       if (interrupted) {
1204         Thread.currentThread().interrupt();
1205       }
1206     }
1207   }
1208 
1209   private boolean sleep(long millis) {
1210     boolean interrupted = false;
1211     try {
1212       Thread.sleep(millis);
1213     } catch (InterruptedException e) {
1214       LOG.warn("Interrupted while sleeping");
1215       interrupted = true;
1216     }
1217     return interrupted;
1218   }
1219 
1220   private void shutdownWAL(final boolean close) {
1221     if (this.walFactory != null) {
1222       try {
1223         if (close) {
1224           walFactory.close();
1225         } else {
1226           walFactory.shutdown();
1227         }
1228       } catch (Throwable e) {
1229         e = RemoteExceptionHandler.checkThrowable(e);
1230         LOG.error("Shutdown / close of WAL failed: " + e);
1231         LOG.debug("Shutdown / close exception details:", e);
1232       }
1233     }
1234   }
1235 
1236   /*
1237    * Run init. Sets up wal and starts up all server threads.
1238    *
1239    * @param c Extra configuration.
1240    */
1241   protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
1242   throws IOException {
1243     try {
1244       for (NameStringPair e : c.getMapEntriesList()) {
1245         String key = e.getName();
1246         // The hostname the master sees us as.
1247         if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) {
1248           String hostnameFromMasterPOV = e.getValue();
1249           this.serverName = ServerName.valueOf(hostnameFromMasterPOV,
1250             rpcServices.isa.getPort(), this.startcode);
1251           if (!hostnameFromMasterPOV.equals(rpcServices.isa.getHostName())) {
1252             LOG.info("Master passed us a different hostname to use; was=" +
1253               rpcServices.isa.getHostName() + ", but now=" + hostnameFromMasterPOV);
1254           }
1255           continue;
1256         }
1257         String value = e.getValue();
1258         if (LOG.isDebugEnabled()) {
1259           LOG.debug("Config from master: " + key + "=" + value);
1260         }
1261         this.conf.set(key, value);
1262       }
1263 
1264       // hack! Maps DFSClient => RegionServer for logs.  HDFS made this
1265       // config param for task trackers, but we can piggyback off of it.
1266       if (this.conf.get("mapreduce.task.attempt.id") == null) {
1267         this.conf.set("mapreduce.task.attempt.id", "hb_rs_" +
1268           this.serverName.toString());
1269       }
1270 
1271       // Save it in a file; this will allow us to see if we crash.
1272       ZNodeClearer.writeMyEphemeralNodeOnDisk(getMyEphemeralNodePath());
1273 
1274       this.cacheConfig = new CacheConfig(conf);
1275       this.walFactory = setupWALAndReplication();
1276       // Init in here rather than in constructor after thread name has been set
1277       this.metricsRegionServer = new MetricsRegionServer(new MetricsRegionServerWrapperImpl(this));
1278 
1279       startServiceThreads();
1280       startHeapMemoryManager();
1281       LOG.info("Serving as " + this.serverName +
1282         ", RpcServer on " + rpcServices.isa +
1283         ", sessionid=0x" +
1284         Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
1285 
1286       // Wake up anyone waiting for this server to come online
1287       synchronized (online) {
1288         online.set(true);
1289         online.notifyAll();
1290       }
1291     } catch (Throwable e) {
1292       stop("Failed initialization");
1293       throw convertThrowableToIOE(cleanup(e, "Failed init"),
1294           "Region server startup failed");
1295     } finally {
1296       sleeper.skipSleepCycle();
1297     }
1298   }
1299 
1300   private void startHeapMemoryManager() {
1301     this.hMemManager = HeapMemoryManager.create(this.conf, this.cacheFlusher, this);
1302     if (this.hMemManager != null) {
1303       this.hMemManager.start();
1304     }
1305   }
1306 
1307   private void createMyEphemeralNode() throws KeeperException, IOException {
1308     RegionServerInfo.Builder rsInfo = RegionServerInfo.newBuilder();
1309     rsInfo.setInfoPort(infoServer != null ? infoServer.getPort() : -1);
1310     byte[] data = ProtobufUtil.prependPBMagic(rsInfo.build().toByteArray());
1311     ZKUtil.createEphemeralNodeAndWatch(this.zooKeeper,
1312       getMyEphemeralNodePath(), data);
1313   }
1314 
1315   private void deleteMyEphemeralNode() throws KeeperException {
1316     ZKUtil.deleteNode(this.zooKeeper, getMyEphemeralNodePath());
1317   }
1318 
1319   @Override
1320   public RegionServerAccounting getRegionServerAccounting() {
1321     return regionServerAccounting;
1322   }
1323 
1324   @Override
1325   public TableLockManager getTableLockManager() {
1326     return tableLockManager;
1327   }
1328 
1329   /*
1330    * @param r Region to get RegionLoad for.
1331    * @param regionLoadBldr the RegionLoad.Builder, can be null
1332    * @param regionSpecifier the RegionSpecifier.Builder, can be null
1333    * @return RegionLoad instance.
1334    */
1337   private RegionLoad createRegionLoad(final HRegion r, RegionLoad.Builder regionLoadBldr,
1338       RegionSpecifier.Builder regionSpecifier) {
1339     byte[] name = r.getRegionName();
1340     int stores = 0;
1341     int storefiles = 0;
1342     int storeUncompressedSizeMB = 0;
1343     int storefileSizeMB = 0;
1344     int memstoreSizeMB = (int) (r.memstoreSize.get() / 1024 / 1024);
1345     int storefileIndexSizeMB = 0;
1346     int rootIndexSizeKB = 0;
1347     int totalStaticIndexSizeKB = 0;
1348     int totalStaticBloomSizeKB = 0;
1349     long totalCompactingKVs = 0;
1350     long currentCompactedKVs = 0;
1351     synchronized (r.stores) {
1352       stores += r.stores.size();
1353       for (Store store : r.stores.values()) {
1354         storefiles += store.getStorefilesCount();
1355         storeUncompressedSizeMB += (int) (store.getStoreSizeUncompressed()
1356             / 1024 / 1024);
1357         storefileSizeMB += (int) (store.getStorefilesSize() / 1024 / 1024);
1358         storefileIndexSizeMB += (int) (store.getStorefilesIndexSize() / 1024 / 1024);
1359         CompactionProgress progress = store.getCompactionProgress();
1360         if (progress != null) {
1361           totalCompactingKVs += progress.totalCompactingKVs;
1362           currentCompactedKVs += progress.currentCompactedKVs;
1363         }
1364 
1365         rootIndexSizeKB +=
1366             (int) (store.getStorefilesIndexSize() / 1024);
1367 
1368         totalStaticIndexSizeKB +=
1369           (int) (store.getTotalStaticIndexSize() / 1024);
1370 
1371         totalStaticBloomSizeKB +=
1372           (int) (store.getTotalStaticBloomSize() / 1024);
1373       }
1374     }
1375     float dataLocality =
1376         r.getHDFSBlocksDistribution().getBlockLocalityIndex(serverName.getHostname());
1377     if (regionLoadBldr == null) {
1378       regionLoadBldr = RegionLoad.newBuilder();
1379     }
1380     if (regionSpecifier == null) {
1381       regionSpecifier = RegionSpecifier.newBuilder();
1382     }
1383     regionSpecifier.setType(RegionSpecifierType.REGION_NAME);
1384     regionSpecifier.setValue(ByteStringer.wrap(name));
1385     regionLoadBldr.setRegionSpecifier(regionSpecifier.build())
1386       .setStores(stores)
1387       .setStorefiles(storefiles)
1388       .setStoreUncompressedSizeMB(storeUncompressedSizeMB)
1389       .setStorefileSizeMB(storefileSizeMB)
1390       .setMemstoreSizeMB(memstoreSizeMB)
1391       .setStorefileIndexSizeMB(storefileIndexSizeMB)
1392       .setRootIndexSizeKB(rootIndexSizeKB)
1393       .setTotalStaticIndexSizeKB(totalStaticIndexSizeKB)
1394       .setTotalStaticBloomSizeKB(totalStaticBloomSizeKB)
1395       .setReadRequestsCount(r.readRequestsCount.get())
1396       .setWriteRequestsCount(r.writeRequestsCount.get())
1397       .setTotalCompactingKVs(totalCompactingKVs)
1398       .setCurrentCompactedKVs(currentCompactedKVs)
1399       .setCompleteSequenceId(r.lastFlushSeqId)
1400       .setDataLocality(dataLocality);
1401 
1402     return regionLoadBldr.build();
1403   }
1404 
1405   /**
1406    * @param encodedRegionName the encoded name of the region to report on
1407    * @return An instance of RegionLoad, or null if the region is not online.
1408    */
1409   public RegionLoad createRegionLoad(final String encodedRegionName) {
1410     HRegion r = this.onlineRegions.get(encodedRegionName);
1412     return r != null ? createRegionLoad(r, null, null) : null;
1413   }
1414 
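  // Illustrative usage sketch (not part of the original source): a caller of
  // createRegionLoad(String) above can look a region up by encoded name; a null result
  // simply means the region is no longer online on this server.
  //
  //   RegionLoad load = regionServer.createRegionLoad(hri.getEncodedName());
  //   if (load != null) {
  //     LOG.debug("memstore=" + load.getMemstoreSizeMB() + "MB, storefiles=" + load.getStorefiles());
  //   }
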
1415   /*
1416    * Inner class that runs periodically to check whether regions need compaction.
1417    */
1418   private static class CompactionChecker extends Chore {
1419     private final HRegionServer instance;
1420     private final int majorCompactPriority;
1421     private final static int DEFAULT_PRIORITY = Integer.MAX_VALUE;
1422     private long iteration = 0;
1423 
1424     CompactionChecker(final HRegionServer h, final int sleepTime,
1425         final Stoppable stopper) {
1426       super("CompactionChecker", sleepTime, h);
1427       this.instance = h;
1428       LOG.info(this.getName() + " runs every " + StringUtils.formatTime(sleepTime));
1429 
1430       /* MajorCompactPriority is configurable.
1431        * If not set, the compaction will use default priority.
1432        */
1433       this.majorCompactPriority = this.instance.conf.
1434         getInt("hbase.regionserver.compactionChecker.majorCompactPriority",
1435         DEFAULT_PRIORITY);
1436     }
1437 
1438     @Override
1439     protected void chore() {
1440       for (HRegion r : this.instance.onlineRegions.values()) {
1441         if (r == null) continue;
1443         for (Store s : r.getStores().values()) {
1444           try {
1445             long multiplier = s.getCompactionCheckMultiplier();
1446             assert multiplier > 0;
1447             if (iteration % multiplier != 0) continue;
1448             if (s.needsCompaction()) {
1449               // Queue a compaction. Will recognize if major is needed.
1450               this.instance.compactSplitThread.requestSystemCompaction(r, s, getName()
1451                   + " requests compaction");
1452             } else if (s.isMajorCompaction()) {
1453               if (majorCompactPriority == DEFAULT_PRIORITY
1454                   || majorCompactPriority > r.getCompactPriority()) {
1455                 this.instance.compactSplitThread.requestCompaction(r, s, getName()
1456                     + " requests major compaction; use default priority", null);
1457               } else {
1458                 this.instance.compactSplitThread.requestCompaction(r, s, getName()
1459                     + " requests major compaction; use configured priority",
1460                   this.majorCompactPriority, null);
1461               }
1462             }
1463           } catch (IOException e) {
1464             LOG.warn("Failed major compaction check on " + r, e);
1465           }
1466         }
1467       }
1468       iteration = (iteration == Long.MAX_VALUE) ? 0 : (iteration + 1);
1469     }
1470   }
1471 
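  // Worked example (illustrative): with a sleepTime of 10,000 ms, a store whose
  // getCompactionCheckMultiplier() returns 12 only passes the "iteration % multiplier"
  // guard above on every 12th wakeup, i.e. roughly once every two minutes, while a store
  // with a multiplier of 1 is examined on every run of the chore.
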
1472   static class PeriodicMemstoreFlusher extends Chore {
1473     final HRegionServer server;
1474     final static int RANGE_OF_DELAY = 20000; //millisec
1475     final static int MIN_DELAY_TIME = 3000; //millisec
1476     public PeriodicMemstoreFlusher(int cacheFlushInterval, final HRegionServer server) {
1477       super(server.getServerName() + "-MemstoreFlusherChore", cacheFlushInterval, server);
1478       this.server = server;
1479     }
1480 
1481     @Override
1482     protected void chore() {
1483       for (HRegion r : this.server.onlineRegions.values()) {
1484         if (r == null) continue;
1486         if (r.shouldFlush()) {
1487           FlushRequester requester = server.getFlushRequester();
1488           if (requester != null) {
1489             long randomDelay = RandomUtils.nextInt(RANGE_OF_DELAY) + MIN_DELAY_TIME;
1490             LOG.info(getName() + " requesting flush for region " + r.getRegionNameAsString() +
1491                 " after a delay of " + randomDelay + " ms");
1492             //Throttle the flushes by putting a delay. If we don't throttle, and there
1493             //is a balanced write-load on the regions in a table, we might end up
1494             //overwhelming the filesystem with too many flushes at once.
1495             requester.requestDelayedFlush(r, randomDelay);
1496           }
1497         }
1498       }
1499     }
1500   }
1501 
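  // Worked example (illustrative): RandomUtils.nextInt(RANGE_OF_DELAY) yields a value in
  // [0, 20000), so the randomDelay computed above always falls in [3000, 23000) ms.
  // Spreading delayed flush requests across that ~20 second window is what keeps a table
  // with an evenly balanced write load from flushing all of its regions at the same instant.
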
1502   /**
1503    * Report the status of the server. A server is online once all of its startup has
1504    * completed (setting up the filesystem, starting service threads, etc.). This
1505    * method is designed mostly to be useful in tests.
1506    *
1507    * @return true if online, false if not.
1508    */
1509   public boolean isOnline() {
1510     return online.get();
1511   }
1512 
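  // Illustrative test-side sketch (hypothetical snippet, not part of this class): a test
  // that needs a fully started server can poll isOnline()/isStopped() instead of sleeping
  // for a fixed time, e.g.
  //
  //   while (!rs.isOnline() && !rs.isStopped()) {
  //     Thread.sleep(100); // plus InterruptedException handling
  //   }
  //
  // waitForServerOnline() further down does the same thing using the online latch directly.
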
1513   /**
1514    * Set up the WAL and replication if enabled.
1515    * Replication setup is done here because it needs to be hooked up to the WAL.
1516    * @return A WAL instance.
1517    * @throws IOException
1518    */
1519   private WALFactory setupWALAndReplication() throws IOException {
1520     // TODO Replication make assumptions here based on the default filesystem impl
1521     final Path oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
1522     final String logName = DefaultWALProvider.getWALDirectoryName(this.serverName.toString());
1523 
1524     Path logdir = new Path(rootDir, logName);
1525     if (LOG.isDebugEnabled()) LOG.debug("logdir=" + logdir);
1526     if (this.fs.exists(logdir)) {
1527       throw new RegionServerRunningException("Region server has already " +
1528         "created WAL directory " + logdir + " for " + this.serverName.toString());
1529     }
1530 
1531     // Instantiate replication manager if replication enabled.  Pass it the
1532     // log directories.
1533     createNewReplicationInstance(conf, this, this.fs, logdir, oldLogDir);
1534 
1535     // listeners the wal factory will add to wals it creates.
1536     final List<WALActionsListener> listeners = new ArrayList<WALActionsListener>();
1537     listeners.add(new MetricsWAL());
1538     if (this.replicationSourceHandler != null &&
1539         this.replicationSourceHandler.getWALActionsListener() != null) {
1540       // Replication handler is an implementation of WALActionsListener.
1541       listeners.add(this.replicationSourceHandler.getWALActionsListener());
1542     }
1543 
1544     return new WALFactory(conf, listeners, serverName.toString());
1545   }
1546 
1547   /**
1548    * We lazily initialize the roller for the WAL that handles meta,
1549    * since we don't know if this regionserver will handle it. All calls to
1550    * this method return a reference to that same roller. As newly referenced
1551    * meta regions are brought online, they will be offered to the roller for maintenance.
1552    * As a part of that registration process, the roller will add itself as a
1553    * listener on the wal.
1554    */
1555   protected LogRoller ensureMetaWALRoller() {
1556     // Use a temporary roller so that the meta WAL roller is already running by
1557     // the time other threads can see a non-null reference to it.
1558     LogRoller roller = metawalRoller.get();
1559     if (null == roller) {
1560       LogRoller tmpLogRoller = new LogRoller(this, this);
1561       String n = Thread.currentThread().getName();
1562       Threads.setDaemonThreadRunning(tmpLogRoller.getThread(),
1563           n + "-MetaLogRoller", uncaughtExceptionHandler);
1564       if (metawalRoller.compareAndSet(null, tmpLogRoller)) {
1565         roller = tmpLogRoller;
1566       } else {
1567         // Another thread won starting the roller
1568         Threads.shutdown(tmpLogRoller.getThread());
1569         roller = metawalRoller.get();
1570       }
1571     }
1572     return roller;
1573   }
1574 
1575   public MetricsRegionServer getRegionServerMetrics() {
1576     return this.metricsRegionServer;
1577   }
1578 
1579   /**
1580    * @return Master address tracker instance.
1581    */
1582   public MasterAddressTracker getMasterAddressTracker() {
1583     return this.masterAddressTracker;
1584   }
1585 
1586   /*
1587    * Start maintenance threads, Server, Worker and lease checker threads.
1588    * Install an UncaughtExceptionHandler that calls abort on the RegionServer if we
1589    * get an unhandled exception. We cannot set the handler on all threads;
1590    * the Server's internal Listener thread is off limits. For Server, if it hits an
1591    * OOME, it waits a while then retries. Meantime, a flush or a compaction that
1592    * tries to run should trigger the same critical condition and the shutdown will
1593    * run. On its way out, this server will shut down Server. Leases is somewhere in
1594    * between: it runs an internal thread that, while it inherits from Chore, keeps
1595    * its own stop mechanism and so must be stopped by this hosting server. Worker
1596    * logs the exception and exits.
1597    */
1598   private void startServiceThreads() throws IOException {
1599     // Start executor services
1600     this.service.startExecutorService(ExecutorType.RS_OPEN_REGION,
1601       conf.getInt("hbase.regionserver.executor.openregion.threads", 3));
1602     this.service.startExecutorService(ExecutorType.RS_OPEN_META,
1603       conf.getInt("hbase.regionserver.executor.openmeta.threads", 1));
1604     this.service.startExecutorService(ExecutorType.RS_CLOSE_REGION,
1605       conf.getInt("hbase.regionserver.executor.closeregion.threads", 3));
1606     this.service.startExecutorService(ExecutorType.RS_CLOSE_META,
1607       conf.getInt("hbase.regionserver.executor.closemeta.threads", 1));
1608     if (conf.getBoolean(StoreScanner.STORESCANNER_PARALLEL_SEEK_ENABLE, false)) {
1609       this.service.startExecutorService(ExecutorType.RS_PARALLEL_SEEK,
1610         conf.getInt("hbase.storescanner.parallel.seek.threads", 10));
1611     }
1612     this.service.startExecutorService(ExecutorType.RS_LOG_REPLAY_OPS, conf.getInt(
1613        "hbase.regionserver.wal.max.splitters", SplitLogWorkerCoordination.DEFAULT_MAX_SPLITTERS));
1614 
1615     Threads.setDaemonThreadRunning(this.walRoller.getThread(), getName() + ".logRoller",
1616         uncaughtExceptionHandler);
1617     this.cacheFlusher.start(uncaughtExceptionHandler);
1618     Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), getName() +
1619       ".compactionChecker", uncaughtExceptionHandler);
1620     Threads.setDaemonThreadRunning(this.periodicFlusher.getThread(), getName() +
1621         ".periodicFlusher", uncaughtExceptionHandler);
1622     if (this.healthCheckChore != null) {
1623       Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), getName() + ".healthChecker",
1624             uncaughtExceptionHandler);
1625     }
1626     if (this.nonceManagerChore != null) {
1627       Threads.setDaemonThreadRunning(this.nonceManagerChore.getThread(), getName() + ".nonceCleaner",
1628             uncaughtExceptionHandler);
1629     }
1630     if (this.storefileRefresher != null) {
1631       Threads.setDaemonThreadRunning(this.storefileRefresher.getThread(), getName() + ".storefileRefresher",
1632             uncaughtExceptionHandler);
1633     }
1634 
1635     // Leases is not a Thread. Internally it runs a daemon thread. If it gets
1636     // an unhandled exception, it will just exit.
1637     this.leases.setName(getName() + ".leaseChecker");
1638     this.leases.start();
1639 
1640     if (this.replicationSourceHandler == this.replicationSinkHandler &&
1641         this.replicationSourceHandler != null) {
1642       this.replicationSourceHandler.startReplicationService();
1643     } else {
1644       if (this.replicationSourceHandler != null) {
1645         this.replicationSourceHandler.startReplicationService();
1646       }
1647       if (this.replicationSinkHandler != null) {
1648         this.replicationSinkHandler.startReplicationService();
1649       }
1650     }
1651 
1652     // Create the log splitting worker and start it
1653     // Use a smaller retry count to fail fast; otherwise the splitlogworker could be blocked
1654     // for quite a while inside the HConnection layer. The worker won't be available for other
1655     // tasks even after the current task is preempted when a split task times out.
1656     Configuration sinkConf = HBaseConfiguration.create(conf);
1657     sinkConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
1658       conf.getInt("hbase.log.replay.retries.number", 8)); // 8 retries take about 23 seconds
1659     sinkConf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY,
1660       conf.getInt("hbase.log.replay.rpc.timeout", 30000)); // default 30 seconds
1661     sinkConf.setInt("hbase.client.serverside.retries.multiplier", 1);
1662     this.splitLogWorker = new SplitLogWorker(this, sinkConf, this, this, walFactory);
1663     splitLogWorker.start();
1664   }
1665 
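  // Configuration sketch (illustrative): the executor pool sizes read above are plain int
  // properties, so a deployment that opens or closes many regions at once could raise them
  // in hbase-site.xml or programmatically before the server is constructed, e.g.
  //
  //   conf.setInt("hbase.regionserver.executor.openregion.threads", 6);
  //   conf.setInt("hbase.regionserver.executor.closeregion.threads", 6);
  //
  // The literals compiled in above (3, 1, 3, 1, ...) are only the defaults used when the
  // keys are absent from the configuration.
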
1666   /**
1667    * Puts up the webui.
1668    * @return Returns final port -- maybe different from what we started with.
1669    * @throws IOException
1670    */
1671   private int putUpWebUI() throws IOException {
1672     int port = this.conf.getInt(HConstants.REGIONSERVER_INFO_PORT,
1673       HConstants.DEFAULT_REGIONSERVER_INFOPORT);
1674     // -1 is for disabling info server
1675     if (port < 0) return port;
1676     String addr = this.conf.get("hbase.regionserver.info.bindAddress", "0.0.0.0");
1677     if (!Addressing.isLocalAddress(InetAddress.getByName(addr))) {
1678       String msg =
1679           "Failed to start http info server. Address " + addr
1680               + " does not belong to this host. Correct configuration parameter: "
1681               + "hbase.regionserver.info.bindAddress";
1682       LOG.error(msg);
1683       throw new IOException(msg);
1684     }
1685     // check if auto port bind enabled
1686     boolean auto = this.conf.getBoolean(HConstants.REGIONSERVER_INFO_PORT_AUTO,
1687         false);
1688     while (true) {
1689       try {
1690         this.infoServer = new InfoServer(getProcessName(), addr, port, false, this.conf);
1691         infoServer.addServlet("dump", "/dump", getDumpServlet());
1692         configureInfoServer();
1693         this.infoServer.start();
1694         break;
1695       } catch (BindException e) {
1696         if (!auto) {
1697           // auto bind disabled; rethrow the BindException
1698           LOG.error("Failed binding http info server to port: " + port);
1699           throw e;
1700         }
1701         // auto bind enabled, try to use another port
1702         LOG.info("Failed binding http info server to port: " + port);
1703         port++;
1704       }
1705     }
1706     port = this.infoServer.getPort();
1707     conf.setInt(HConstants.REGIONSERVER_INFO_PORT, port);
1708     int masterInfoPort = conf.getInt(HConstants.MASTER_INFO_PORT,
1709       HConstants.DEFAULT_MASTER_INFOPORT);
1710     conf.setInt("hbase.master.info.port.orig", masterInfoPort);
1711     conf.setInt(HConstants.MASTER_INFO_PORT, port);
1712     return port;
1713   }
1714 
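  // Configuration sketch (illustrative): to run several region servers on one host (e.g. in
  // tests) the info server can be allowed to walk forward from the configured port instead
  // of failing on a BindException:
  //
  //   conf.setInt(HConstants.REGIONSERVER_INFO_PORT, 16030);          // starting port, example value
  //   conf.setBoolean(HConstants.REGIONSERVER_INFO_PORT_AUTO, true);  // keep trying port+1, port+2, ...
  //
  // Setting REGIONSERVER_INFO_PORT to -1 skips the web UI entirely, as handled at the top
  // of putUpWebUI().
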
1715   /*
1716    * Verify that server is healthy
1717    */
1718   private boolean isHealthy() {
1719     if (!fsOk) {
1720       // File system problem
1721       return false;
1722     }
1723     // Verify that all threads are alive
1724     if (!(leases.isAlive()
1725         && cacheFlusher.isAlive() && walRoller.isAlive()
1726         && this.compactionChecker.isAlive()
1727         && this.periodicFlusher.isAlive())) {
1728       stop("One or more threads are no longer alive -- stop");
1729       return false;
1730     }
1731     final LogRoller metawalRoller = this.metawalRoller.get();
1732     if (metawalRoller != null && !metawalRoller.isAlive()) {
1733       stop("Meta WAL roller thread is no longer alive -- stop");
1734       return false;
1735     }
1736     return true;
1737   }
1738 
1739   private static final byte[] UNSPECIFIED_REGION = new byte[]{};
1740 
1741   @Override
1742   public WAL getWAL(HRegionInfo regionInfo) throws IOException {
1743     WAL wal;
1744     LogRoller roller = walRoller;
1745     // The hbase:meta region uses a separate WAL with its own lazily created roller.
1746     if (regionInfo != null && regionInfo.isMetaTable()) {
1747       roller = ensureMetaWALRoller();
1748       wal = walFactory.getMetaWAL(regionInfo.getEncodedNameAsBytes());
1749     } else if (regionInfo == null) {
1750       wal = walFactory.getWAL(UNSPECIFIED_REGION);
1751     } else {
1752       wal = walFactory.getWAL(regionInfo.getEncodedNameAsBytes());
1753     }
1754     roller.addWAL(wal);
1755     return wal;
1756   }
1757 
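  // Illustrative note on getWAL(HRegionInfo) above: passing a null HRegionInfo returns the
  // common WAL keyed by UNSPECIFIED_REGION (getRegionServerCoprocessors() further down does
  // exactly that), while a meta region is routed to the dedicated meta WAL and to the roller
  // lazily created by ensureMetaWALRoller().
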
1758   @Override
1759   public ClusterConnection getConnection() {
1760     return this.clusterConnection;
1761   }
1762 
1763   @Override
1764   public MetaTableLocator getMetaTableLocator() {
1765     return this.metaTableLocator;
1766   }
1767 
1768   @Override
1769   public void stop(final String msg) {
1770     if (!this.stopped) {
1771       try {
1772         if (this.rsHost != null) {
1773           this.rsHost.preStop(msg);
1774         }
1775         this.stopped = true;
1776         LOG.info("STOPPED: " + msg);
1777         // Wakes run() if it is sleeping
1778         sleeper.skipSleepCycle();
1779       } catch (IOException exp) {
1780         LOG.warn("The region server did not stop", exp);
1781       }
1782     }
1783   }
1784 
1785   public void waitForServerOnline(){
1786     while (!isStopped() && !isOnline()) {
1787       synchronized (online) {
1788         try {
1789           online.wait(msgInterval);
1790         } catch (InterruptedException ie) {
1791           Thread.currentThread().interrupt();
1792           break;
1793         }
1794       }
1795     }
1796   }
1797 
1798   @Override
1799   public void postOpenDeployTasks(final HRegion r)
1800   throws KeeperException, IOException {
1801     rpcServices.checkOpen();
1802     LOG.info("Post open deploy tasks for " + r.getRegionNameAsString());
1803     // Do checks to see if we need to compact (references or too many files)
1804     for (Store s : r.getStores().values()) {
1805       if (s.hasReferences() || s.needsCompaction()) {
1806        this.compactSplitThread.requestSystemCompaction(r, s, "Opening Region");
1807       }
1808     }
1809     long openSeqNum = r.getOpenSeqNum();
1810     if (openSeqNum == HConstants.NO_SEQNUM) {
1811       // If we opened a region, we should have read some sequence number from it.
1812       LOG.error("No sequence number found when opening " + r.getRegionNameAsString());
1813       openSeqNum = 0;
1814     }
1815 
1816     // Update flushed sequence id of a recovering region in ZK
1817     updateRecoveringRegionLastFlushedSequenceId(r);
1818 
1819     // Update ZK, or META
1820     if (r.getRegionInfo().isMetaRegion()) {
1821       MetaTableLocator.setMetaLocation(getZooKeeper(), serverName, State.OPEN);
1822     } else if (useZKForAssignment) {
1823       MetaTableAccessor.updateRegionLocation(getConnection(), r.getRegionInfo(),
1824         this.serverName, openSeqNum);
1825     }
1826     if (!useZKForAssignment && !reportRegionStateTransition(
1827         TransitionCode.OPENED, openSeqNum, r.getRegionInfo())) {
1828       throw new IOException("Failed to report opened region to master: "
1829         + r.getRegionNameAsString());
1830     }
1831 
1832     LOG.debug("Finished post open deploy task for " + r.getRegionNameAsString());
1833   }
1834 
1835   @Override
1836   public boolean reportRegionStateTransition(TransitionCode code, HRegionInfo... hris) {
1837     return reportRegionStateTransition(code, HConstants.NO_SEQNUM, hris);
1838   }
1839 
1840   @Override
1841   public boolean reportRegionStateTransition(
1842       TransitionCode code, long openSeqNum, HRegionInfo... hris) {
1843     ReportRegionStateTransitionRequest.Builder builder =
1844       ReportRegionStateTransitionRequest.newBuilder();
1845     builder.setServer(ProtobufUtil.toServerName(serverName));
1846     RegionStateTransition.Builder transition = builder.addTransitionBuilder();
1847     transition.setTransitionCode(code);
1848     if (code == TransitionCode.OPENED && openSeqNum >= 0) {
1849       transition.setOpenSeqNum(openSeqNum);
1850     }
1851     for (HRegionInfo hri: hris) {
1852       transition.addRegionInfo(HRegionInfo.convert(hri));
1853     }
1854     ReportRegionStateTransitionRequest request = builder.build();
1855     while (keepLooping()) {
1856       RegionServerStatusService.BlockingInterface rss = rssStub;
1857       try {
1858         if (rss == null) {
1859           createRegionServerStatusStub();
1860           continue;
1861         }
1862         ReportRegionStateTransitionResponse response =
1863           rss.reportRegionStateTransition(null, request);
1864         if (response.hasErrorMessage()) {
1865           LOG.info("Failed to transition " + hris[0]
1866             + " to " + code + ": " + response.getErrorMessage());
1867           return false;
1868         }
1869         return true;
1870       } catch (ServiceException se) {
1871         IOException ioe = ProtobufUtil.getRemoteException(se);
1872         LOG.info("Failed to report region transition, will retry", ioe);
1873         if (rssStub == rss) {
1874           rssStub = null;
1875         }
1876       }
1877     }
1878     return false;
1879   }
1880 
1881   @Override
1882   public RpcServerInterface getRpcServer() {
1883     return rpcServices.rpcServer;
1884   }
1885 
1886   @VisibleForTesting
1887   public RSRpcServices getRSRpcServices() {
1888     return rpcServices;
1889   }
1890 
1891   /**
1892    * Cause the server to exit without closing the regions it is serving, without closing
1893    * the log it is using and without notifying the master. Used for unit testing and on
1894    * catastrophic events such as HDFS being yanked out from under HBase or an OOME.
1895    *
1896    * @param reason
1897    *          the reason we are aborting
1898    * @param cause
1899    *          the exception that caused the abort, or null
1900    */
1901   @Override
1902   public void abort(String reason, Throwable cause) {
1903     String msg = "ABORTING region server " + this + ": " + reason;
1904     if (cause != null) {
1905       LOG.fatal(msg, cause);
1906     } else {
1907       LOG.fatal(msg);
1908     }
1909     this.abortRequested = true;
1910     // HBASE-4014: show list of coprocessors that were loaded to help debug
1911     // regionserver crashes. Note that we're implicitly using
1912     // java.util.HashSet's toString() method to print the coprocessor names.
1913     LOG.fatal("RegionServer abort: loaded coprocessors are: " +
1914         CoprocessorHost.getLoadedCoprocessors());
1915     // Try to dump metrics on abort -- they might give a clue as to how the fatal error came about.
1916     try {
1917       LOG.info("Dump of metrics as JSON on abort: " + JSONBean.dumpRegionServerMetrics());
1918     } catch (MalformedObjectNameException | IOException e) {
1919       LOG.warn("Failed dumping metrics", e);
1920     }
1921 
1922     // Do our best to report our abort to the master, but this may not work
1923     try {
1924       if (cause != null) {
1925         msg += "\nCause:\n" + StringUtils.stringifyException(cause);
1926       }
1927       // Report to the master but only if we have already registered with the master.
1928       if (rssStub != null && this.serverName != null) {
1929         ReportRSFatalErrorRequest.Builder builder =
1930           ReportRSFatalErrorRequest.newBuilder();
1931         ServerName sn =
1932           ServerName.parseVersionedServerName(this.serverName.getVersionedBytes());
1933         builder.setServer(ProtobufUtil.toServerName(sn));
1934         builder.setErrorMessage(msg);
1935         rssStub.reportRSFatalError(null, builder.build());
1936       }
1937     } catch (Throwable t) {
1938       LOG.warn("Unable to report fatal error to master", t);
1939     }
1940     stop(reason);
1941   }
1942 
1943   /**
1944    * @see HRegionServer#abort(String, Throwable)
1945    */
1946   public void abort(String reason) {
1947     abort(reason, null);
1948   }
1949 
1950   @Override
1951   public boolean isAborted() {
1952     return this.abortRequested;
1953   }
1954 
1955   /*
1956    * Simulate a kill -9 of this server. Exits without closing regions or cleaning up
1957    * logs, but it does close the socket in case we want to bring up a server on the old
1958    * hostname+port immediately.
1959    */
1960   protected void kill() {
1961     this.killed = true;
1962     abort("Simulated kill");
1963   }
1964 
1965   /**
1966    * Wait on all threads to finish. Presumption is that all closes and stops
1967    * have already been called.
1968    */
1969   protected void stopServiceThreads() {
1970     if (this.nonceManagerChore != null) {
1971       Threads.shutdown(this.nonceManagerChore.getThread());
1972     }
1973     if (this.compactionChecker != null) {
1974       Threads.shutdown(this.compactionChecker.getThread());
1975     }
1976     if (this.periodicFlusher != null) {
1977       Threads.shutdown(this.periodicFlusher.getThread());
1978     }
1979     if (this.cacheFlusher != null) {
1980       this.cacheFlusher.join();
1981     }
1982     if (this.healthCheckChore != null) {
1983       Threads.shutdown(this.healthCheckChore.getThread());
1984     }
1985     if (this.spanReceiverHost != null) {
1986       this.spanReceiverHost.closeReceivers();
1987     }
1988     if (this.walRoller != null) {
1989       Threads.shutdown(this.walRoller.getThread());
1990     }
1991     final LogRoller metawalRoller = this.metawalRoller.get();
1992     if (metawalRoller != null) {
1993       Threads.shutdown(metawalRoller.getThread());
1994     }
1995     if (this.compactSplitThread != null) {
1996       this.compactSplitThread.join();
1997     }
1998     if (this.service != null) this.service.shutdown();
1999     if (this.replicationSourceHandler != null &&
2000         this.replicationSourceHandler == this.replicationSinkHandler) {
2001       this.replicationSourceHandler.stopReplicationService();
2002     } else {
2003       if (this.replicationSourceHandler != null) {
2004         this.replicationSourceHandler.stopReplicationService();
2005       }
2006       if (this.replicationSinkHandler != null) {
2007         this.replicationSinkHandler.stopReplicationService();
2008       }
2009     }
2010     if (this.storefileRefresher != null) {
2011       Threads.shutdown(this.storefileRefresher.getThread());
2012     }
2013   }
2014 
2015   /**
2016    * @return Return the object that implements the replication
2017    * source service.
2018    */
2019   ReplicationSourceService getReplicationSourceService() {
2020     return replicationSourceHandler;
2021   }
2022 
2023   /**
2024    * @return Return the object that implements the replication
2025    * sink service.
2026    */
2027   ReplicationSinkService getReplicationSinkService() {
2028     return replicationSinkHandler;
2029   }
2030 
2031   /**
2032    * Get the current master from ZooKeeper and open the RPC connection to it.
2033    *
2034    * This method will block until a master is available. You can break out of this
2035    * loop by requesting that the server stop.
2036    *
2037    * @return master + port, or null if server has been stopped
2038    */
2039   private synchronized ServerName createRegionServerStatusStub() {
2040     if (rssStub != null) {
2041       return masterAddressTracker.getMasterAddress();
2042     }
2043     ServerName sn = null;
2044     long previousLogTime = 0;
2045     boolean refresh = false; // for the first time, use cached data
2046     RegionServerStatusService.BlockingInterface intf = null;
2047     boolean interrupted = false;
2048     try {
2049       while (keepLooping()) {
2050         sn = this.masterAddressTracker.getMasterAddress(refresh);
2051         if (sn == null) {
2052           if (!keepLooping()) {
2053             // give up with no connection.
2054             LOG.debug("No master found and cluster is stopped; bailing out");
2055             return null;
2056           }
2057           if (System.currentTimeMillis() > (previousLogTime + 1000)) {
2058             LOG.debug("No master found; retry");
2059             previousLogTime = System.currentTimeMillis();
2060           }
2061           refresh = true; // let's try to pull it from ZK directly
2062           if (sleep(200)) {
2063             interrupted = true;
2064           }
2065           continue;
2066         }
2067 
2068         // If we are on the active master, use the shortcut
2069         if (this instanceof HMaster && sn.equals(getServerName())) {
2070           intf = ((HMaster)this).getMasterRpcServices();
2071           break;
2072         }
2073         try {
2074           BlockingRpcChannel channel =
2075             this.rpcClient.createBlockingRpcChannel(sn, userProvider.getCurrent(), operationTimeout);
2076           intf = RegionServerStatusService.newBlockingStub(channel);
2077           break;
2078         } catch (IOException e) {
2079           if (System.currentTimeMillis() > (previousLogTime + 1000)) {
2080             e = e instanceof RemoteException ?
2081               ((RemoteException)e).unwrapRemoteException() : e;
2082             if (e instanceof ServerNotRunningYetException) {
2083               LOG.info("Master isn't available yet, retrying");
2084             } else {
2085               LOG.warn("Unable to connect to master. Retrying. Error was:", e);
2086             }
2087             previousLogTime = System.currentTimeMillis();
2088           }
2089           if (sleep(200)) {
2090             interrupted = true;
2091           }
2092         }
2093       }
2094     } finally {
2095       if (interrupted) {
2096         Thread.currentThread().interrupt();
2097       }
2098     }
2099     rssStub = intf;
2100     return sn;
2101   }
2102 
2103   /**
2104    * @return True if we should break out of the loop because the cluster is going down,
2105    * this server has been stopped, or hdfs has gone bad.
2106    */
2107   private boolean keepLooping() {
2108     return !this.stopped && isClusterUp();
2109   }
2110 
2111   /*
2112    * Let the master know we're here. Run initialization using parameters passed
2113    * to us by the master.
2114    * @return The startup response, carrying key/value configurations from the master,
2115    * else null if we failed to register.
2116    * @throws IOException
2117    */
2118   private RegionServerStartupResponse reportForDuty() throws IOException {
2119     ServerName masterServerName = createRegionServerStatusStub();
2120     if (masterServerName == null) return null;
2121     RegionServerStartupResponse result = null;
2122     try {
2123       rpcServices.requestCount.set(0);
2124       LOG.info("reportForDuty to master=" + masterServerName + " with port="
2125         + rpcServices.isa.getPort() + ", startcode=" + this.startcode);
2126       long now = EnvironmentEdgeManager.currentTime();
2127       int port = rpcServices.isa.getPort();
2128       RegionServerStartupRequest.Builder request = RegionServerStartupRequest.newBuilder();
2129       request.setPort(port);
2130       request.setServerStartCode(this.startcode);
2131       request.setServerCurrentTime(now);
2132       result = this.rssStub.regionServerStartup(null, request.build());
2133     } catch (ServiceException se) {
2134       IOException ioe = ProtobufUtil.getRemoteException(se);
2135       if (ioe instanceof ClockOutOfSyncException) {
2136         LOG.fatal("Master rejected startup because clock is out of sync", ioe);
2137         // Re-throw IOE will cause RS to abort
2138         throw ioe;
2139       } else if (ioe instanceof ServerNotRunningYetException) {
2140         LOG.debug("Master is not running yet");
2141       } else {
2142         LOG.warn("Error telling master we are up", se);
2143       }
2144     }
2145     return result;
2146   }
2147 
2148   @Override
2149   public long getLastSequenceId(byte[] encodedRegionName) {
2150     long lastFlushedSequenceId = -1L;
2151     try {
2152       GetLastFlushedSequenceIdRequest req = RequestConverter
2153           .buildGetLastFlushedSequenceIdRequest(encodedRegionName);
2154       RegionServerStatusService.BlockingInterface rss = rssStub;
2155       if (rss == null) { // Try to connect one more time
2156         createRegionServerStatusStub();
2157         rss = rssStub;
2158         if (rss == null) {
2159           // Still no luck, we tried
2160           LOG.warn("Unable to connect to the master to check "
2161             + "the last flushed sequence id");
2162           return -1L;
2163         }
2164       }
2165       lastFlushedSequenceId = rss.getLastFlushedSequenceId(null, req)
2166           .getLastFlushedSequenceId();
2167     } catch (ServiceException e) {
2168       lastFlushedSequenceId = -1L;
2169       LOG.warn("Unable to connect to the master to check "
2170         + "the last flushed sequence id", e);
2171     }
2172     return lastFlushedSequenceId;
2173   }
2174 
2175   /**
2176    * Closes all regions.  Called on our way out.
2177    * Assumes that it's not possible for new regions to be added to onlineRegions
2178    * while this method runs.
2179    */
2180   protected void closeAllRegions(final boolean abort) {
2181     closeUserRegions(abort);
2182     closeMetaTableRegions(abort);
2183   }
2184 
2185   /**
2186    * Close meta region if we carry it
2187    * @param abort Whether we're running an abort.
2188    */
2189   void closeMetaTableRegions(final boolean abort) {
2190     HRegion meta = null;
2191     this.lock.writeLock().lock();
2192     try {
2193       for (Map.Entry<String, HRegion> e: onlineRegions.entrySet()) {
2194         HRegionInfo hri = e.getValue().getRegionInfo();
2195         if (hri.isMetaRegion()) {
2196           meta = e.getValue();
2197         }
2198         if (meta != null) break;
2199       }
2200     } finally {
2201       this.lock.writeLock().unlock();
2202     }
2203     if (meta != null) closeRegionIgnoreErrors(meta.getRegionInfo(), abort);
2204   }
2205 
2206   /**
2207    * Schedule closes on all user regions.
2208    * Should be safe to call multiple times because it won't close regions
2209    * that are already closed or that are closing.
2210    * @param abort Whether we're running an abort.
2211    */
2212   void closeUserRegions(final boolean abort) {
2213     this.lock.writeLock().lock();
2214     try {
2215       for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
2216         HRegion r = e.getValue();
2217         if (!r.getRegionInfo().isMetaTable() && r.isAvailable()) {
2218           // Don't update zk with this close transition; pass false.
2219           closeRegionIgnoreErrors(r.getRegionInfo(), abort);
2220         }
2221       }
2222     } finally {
2223       this.lock.writeLock().unlock();
2224     }
2225   }
2226 
2227   /** @return the info server */
2228   public InfoServer getInfoServer() {
2229     return infoServer;
2230   }
2231 
2232   /**
2233    * @return true if a stop has been requested.
2234    */
2235   @Override
2236   public boolean isStopped() {
2237     return this.stopped;
2238   }
2239 
2240   @Override
2241   public boolean isStopping() {
2242     return this.stopping;
2243   }
2244 
2245   @Override
2246   public Map<String, HRegion> getRecoveringRegions() {
2247     return this.recoveringRegions;
2248   }
2249 
2250   /**
2251    *
2252    * @return the configuration
2253    */
2254   @Override
2255   public Configuration getConfiguration() {
2256     return conf;
2257   }
2258 
2259   /** @return the write lock for the server */
2260   ReentrantReadWriteLock.WriteLock getWriteLock() {
2261     return lock.writeLock();
2262   }
2263 
2264   public int getNumberOfOnlineRegions() {
2265     return this.onlineRegions.size();
2266   }
2267 
2268   boolean isOnlineRegionsEmpty() {
2269     return this.onlineRegions.isEmpty();
2270   }
2271 
2272   /**
2273    * For tests, web ui and metrics.
2274    * This method will only work if HRegionServer is in the same JVM as client;
2275    * HRegion cannot be serialized to cross an rpc.
2276    */
2277   public Collection<HRegion> getOnlineRegionsLocalContext() {
2278     Collection<HRegion> regions = this.onlineRegions.values();
2279     return Collections.unmodifiableCollection(regions);
2280   }
2281 
2282   @Override
2283   public void addToOnlineRegions(HRegion region) {
2284     this.onlineRegions.put(region.getRegionInfo().getEncodedName(), region);
2285     configurationManager.registerObserver(region);
2286   }
2287 
2288   /**
2289    * @return A new Map of online regions sorted by region size with the first entry being the
2290    * biggest.  If two regions are the same size, then the last one found wins; i.e. this method
2291    * may NOT return all regions.
2292    */
2293   SortedMap<Long, HRegion> getCopyOfOnlineRegionsSortedBySize() {
2294     // we'll sort the regions in reverse
2295     SortedMap<Long, HRegion> sortedRegions = new TreeMap<Long, HRegion>(
2296         new Comparator<Long>() {
2297           @Override
2298           public int compare(Long a, Long b) {
2299             return b.compareTo(a);
2300           }
2301         });
2302     // Copy over all regions. Regions are sorted by size with biggest first.
2303     for (HRegion region : this.onlineRegions.values()) {
2304       sortedRegions.put(region.memstoreSize.get(), region);
2305     }
2306     return sortedRegions;
2307   }
2308 
2309   /**
2310    * @return time stamp in millis of when this region server was started
2311    */
2312   public long getStartcode() {
2313     return this.startcode;
2314   }
2315 
2316   /** @return reference to FlushRequester */
2317   @Override
2318   public FlushRequester getFlushRequester() {
2319     return this.cacheFlusher;
2320   }
2321 
2322   /**
2323    * Get the top N most loaded regions this server is serving so we can tell the
2324    * master which regions it can reallocate if we're overloaded. TODO: actually
2325    * calculate which regions are most loaded. (Right now, we're just grabbing
2326    * the first N regions being served regardless of load.)
2327    */
2328   protected HRegionInfo[] getMostLoadedRegions() {
2329     ArrayList<HRegionInfo> regions = new ArrayList<HRegionInfo>();
2330     for (HRegion r : onlineRegions.values()) {
2331       if (!r.isAvailable()) {
2332         continue;
2333       }
2334       if (regions.size() < numRegionsToReport) {
2335         regions.add(r.getRegionInfo());
2336       } else {
2337         break;
2338       }
2339     }
2340     return regions.toArray(new HRegionInfo[regions.size()]);
2341   }
2342 
2343   @Override
2344   public Leases getLeases() {
2345     return leases;
2346   }
2347 
2348   /**
2349    * @return Return the rootDir.
2350    */
2351   protected Path getRootDir() {
2352     return rootDir;
2353   }
2354 
2355   /**
2356    * @return Return the fs.
2357    */
2358   @Override
2359   public FileSystem getFileSystem() {
2360     return fs;
2361   }
2362 
2363   @Override
2364   public String toString() {
2365     return getServerName().toString();
2366   }
2367 
2368   /**
2369    * Interval at which threads should run
2370    *
2371    * @return the interval
2372    */
2373   public int getThreadWakeFrequency() {
2374     return threadWakeFrequency;
2375   }
2376 
2377   @Override
2378   public ZooKeeperWatcher getZooKeeper() {
2379     return zooKeeper;
2380   }
2381 
2382   @Override
2383   public BaseCoordinatedStateManager getCoordinatedStateManager() {
2384     return csm;
2385   }
2386 
2387   @Override
2388   public ServerName getServerName() {
2389     return serverName;
2390   }
2391 
2392   @Override
2393   public CompactionRequestor getCompactionRequester() {
2394     return this.compactSplitThread;
2395   }
2396 
2397   public RegionServerCoprocessorHost getRegionServerCoprocessorHost(){
2398     return this.rsHost;
2399   }
2400 
2401   @Override
2402   public ConcurrentMap<byte[], Boolean> getRegionsInTransitionInRS() {
2403     return this.regionsInTransitionInRS;
2404   }
2405 
2406   @Override
2407   public ExecutorService getExecutorService() {
2408     return service;
2409   }
2410 
2411   //
2412   // Main program and support routines
2413   //
2414 
2415   /**
2416    * Load the replication service objects, if any
2417    */
2418   static private void createNewReplicationInstance(Configuration conf,
2419     HRegionServer server, FileSystem fs, Path logDir, Path oldLogDir) throws IOException{
2420 
2421     // If replication is not enabled, then return immediately.
2422     if (!conf.getBoolean(HConstants.REPLICATION_ENABLE_KEY,
2423         HConstants.REPLICATION_ENABLE_DEFAULT)) {
2424       return;
2425     }
2426 
2427     // read in the name of the source replication class from the config file.
2428     String sourceClassname = conf.get(HConstants.REPLICATION_SOURCE_SERVICE_CLASSNAME,
2429                                HConstants.REPLICATION_SERVICE_CLASSNAME_DEFAULT);
2430 
2431     // read in the name of the sink replication class from the config file.
2432     String sinkClassname = conf.get(HConstants.REPLICATION_SINK_SERVICE_CLASSNAME,
2433                              HConstants.REPLICATION_SERVICE_CLASSNAME_DEFAULT);
2434 
2435     // If both the sink and the source class names are the same, then instantiate
2436     // only one object.
2437     if (sourceClassname.equals(sinkClassname)) {
2438       server.replicationSourceHandler = (ReplicationSourceService)
2439                                          newReplicationInstance(sourceClassname,
2440                                          conf, server, fs, logDir, oldLogDir);
2441       server.replicationSinkHandler = (ReplicationSinkService)
2442                                          server.replicationSourceHandler;
2443     } else {
2444       server.replicationSourceHandler = (ReplicationSourceService)
2445                                          newReplicationInstance(sourceClassname,
2446                                          conf, server, fs, logDir, oldLogDir);
2447       server.replicationSinkHandler = (ReplicationSinkService)
2448                                          newReplicationInstance(sinkClassname,
2449                                          conf, server, fs, logDir, oldLogDir);
2450     }
2451   }
2452 
2453   static private ReplicationService newReplicationInstance(String classname,
2454     Configuration conf, HRegionServer server, FileSystem fs, Path logDir,
2455     Path oldLogDir) throws IOException{
2456 
2457     Class<?> clazz = null;
2458     try {
2459       ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
2460       clazz = Class.forName(classname, true, classLoader);
2461     } catch (java.lang.ClassNotFoundException nfe) {
2462       throw new IOException("Could not find class for " + classname);
2463     }
2464 
2465     // create an instance of the replication object.
2466     ReplicationService service = (ReplicationService)
2467                               ReflectionUtils.newInstance(clazz, conf);
2468     service.initialize(server, fs, logDir, oldLogDir);
2469     return service;
2470   }
2471 
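  // Configuration sketch (illustrative): a custom replication implementation can be plugged
  // in by naming its class before the server starts; when both keys resolve to the same
  // class, createNewReplicationInstance() above instantiates a single object and uses it as
  // both source and sink.
  //
  //   conf.set(HConstants.REPLICATION_SOURCE_SERVICE_CLASSNAME, "org.example.MyReplicationService");
  //   conf.set(HConstants.REPLICATION_SINK_SERVICE_CLASSNAME, "org.example.MyReplicationService");
  //
  // "org.example.MyReplicationService" is a hypothetical class used only for illustration; it
  // would have to implement both ReplicationSourceService and ReplicationSinkService.
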
2472   /**
2473    * Utility for constructing an instance of the passed HRegionServer class.
2474    *
2475    * @param regionServerClass
2476    * @param conf2
2477    * @return HRegionServer instance.
2478    */
2479   public static HRegionServer constructRegionServer(
2480       Class<? extends HRegionServer> regionServerClass,
2481       final Configuration conf2, CoordinatedStateManager cp) {
2482     try {
2483       Constructor<? extends HRegionServer> c = regionServerClass
2484           .getConstructor(Configuration.class, CoordinatedStateManager.class);
2485       return c.newInstance(conf2, cp);
2486     } catch (Exception e) {
2487       throw new RuntimeException("Failed construction of RegionServer: "
2488           + regionServerClass.toString(), e);
2489     }
2490   }
2491 
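  // Illustrative usage sketch: main() below resolves the implementation class from
  // HConstants.REGION_SERVER_IMPL and hands it to this factory, so a subclass can be swapped
  // in without changing the command line, e.g.
  //
  //   conf.setClass(HConstants.REGION_SERVER_IMPL, MyRegionServer.class, HRegionServer.class);
  //   CoordinatedStateManager csm = CoordinatedStateManagerFactory.getCoordinatedStateManager(conf);
  //   HRegionServer rs = HRegionServer.constructRegionServer(MyRegionServer.class, conf, csm);
  //
  // MyRegionServer is a hypothetical subclass used only for illustration; it must expose a
  // (Configuration, CoordinatedStateManager) constructor for the reflection above to succeed.
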
2492   /**
2493    * @see org.apache.hadoop.hbase.regionserver.HRegionServerCommandLine
2494    */
2495   public static void main(String[] args) throws Exception {
2496     VersionInfo.logVersion();
2497     Configuration conf = HBaseConfiguration.create();
2498     @SuppressWarnings("unchecked")
2499     Class<? extends HRegionServer> regionServerClass = (Class<? extends HRegionServer>) conf
2500         .getClass(HConstants.REGION_SERVER_IMPL, HRegionServer.class);
2501 
2502     new HRegionServerCommandLine(regionServerClass).doMain(args);
2503   }
2504 
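  // Operational note (illustrative): in a normal deployment this main() is not invoked by
  // hand; the region server is typically launched through the hbase wrapper script
  // ("bin/hbase regionserver start"), which ends up in HRegionServerCommandLine.doMain().
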
2505   /**
2506    * Gets the online regions of the specified table.
2507    * This method looks at the in-memory onlineRegions.  It does not go to <code>hbase:meta</code>.
2508    * Only returns <em>online</em> regions.  If a region on this table has been
2509    * closed during a disable, etc., it will not be included in the returned list.
2510    * So, the returned list may not necessarily be ALL regions in this table, it's
2511    * all the ONLINE regions in the table.
2512    * @param tableName
2513    * @return Online regions from <code>tableName</code>
2514    */
2515   @Override
2516   public List<HRegion> getOnlineRegions(TableName tableName) {
2517     List<HRegion> tableRegions = new ArrayList<HRegion>();
2518     synchronized (this.onlineRegions) {
2519       for (HRegion region: this.onlineRegions.values()) {
2520         HRegionInfo regionInfo = region.getRegionInfo();
2521         if (regionInfo.getTable().equals(tableName)) {
2522           tableRegions.add(region);
2523         }
2524       }
2525     }
2526     return tableRegions;
2527   }
2528 
2529   // used by org/apache/hbase/tmpl/regionserver/RSStatusTmpl.jamon (HBASE-4070).
2530   public String[] getRegionServerCoprocessors() {
2531     TreeSet<String> coprocessors = new TreeSet<String>();
2532     try {
2533       coprocessors.addAll(getWAL(null).getCoprocessorHost().getCoprocessors());
2534     } catch (IOException exception) {
2535       LOG.warn("Exception attempting to fetch wal coprocessor information for the common wal; " +
2536           "skipping.");
2537       LOG.debug("Exception details for failure to fetch wal coprocessor information.", exception);
2538     }
2539     Collection<HRegion> regions = getOnlineRegionsLocalContext();
2540     for (HRegion region: regions) {
2541       coprocessors.addAll(region.getCoprocessorHost().getCoprocessors());
2542       try {
2543         coprocessors.addAll(getWAL(region.getRegionInfo()).getCoprocessorHost().getCoprocessors());
2544       } catch (IOException exception) {
2545         LOG.warn("Exception attempting to fetch wal coprocessor information for region " + region +
2546             "; skipping.");
2547         LOG.debug("Exception details for failure to fetch wal coprocessor information.", exception);
2548       }
2549     }
2550     return coprocessors.toArray(new String[coprocessors.size()]);
2551   }
2552 
2553   /**
2554    * Try to close the region; log a warning on failure but continue.
2555    * @param region Region to close
2556    * @param abort Whether we're running an abort.
2556    */
2557   private void closeRegionIgnoreErrors(HRegionInfo region, final boolean abort) {
2558     try {
2559       CloseRegionCoordination.CloseRegionDetails details =
2560         csm.getCloseRegionCoordination().getDetaultDetails();
2561       if (!closeRegion(region.getEncodedName(), abort, details, null)) {
2562         LOG.warn("Failed to close " + region.getRegionNameAsString() +
2563             " - ignoring and continuing");
2564       }
2565     } catch (IOException e) {
2566       LOG.warn("Failed to close " + region.getRegionNameAsString() +
2567           " - ignoring and continuing", e);
2568     }
2569   }
2570 
2571   /**
2572    * Asynchronously close a region; this can be called from the master or internally by the
2573    * regionserver when stopping. If called from the master, the region will update the znode status.
2574    *
2575    * <p>
2576    * If an opening was in progress, this method will cancel it, but will not start a new close. The
2577    * coprocessors are not called in this case. A NotServingRegionException is thrown.
2578    * </p>
2579    *
2580    * <p>
2581    *   If a close was in progress, this new request will be ignored, and an exception thrown.
2582    * </p>
2583    *
2584    * @param encodedName Region to close
2585    * @param abort True if we are aborting
2586    * @param crd details of the close-region coordination task
2587    * @return True if closed a region.
2588    * @throws NotServingRegionException if the region is not online
2589    * @throws RegionAlreadyInTransitionException if the region is already closing
2590    */
2591   protected boolean closeRegion(String encodedName, final boolean abort,
2592       CloseRegionCoordination.CloseRegionDetails crd, final ServerName sn)
2593       throws NotServingRegionException, RegionAlreadyInTransitionException {
2594     //Check for permissions to close.
2595     HRegion actualRegion = this.getFromOnlineRegions(encodedName);
2596     if ((actualRegion != null) && (actualRegion.getCoprocessorHost() != null)) {
2597       try {
2598         actualRegion.getCoprocessorHost().preClose(false);
2599       } catch (IOException exp) {
2600         LOG.warn("Unable to close region: the coprocessor launched an error ", exp);
2601         return false;
2602       }
2603     }
2604 
2605     final Boolean previous = this.regionsInTransitionInRS.putIfAbsent(encodedName.getBytes(),
2606         Boolean.FALSE);
2607 
2608     if (Boolean.TRUE.equals(previous)) {
2609       LOG.info("Received CLOSE for the region: " + encodedName + ", which we are already " +
2610           "trying to OPEN. Cancelling OPENING.");
2611       if (!regionsInTransitionInRS.replace(encodedName.getBytes(), previous, Boolean.FALSE)){
2612         // The replace failed. That should be an exceptional case, but theoretically it can happen.
2613         // We're going to try to do a standard close then.
2614         LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." +
2615             " Doing a standard close now");
2616         return closeRegion(encodedName, abort, crd, sn);
2617       }
2618       // Let's get the region from the online region list again
2619       actualRegion = this.getFromOnlineRegions(encodedName);
2620       if (actualRegion == null) { // The cancelled open never made it online; nothing to close.
2621         LOG.info("The opening previously in progress has been cancelled by a CLOSE request.");
2622         // The master deletes the znode when it receives this exception.
2623         throw new RegionAlreadyInTransitionException("The region " + encodedName +
2624           " was opening but not yet served. Opening is cancelled.");
2625       }
2626     } else if (Boolean.FALSE.equals(previous)) {
2627       LOG.info("Received CLOSE for the region: " + encodedName +
2628         ", which we are already trying to CLOSE, but not completed yet");
2629       // The master will retry till the region is closed. We need to do this since
2630       // the region could fail to close somehow. If we mark the region closed in master
2631       // while it is not, there could be data loss.
2632       // If the region stuck in closing for a while, and master runs out of retries,
2633       // master will move the region to failed_to_close. Later on, if the region
2634       // is indeed closed, master can properly re-assign it.
2635       throw new RegionAlreadyInTransitionException("The region " + encodedName +
2636         " was already closing. New CLOSE request is ignored.");
2637     }
2638 
2639     if (actualRegion == null) {
2640       LOG.error("Received CLOSE for a region which is not online, and we're not opening.");
2641       this.regionsInTransitionInRS.remove(encodedName.getBytes());
2642       // The master deletes the znode when it receives this exception.
2643       throw new NotServingRegionException("The region " + encodedName +
2644           " is not online, and is not opening.");
2645     }
2646 
2647     CloseRegionHandler crh;
2648     final HRegionInfo hri = actualRegion.getRegionInfo();
2649     if (hri.isMetaRegion()) {
2650       crh = new CloseMetaHandler(this, this, hri, abort,
2651         csm.getCloseRegionCoordination(), crd);
2652     } else {
2653       crh = new CloseRegionHandler(this, this, hri, abort,
2654         csm.getCloseRegionCoordination(), crd, sn);
2655     }
2656     this.service.submit(crh);
2657     return true;
2658   }
2659 
2660   /**
2661    * @param regionName region name in its binary form
2662    * @return HRegion for the passed binary <code>regionName</code> or null if
2663    *         named region is not a member of the online regions.
2664    */
2665   public HRegion getOnlineRegion(final byte[] regionName) {
2666     String encodedRegionName = HRegionInfo.encodeRegionName(regionName);
2667     return this.onlineRegions.get(encodedRegionName);
2668   }
2669 
2670   public InetSocketAddress[] getRegionBlockLocations(final String encodedRegionName) {
2671     return this.regionFavoredNodesMap.get(encodedRegionName);
2672   }
2673 
2674   @Override
2675   public HRegion getFromOnlineRegions(final String encodedRegionName) {
2676     return this.onlineRegions.get(encodedRegionName);
2677   }
2678 
2679 
2680   @Override
2681   public boolean removeFromOnlineRegions(final HRegion r, ServerName destination) {
2682     HRegion toReturn = this.onlineRegions.remove(r.getRegionInfo().getEncodedName());
2683 
2684     if (destination != null) {
2685       try {
2686         WAL wal = getWAL(r.getRegionInfo());
2687         long closeSeqNum = wal.getEarliestMemstoreSeqNum(r.getRegionInfo().getEncodedNameAsBytes());
2688         if (closeSeqNum == HConstants.NO_SEQNUM) {
2689           // No edits in WAL for this region; get the sequence number when the region was opened.
2690           closeSeqNum = r.getOpenSeqNum();
2691           if (closeSeqNum == HConstants.NO_SEQNUM) {
2692             closeSeqNum = 0;
2693           }
2694         }
2695         addToMovedRegions(r.getRegionInfo().getEncodedName(), destination, closeSeqNum);
2696       } catch (IOException exception) {
2697         LOG.error("Could not retrieve WAL information for region " + r.getRegionInfo() +
2698             "; not adding to moved regions.");
2699         LOG.debug("Exception details for failure to get wal", exception);
2700       }
2701     }
2702     this.regionFavoredNodesMap.remove(r.getRegionInfo().getEncodedName());
2703     return toReturn != null;
2704   }
2705 
2706   /**
2707    * Protected utility method for safely obtaining an HRegion handle.
2708    *
2709    * @param regionName
2710    *          Name of online {@link HRegion} to return
2711    * @return {@link HRegion} for <code>regionName</code>
2712    * @throws NotServingRegionException if the named region is not online on this server
2713    */
2714   protected HRegion getRegion(final byte[] regionName)
2715       throws NotServingRegionException {
2716     String encodedRegionName = HRegionInfo.encodeRegionName(regionName);
2717     return getRegionByEncodedName(regionName, encodedRegionName);
2718   }
2719 
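       /**
        * @param encodedRegionName the encoded name of an online region
        * @return the online {@link HRegion} for <code>encodedRegionName</code>
        * @throws NotServingRegionException if the region is not online on this server
        */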
2720   public HRegion getRegionByEncodedName(String encodedRegionName)
2721       throws NotServingRegionException {
2722     return getRegionByEncodedName(null, encodedRegionName);
2723   }
2724 
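       /**
        * @param regionName the full binary region name, or null if only the encoded name is known
        *          (used only to build a readable exception message)
        * @param encodedRegionName the encoded name of an online region
        * @return the online {@link HRegion} for <code>encodedRegionName</code>
        * @throws NotServingRegionException if the region is not online here; RegionMovedException
        *           or RegionOpeningException (both subtypes of it) are thrown when the region has
        *           moved away or is still opening
        */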
2725   protected HRegion getRegionByEncodedName(byte[] regionName, String encodedRegionName)
2726     throws NotServingRegionException {
2727     HRegion region = this.onlineRegions.get(encodedRegionName);
2728     if (region == null) {
2729       MovedRegionInfo moveInfo = getMovedRegion(encodedRegionName);
2730       if (moveInfo != null) {
2731         throw new RegionMovedException(moveInfo.getServerName(), moveInfo.getSeqNum());
2732       }
2733       Boolean isOpening = this.regionsInTransitionInRS.get(Bytes.toBytes(encodedRegionName));
2734       String regionNameStr = regionName == null?
2735         encodedRegionName: Bytes.toStringBinary(regionName);
2736       if (isOpening != null && isOpening.booleanValue()) {
2737         throw new RegionOpeningException("Region " + regionNameStr +
2738           " is opening on " + this.serverName);
2739       }
2740       throw new NotServingRegionException("Region " + regionNameStr +
2741         " is not online on " + this.serverName);
2742     }
2743     return region;
2744   }
2745 
2746   /*
2747    * Cleanup after a Throwable caught while invoking a method. Logs the Throwable
2748    * and checks for OOME and filesystem availability; <code>t</code> is returned unchanged.
2749    *
2750    * @param t Throwable
2751    *
2752    * @param msg Message to log in error. Can be null.
2753    *
2754    * @return The passed <code>t</code>; methods can only let out IOEs, so callers convert it with convertThrowableToIOE.
2755    */
2756   private Throwable cleanup(final Throwable t, final String msg) {
2757     // Don't log as error if NSRE; NSRE is 'normal' operation.
2758     if (t instanceof NotServingRegionException) {
2759       LOG.debug("NotServingRegionException; " + t.getMessage());
2760       return t;
2761     }
2762     if (msg == null) {
2763       LOG.error("", RemoteExceptionHandler.checkThrowable(t));
2764     } else {
2765       LOG.error(msg, RemoteExceptionHandler.checkThrowable(t));
2766     }
2767     if (!rpcServices.checkOOME(t)) {
2768       checkFileSystem();
2769     }
2770     return t;
2771   }
2772 
2773   /*
2774    * @param t Throwable to convert
2775    *
2776    * @param msg Message to put in new IOE if passed <code>t</code> is not an IOE
2777    *
2778    * @return <code>t</code> as an IOException, wrapped in a new IOException if it isn't one already.
2779    */
2780   protected IOException convertThrowableToIOE(final Throwable t, final String msg) {
2781     return (t instanceof IOException ? (IOException) t : msg == null
2782         || msg.length() == 0 ? new IOException(t) : new IOException(msg, t));
2783   }
2784 
2785   /**
2786    * Checks to see if the file system is still accessible. If not, aborts the
2787    * server and marks the file system as unavailable.
2788    *
2789    * @return false if file system is not available
2790    */
2791   public boolean checkFileSystem() {
2792     if (this.fsOk && this.fs != null) {
2793       try {
2794         FSUtils.checkFileSystemAvailable(this.fs);
2795       } catch (IOException e) {
2796         abort("File System not available", e);
2797         this.fsOk = false;
2798       }
2799     }
2800     return this.fsOk;
2801   }
2802 
2803   @Override
2804   public void updateRegionFavoredNodesMapping(String encodedRegionName,
2805       List<org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.ServerName> favoredNodes) {
2806     InetSocketAddress[] addr = new InetSocketAddress[favoredNodes.size()];
2807     // Refer to the comment on the declaration of regionFavoredNodesMap on why
2808     // it is a map of region name to InetSocketAddress[]
2809     for (int i = 0; i < favoredNodes.size(); i++) {
2810       addr[i] = InetSocketAddress.createUnresolved(favoredNodes.get(i).getHostName(),
2811           favoredNodes.get(i).getPort());
2812     }
2813     regionFavoredNodesMap.put(encodedRegionName, addr);
2814   }
2815 
2816   /**
2817    * Return the favored nodes for a region given its encoded name. Look at the
2818    * comment around {@link #regionFavoredNodesMap} on why the value is an InetSocketAddress[].
2819    * @param encodedRegionName the encoded name of the region
2820    * @return array of favored locations
2821    */
2822   @Override
2823   public InetSocketAddress[] getFavoredNodesForRegion(String encodedRegionName) {
2824     return regionFavoredNodesMap.get(encodedRegionName);
2825   }
2826 
2827   @Override
2828   public ServerNonceManager getNonceManager() {
2829     return this.nonceManager;
2830   }
2831 
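       /**
        * Destination server and close sequence number remembered for a region that was closed
        * because of a move, together with the time the move was recorded.
        */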
2832   private static class MovedRegionInfo {
2833     private final ServerName serverName;
2834     private final long seqNum;
2835     private final long ts;
2836 
2837     public MovedRegionInfo(ServerName serverName, long closeSeqNum) {
2838       this.serverName = serverName;
2839       this.seqNum = closeSeqNum;
2840       ts = EnvironmentEdgeManager.currentTime();
2841     }
2842 
2843     public ServerName getServerName() {
2844       return serverName;
2845     }
2846 
2847     public long getSeqNum() {
2848       return seqNum;
2849     }
2850 
2851     public long getMoveTime() {
2852       return ts;
2853     }
2854   }
2855 
2856   // This map contains all the regions that we closed for a move.
2857   //  We record the time of the move so that we don't keep information that is too old.
2858   protected Map<String, MovedRegionInfo> movedRegions =
2859       new ConcurrentHashMap<String, MovedRegionInfo>(3000);
2860 
2861   // We need a timeout, otherwise we risk giving out wrong information: that would double
2862   //  the number of network calls instead of reducing them.
2863   private static final int TIMEOUT_REGION_MOVED = (2 * 60 * 1000);
2864 
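       /**
        * Record that a region has been moved to another server, so that clients contacting this
        * server for the region can be redirected. A move to ourselves is ignored.
        * @param encodedName encoded name of the moved region
        * @param destination server the region moved to
        * @param closeSeqNum sequence number at which the region was closed here
        */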
2865   protected void addToMovedRegions(String encodedName, ServerName destination, long closeSeqNum) {
2866     if (ServerName.isSameHostnameAndPort(destination, this.getServerName())) {
2867       LOG.warn("Not adding moved region record: " + encodedName + " to self.");
2868       return;
2869     }
2870     LOG.info("Adding moved region record: "
2871       + encodedName + " to " + destination + " as of " + closeSeqNum);
2872     movedRegions.put(encodedName, new MovedRegionInfo(destination, closeSeqNum));
2873   }
2874 
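       /** Stop tracking the given region as moved away from this server. */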
2875   void removeFromMovedRegions(String encodedName) {
2876     movedRegions.remove(encodedName);
2877   }
2878 
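       /**
        * @return the recorded move destination for the region, or null if there is none or the
        *         record is older than TIMEOUT_REGION_MOVED (expired records are removed).
        */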
2879   private MovedRegionInfo getMovedRegion(final String encodedRegionName) {
2880     MovedRegionInfo dest = movedRegions.get(encodedRegionName);
2881 
2882     long now = EnvironmentEdgeManager.currentTime();
2883     if (dest != null) {
2884       if (dest.getMoveTime() > (now - TIMEOUT_REGION_MOVED)) {
2885         return dest;
2886       } else {
2887         movedRegions.remove(encodedRegionName);
2888       }
2889     }
2890 
2891     return null;
2892   }
2893 
2894   /**
2895    * Remove the expired entries from the moved regions list.
2896    */
2897   protected void cleanMovedRegions() {
2898     final long cutOff = EnvironmentEdgeManager.currentTime() - TIMEOUT_REGION_MOVED;
2899     Iterator<Entry<String, MovedRegionInfo>> it = movedRegions.entrySet().iterator();
2900 
2901     while (it.hasNext()){
2902       Map.Entry<String, MovedRegionInfo> e = it.next();
2903       if (e.getValue().getMoveTime() < cutOff) {
2904         it.remove();
2905       }
2906     }
2907   }
2908 
2909   /**
2910    * Chore that periodically cleans the moved region cache.
2911    */
2912   protected static class MovedRegionsCleaner extends Chore implements Stoppable {
2913     private HRegionServer regionServer;
2914     Stoppable stoppable;
2915 
2916     private MovedRegionsCleaner(
2917       HRegionServer regionServer, Stoppable stoppable){
2918       super("MovedRegionsCleaner for region "+regionServer, TIMEOUT_REGION_MOVED, stoppable);
2919       this.regionServer = regionServer;
2920       this.stoppable = stoppable;
2921     }
2922 
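         /**
          * Creates a cleaner chore backed by its own Stoppable, so the chore can be stopped
          * independently of the region server it cleans for.
          */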
2923     static MovedRegionsCleaner createAndStart(HRegionServer rs){
2924       Stoppable stoppable = new Stoppable() {
2925         private volatile boolean isStopped = false;
2926         @Override public void stop(String why) { isStopped = true;}
2927         @Override public boolean isStopped() {return isStopped;}
2928       };
2929 
2930       return new MovedRegionsCleaner(rs, stoppable);
2931     }
2932 
2933     @Override
2934     protected void chore() {
2935       regionServer.cleanMovedRegions();
2936     }
2937 
2938     @Override
2939     public void stop(String why) {
2940       stoppable.stop(why);
2941     }
2942 
2943     @Override
2944     public boolean isStopped() {
2945       return stoppable.isStopped();
2946     }
2947   }
2948 
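       /** @return the path of this region server's ephemeral znode under the rs znode. */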
2949   private String getMyEphemeralNodePath() {
2950     return ZKUtil.joinZNode(this.zooKeeper.rsZNode, getServerName().toString());
2951   }
2952 
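       /** @return true if a health check script location is configured. */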
2953   private boolean isHealthCheckerConfigured() {
2954     String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
2955     return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
2956   }
2957 
2958   /**
2959    * @return the underlying {@link CompactSplitThread} for this server
2960    */
2961   public CompactSplitThread getCompactSplitThread() {
2962     return this.compactSplitThread;
2963   }
2964 
2965   /**
2966    * A helper function to store the last flushed sequence Id with the previous failed RS for a
2967    * recovering region. The Id is used to skip WAL edits that have already been flushed. Since a
2968    * flushed sequence id is only valid for a single RS, we associate the Id with the corresponding failed RS.
2969    * @throws KeeperException
2970    * @throws IOException
2971    */
2972   private void updateRecoveringRegionLastFlushedSequenceId(HRegion r) throws KeeperException,
2973       IOException {
2974     if (!r.isRecovering()) {
2975       // return immediately for non-recovering regions
2976       return;
2977     }
2978 
2979     HRegionInfo region = r.getRegionInfo();
2980     ZooKeeperWatcher zkw = getZooKeeper();
2981     String previousRSName = this.getLastFailedRSFromZK(region.getEncodedName());
2982     Map<byte[], Long> maxSeqIdInStores = r.getMaxStoreSeqIdForLogReplay();
2983     long minSeqIdForLogReplay = -1;
2984     for (Long storeSeqIdForReplay : maxSeqIdInStores.values()) {
2985       if (minSeqIdForLogReplay == -1 || storeSeqIdForReplay < minSeqIdForLogReplay) {
2986         minSeqIdForLogReplay = storeSeqIdForReplay;
2987       }
2988     }
2989 
2990     try {
2991       long lastRecordedFlushedSequenceId = -1;
2992       String nodePath = ZKUtil.joinZNode(this.zooKeeper.recoveringRegionsZNode,
2993         region.getEncodedName());
2994       // recovering-region level
2995       byte[] data;
2996       try {
2997         data = ZKUtil.getData(zkw, nodePath);
2998       } catch (InterruptedException e) {
2999         throw new InterruptedIOException();
3000       }
3001       if (data != null) {
3002         lastRecordedFlushedSequenceId = ZKSplitLog.parseLastFlushedSequenceIdFrom(data);
3003       }
3004       if (data == null || lastRecordedFlushedSequenceId < minSeqIdForLogReplay) {
3005         ZKUtil.setData(zkw, nodePath, ZKUtil.positionToByteArray(minSeqIdForLogReplay));
3006       }
3007       if (previousRSName != null) {
3008         // one level deeper for the failed RS
3009         nodePath = ZKUtil.joinZNode(nodePath, previousRSName);
3010         ZKUtil.setData(zkw, nodePath,
3011           ZKUtil.regionSequenceIdsToByteArray(minSeqIdForLogReplay, maxSeqIdInStores));
3012         LOG.debug("Update last flushed sequence id of region " + region.getEncodedName() + " for "
3013             + previousRSName);
3014       } else {
3015         LOG.warn("Can't find failed region server for recovering region " +
3016           region.getEncodedName());
3017       }
3018     } catch (NoNodeException ignore) {
3019       LOG.debug("Region " + region.getEncodedName() +
3020         " must have completed recovery because its recovery znode has been removed", ignore);
3021     }
3022   }
3023 
3024   /**
3025    * Return the last failed RS name under /hbase/recovering-regions/encodedRegionName, or null if none
3026    * @param encodedRegionName the encoded name of the recovering region
3027    * @throws KeeperException
3028    */
3029   private String getLastFailedRSFromZK(String encodedRegionName) throws KeeperException {
3030     String result = null;
3031     long maxZxid = 0;
3032     ZooKeeperWatcher zkw = this.getZooKeeper();
3033     String nodePath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, encodedRegionName);
3034     List<String> failedServers = ZKUtil.listChildrenNoWatch(zkw, nodePath);
3035     if (failedServers == null || failedServers.isEmpty()) {
3036       return result;
3037     }
3038     for (String failedServer : failedServers) {
3039       String rsPath = ZKUtil.joinZNode(nodePath, failedServer);
3040       Stat stat = new Stat();
3041       ZKUtil.getDataNoWatch(zkw, rsPath, stat);
3042       if (maxZxid < stat.getCzxid()) {
3043         maxZxid = stat.getCzxid();
3044         result = failedServer;
3045       }
3046     }
3047     return result;
3048   }
3049 
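       /**
        * Executes a coprocessor Service method that has been registered at the region server
        * (rather than region) level. Looks up the named service among the registered handlers,
        * invokes the requested method with the deserialized request, and wraps the result in a
        * CoprocessorServiceResponse carrying an empty region specifier.
        * @throws ServiceException if the service or method is unknown, or the call itself fails
        */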
3050   public CoprocessorServiceResponse execRegionServerService(final RpcController controller,
3051       final CoprocessorServiceRequest serviceRequest) throws ServiceException {
3052     try {
3053       ServerRpcController execController = new ServerRpcController();
3054       CoprocessorServiceCall call = serviceRequest.getCall();
3055       String serviceName = call.getServiceName();
3056       String methodName = call.getMethodName();
3057       if (!coprocessorServiceHandlers.containsKey(serviceName)) {
3058         throw new UnknownProtocolException(null,
3059             "No registered coprocessor service found for name " + serviceName);
3060       }
3061       Service service = coprocessorServiceHandlers.get(serviceName);
3062       Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
3063       Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
3064       if (methodDesc == null) {
3065         throw new UnknownProtocolException(service.getClass(), "Unknown method " + methodName
3066             + " called on service " + serviceName);
3067       }
3068       Message request =
3069           service.getRequestPrototype(methodDesc).newBuilderForType().mergeFrom(call.getRequest())
3070               .build();
3071       final Message.Builder responseBuilder =
3072           service.getResponsePrototype(methodDesc).newBuilderForType();
3073       service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
3074         @Override
3075         public void run(Message message) {
3076           if (message != null) {
3077             responseBuilder.mergeFrom(message);
3078           }
3079         }
3080       });
3081       Message execResult = responseBuilder.build();
3082       if (execController.getFailedOn() != null) {
3083         throw execController.getFailedOn();
3084       }
3085       ClientProtos.CoprocessorServiceResponse.Builder builder =
3086           ClientProtos.CoprocessorServiceResponse.newBuilder();
3087       builder.setRegion(RequestConverter.buildRegionSpecifier(RegionSpecifierType.REGION_NAME,
3088         HConstants.EMPTY_BYTE_ARRAY));
3089       builder.setValue(builder.getValueBuilder().setName(execResult.getClass().getName())
3090           .setValue(execResult.toByteString()));
3091       return builder.build();
3092     } catch (IOException ie) {
3093       throw new ServiceException(ie);
3094     }
3095   }
3096 
3097   /**
3098    * @return The cache config instance used by the regionserver.
3099    */
3100   public CacheConfig getCacheConfig() {
3101     return this.cacheConfig;
3102   }
3103 
3104   /**
3105    * @return the ConfigurationManager object, exposed for testing purposes.
3106    */
3107   protected ConfigurationManager getConfigurationManager() {
3108     return configurationManager;
3109   }
3110 
3111   /**
3112    * Reload the configuration from disk.
3113    */
3114   public void updateConfiguration() {
3115     LOG.info("Reloading the configuration from disk.");
3116     // Reload the configuration from disk.
3117     conf.reloadConfiguration();
3118     configurationManager.notifyAllObservers(conf);
3119   }
3120 
3121   @Override
3122   public HeapMemoryManager getHeapMemoryManager() {
3123     return hMemManager;
3124   }
3125 }