1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.IOException;
22  import java.io.InterruptedIOException;
23  import java.lang.Thread.UncaughtExceptionHandler;
24  import java.lang.management.ManagementFactory;
25  import java.lang.management.MemoryUsage;
26  import java.lang.reflect.Constructor;
27  import java.net.BindException;
28  import java.net.InetAddress;
29  import java.net.InetSocketAddress;
30  import java.util.ArrayList;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.Comparator;
34  import java.util.HashMap;
35  import java.util.HashSet;
36  import java.util.Iterator;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Map.Entry;
40  import java.util.Set;
41  import java.util.SortedMap;
42  import java.util.TreeMap;
43  import java.util.TreeSet;
44  import java.util.concurrent.ConcurrentHashMap;
45  import java.util.concurrent.ConcurrentMap;
46  import java.util.concurrent.ConcurrentSkipListMap;
47  import java.util.concurrent.atomic.AtomicBoolean;
48  import java.util.concurrent.atomic.AtomicReference;
49  import java.util.concurrent.locks.ReentrantReadWriteLock;
50  
51  import javax.management.MalformedObjectNameException;
52  import javax.management.ObjectName;
53  import javax.servlet.http.HttpServlet;
54  
55  import org.apache.commons.lang.math.RandomUtils;
56  import org.apache.commons.logging.Log;
57  import org.apache.commons.logging.LogFactory;
58  import org.apache.hadoop.conf.Configuration;
59  import org.apache.hadoop.fs.FileSystem;
60  import org.apache.hadoop.fs.Path;
61  import org.apache.hadoop.hbase.Chore;
62  import org.apache.hadoop.hbase.ClockOutOfSyncException;
63  import org.apache.hadoop.hbase.CoordinatedStateManager;
64  import org.apache.hadoop.hbase.CoordinatedStateManagerFactory;
65  import org.apache.hadoop.hbase.HBaseConfiguration;
66  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
67  import org.apache.hadoop.hbase.HConstants;
68  import org.apache.hadoop.hbase.HRegionInfo;
69  import org.apache.hadoop.hbase.HealthCheckChore;
70  import org.apache.hadoop.hbase.MetaTableAccessor;
71  import org.apache.hadoop.hbase.NotServingRegionException;
72  import org.apache.hadoop.hbase.RemoteExceptionHandler;
73  import org.apache.hadoop.hbase.ServerName;
74  import org.apache.hadoop.hbase.Stoppable;
75  import org.apache.hadoop.hbase.TableDescriptors;
76  import org.apache.hadoop.hbase.TableName;
77  import org.apache.hadoop.hbase.YouAreDeadException;
78  import org.apache.hadoop.hbase.ZNodeClearer;
79  import org.apache.hadoop.hbase.classification.InterfaceAudience;
80  import org.apache.hadoop.hbase.client.ClusterConnection;
81  import org.apache.hadoop.hbase.client.ConnectionFactory;
82  import org.apache.hadoop.hbase.client.ConnectionUtils;
83  import org.apache.hadoop.hbase.conf.ConfigurationManager;
84  import org.apache.hadoop.hbase.coordination.BaseCoordinatedStateManager;
85  import org.apache.hadoop.hbase.coordination.CloseRegionCoordination;
86  import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
87  import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
88  import org.apache.hadoop.hbase.exceptions.RegionMovedException;
89  import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
90  import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
91  import org.apache.hadoop.hbase.executor.ExecutorService;
92  import org.apache.hadoop.hbase.executor.ExecutorType;
93  import org.apache.hadoop.hbase.fs.HFileSystem;
94  import org.apache.hadoop.hbase.http.InfoServer;
95  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
96  import org.apache.hadoop.hbase.ipc.RpcClient;
97  import org.apache.hadoop.hbase.ipc.RpcClientFactory;
98  import org.apache.hadoop.hbase.ipc.RpcServerInterface;
99  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
100 import org.apache.hadoop.hbase.ipc.ServerRpcController;
101 import org.apache.hadoop.hbase.master.HMaster;
102 import org.apache.hadoop.hbase.master.RegionState.State;
103 import org.apache.hadoop.hbase.master.TableLockManager;
104 import org.apache.hadoop.hbase.procedure.RegionServerProcedureManagerHost;
105 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
106 import org.apache.hadoop.hbase.protobuf.RequestConverter;
107 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
108 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
109 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceRequest;
110 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceResponse;
111 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos;
112 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
113 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.Coprocessor;
114 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
115 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionServerInfo;
116 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier;
117 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionSpecifier.RegionSpecifierType;
118 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
119 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
120 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest;
121 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
122 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStatusService;
123 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
124 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
125 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRSFatalErrorRequest;
126 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
127 import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
128 import org.apache.hadoop.hbase.regionserver.compactions.CompactionProgress;
129 import org.apache.hadoop.hbase.regionserver.handler.CloseMetaHandler;
130 import org.apache.hadoop.hbase.regionserver.handler.CloseRegionHandler;
131 import org.apache.hadoop.hbase.wal.DefaultWALProvider;
132 import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL;
133 import org.apache.hadoop.hbase.wal.WAL;
134 import org.apache.hadoop.hbase.wal.WALFactory;
135 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
136 import org.apache.hadoop.hbase.replication.regionserver.ReplicationLoad;
137 import org.apache.hadoop.hbase.security.UserProvider;
138 import org.apache.hadoop.hbase.trace.SpanReceiverHost;
139 import org.apache.hadoop.hbase.util.Addressing;
140 import org.apache.hadoop.hbase.util.ByteStringer;
141 import org.apache.hadoop.hbase.util.Bytes;
142 import org.apache.hadoop.hbase.util.CompressionTest;
143 import org.apache.hadoop.hbase.util.ConfigUtil;
144 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
145 import org.apache.hadoop.hbase.util.FSTableDescriptors;
146 import org.apache.hadoop.hbase.util.FSUtils;
147 import org.apache.hadoop.hbase.util.HasThread;
148 import org.apache.hadoop.hbase.util.JSONBean;
149 import org.apache.hadoop.hbase.util.JvmPauseMonitor;
150 import org.apache.hadoop.hbase.util.Sleeper;
151 import org.apache.hadoop.hbase.util.Threads;
152 import org.apache.hadoop.hbase.util.VersionInfo;
153 import org.apache.hadoop.hbase.zookeeper.ClusterStatusTracker;
154 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
155 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
156 import org.apache.hadoop.hbase.zookeeper.RecoveringRegionWatcher;
157 import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
158 import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
159 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
160 import org.apache.hadoop.hbase.zookeeper.ZooKeeperNodeTracker;
161 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
162 import org.apache.hadoop.ipc.RemoteException;
163 import org.apache.hadoop.metrics.util.MBeanUtil;
164 import org.apache.hadoop.util.ReflectionUtils;
165 import org.apache.hadoop.util.StringUtils;
166 import org.apache.zookeeper.KeeperException;
167 import org.apache.zookeeper.KeeperException.NoNodeException;
168 import org.apache.zookeeper.data.Stat;
169 
170 import com.google.common.annotations.VisibleForTesting;
171 import com.google.common.collect.Maps;
172 import com.google.protobuf.BlockingRpcChannel;
173 import com.google.protobuf.Descriptors;
174 import com.google.protobuf.Message;
175 import com.google.protobuf.RpcCallback;
176 import com.google.protobuf.RpcController;
177 import com.google.protobuf.Service;
178 import com.google.protobuf.ServiceException;
179 
180 /**
181  * HRegionServer makes a set of HRegions available to clients. It checks in with
182  * the HMaster. There are many HRegionServers in a single HBase deployment.
183  */
184 @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
185 @SuppressWarnings("deprecation")
186 public class HRegionServer extends HasThread implements
187     RegionServerServices, LastSequenceId {
188 
189   public static final Log LOG = LogFactory.getLog(HRegionServer.class);
190 
191   /*
192    * Strings to be used in forming the exception message for
193    * RegionsAlreadyInTransitionException.
194    */
195   protected static final String OPEN = "OPEN";
196   protected static final String CLOSE = "CLOSE";
197 
198   // RegionName vs current action in progress
199   // true - if open region action in progress
200   // false - if close region action in progress
201   protected final ConcurrentMap<byte[], Boolean> regionsInTransitionInRS =
202     new ConcurrentSkipListMap<byte[], Boolean>(Bytes.BYTES_COMPARATOR);
203 
204   // Cache flushing
205   protected MemStoreFlusher cacheFlusher;
206 
207   protected HeapMemoryManager hMemManager;
208 
209   /**
210    * Cluster connection to be shared by services.
211    * Initialized at server startup and closed when server shuts down.
212    * Clients must never close it explicitly.
213    */
214   protected ClusterConnection clusterConnection;
215 
216   /*
217    * Long-lived meta table locator, created when the server starts and stopped when the
218    * server shuts down. References to this locator should be used to perform the
219    * corresponding operations in EventHandlers. The primary reason for this design is to
220    * make it mockable for tests.
221    */
222   protected MetaTableLocator metaTableLocator;
223 
224   // Watch if a region is out of recovering state from ZooKeeper
225   @SuppressWarnings("unused")
226   private RecoveringRegionWatcher recoveringRegionWatcher;
227 
228   /**
229    * Go here to get table descriptors.
230    */
231   protected TableDescriptors tableDescriptors;
232 
233   // Replication services. If no replication, this handler will be null.
234   protected ReplicationSourceService replicationSourceHandler;
235   protected ReplicationSinkService replicationSinkHandler;
236 
237   // Compactions
238   public CompactSplitThread compactSplitThread;
239 
240   /**
241    * Map of regions currently being served by this region server. Key is the
242    * encoded region name.  All access should be synchronized.
243    */
244   protected final Map<String, HRegion> onlineRegions =
245     new ConcurrentHashMap<String, HRegion>();
246 
247   /**
248    * Map of encoded region names to the DataNode locations they should be hosted on.
249    * We store the value as InetSocketAddress since this is used only in HDFS
250    * API (create() that takes favored nodes as hints for placing file blocks).
251    * We could have used ServerName here as the value class, but we'd need to
252    * convert it to InetSocketAddress at some point before the HDFS API call, and
253    * it seems a bit weird to store ServerName since ServerName refers to RegionServers
254    * and here we really mean DataNode locations.
255    */
256   protected final Map<String, InetSocketAddress[]> regionFavoredNodesMap =
257       new ConcurrentHashMap<String, InetSocketAddress[]>();
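
  // Illustrative only (not in the original source): had ServerName been stored instead, each
  // favored-nodes hint would still need a conversion along the lines of
  //
  //   InetSocketAddress addr = new InetSocketAddress(sn.getHostname(), sn.getPort());
  //
  // before the HDFS create() call, which is why InetSocketAddress is stored here directly.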
258 
259   /**
260    * Set of regions currently in recovering state, which means they can accept writes (edits from
261    * a previously failed region server) but not reads. A recovering region is also an online region.
262    */
263   protected final Map<String, HRegion> recoveringRegions = Collections
264       .synchronizedMap(new HashMap<String, HRegion>());
265 
266   // Leases
267   protected Leases leases;
268 
269   // Instance of the hbase executor service.
270   protected ExecutorService service;
271 
272   // If false, the file system has become unavailable
273   protected volatile boolean fsOk;
274   protected HFileSystem fs;
275 
276   // Set when a report to the master comes back with a message asking us to
277   // shutdown. Also set by call to stop when debugging or running unit tests
278   // of HRegionServer in isolation.
279   private volatile boolean stopped = false;
280 
281   // Go down hard. Used if file system becomes unavailable and also in
282   // debugging and unit tests.
283   private volatile boolean abortRequested;
284 
285   ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<String, Integer>();
286 
287   // A state before we go into stopped state.  At this stage we're closing user
288   // space regions.
289   private boolean stopping = false;
290 
291   private volatile boolean killed = false;
292 
293   protected final Configuration conf;
294 
295   private Path rootDir;
296 
297   protected final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
298 
299   final int numRetries;
300   protected final int threadWakeFrequency;
301   protected final int msgInterval;
302 
303   protected final int numRegionsToReport;
304 
305   // Stub to do region server status calls against the master.
306   private volatile RegionServerStatusService.BlockingInterface rssStub;
307   // RPC client. Used to make the stub above that does region server status checking.
308   RpcClient rpcClient;
309 
310   private UncaughtExceptionHandler uncaughtExceptionHandler;
311 
312   // Info server. Default access so can be used by unit tests. REGIONSERVER
313   // is the name of the webapp and the attribute name used when stuffing this instance
314   // into the web context.
315   protected InfoServer infoServer;
316   private JvmPauseMonitor pauseMonitor;
317 
318   /** region server process name */
319   public static final String REGIONSERVER = "regionserver";
320 
321   MetricsRegionServer metricsRegionServer;
322   private SpanReceiverHost spanReceiverHost;
323 
324   /*
325    * Check for compaction requests.
326    */
327   Chore compactionChecker;
328 
329   /*
330    * Check for flushes
331    */
332   Chore periodicFlusher;
333 
334   protected volatile WALFactory walFactory;
335 
336   // WAL roller. log is protected rather than private to avoid
337   // eclipse warning when accessed by inner classes
338   final LogRoller walRoller;
339   // Lazily initialized if this RegionServer hosts a meta table.
340   final AtomicReference<LogRoller> metawalRoller = new AtomicReference<LogRoller>();
341 
342   // flag set after we're done setting up server threads
343   final AtomicBoolean online = new AtomicBoolean(false);
344 
345   // zookeeper connection and watcher
346   protected ZooKeeperWatcher zooKeeper;
347 
348   // master address tracker
349   private MasterAddressTracker masterAddressTracker;
350 
351   // Cluster Status Tracker
352   protected ClusterStatusTracker clusterStatusTracker;
353 
354   // Log Splitting Worker
355   private SplitLogWorker splitLogWorker;
356 
357   // A sleeper that sleeps for msgInterval.
358   protected final Sleeper sleeper;
359 
360   private final int operationTimeout;
361 
362   private final RegionServerAccounting regionServerAccounting;
363 
364   // Cache configuration and block cache reference
365   protected CacheConfig cacheConfig;
366 
367   /** The health check chore. */
368   private HealthCheckChore healthCheckChore;
369 
370   /** The nonce manager chore. */
371   private Chore nonceManagerChore;
372 
373   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
374 
375   /**
376    * The server name the Master sees us as.  It is made from the hostname the
377    * master passes us, the port, and the server startcode. It gets set after registration
378    * against the Master.
379    */
380   protected ServerName serverName;
381 
382   /**
383    * This server's startcode.
384    */
385   protected final long startcode;
386 
387   /**
388    * Unique identifier for the cluster we are a part of.
389    */
390   private String clusterId;
391 
392   /**
393    * MX Bean for RegionServerInfo
394    */
395   private ObjectName mxBean = null;
396 
397   /**
398    * Chore to periodically clean the moved region list.
399    */
400   private MovedRegionsCleaner movedRegionsCleaner;
401 
402   // chore for refreshing store files for secondary regions
403   private StorefileRefresherChore storefileRefresher;
404 
405   private RegionServerCoprocessorHost rsHost;
406 
407   private RegionServerProcedureManagerHost rspmHost;
408 
409   // Table-level lock manager for region operations
410   protected TableLockManager tableLockManager;
411 
412   /**
413    * Nonce manager. Nonces are used to make operations like increment and append idempotent
414    * in the case where client doesn't receive the response from a successful operation and
415    * retries. We track the successful ops for some time via a nonce sent by client and handle
416    * duplicate operations (currently, by failing them; in future we might use MVCC to return
417    * result). Nonces are also recovered from the WAL during recovery; however, the caveats (from
418    * HBASE-3787) are:
419    * - WAL recovery is optimized, and under high load we won't read nearly nonce-timeout worth
420    *   of past records. If we don't read the records, we don't read and recover the nonces.
421    *   Some WALs within nonce-timeout at recovery may not even be present due to rolling/cleanup.
422    * - There's no WAL recovery during a normal region move, so nonces will not be transferred.
423    * We could add a separate, additional "nonce WAL". It will just contain a bunch of numbers and
424    * won't be flushed on main path - because WAL itself also contains nonces, if we only flush
425    * it before memstore flush, for a given nonce we will either see it in the WAL (if it was
426    * never flushed to disk, it will be part of recovery), or we'll see it as part of the nonce
427    * log (or both occasionally, which doesn't matter). Nonce log file can be deleted after the
428    * latest nonce in it expired. It can also be recovered during move.
429    */
430   final ServerNonceManager nonceManager;
431 
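  // A hedged illustration (not part of this class) of how the nonce mechanism above is used on
  // the write path. startOperation/endOperation are ServerNonceManager methods; the surrounding
  // flow is a sketch, not the actual increment/append handling code:
  //
  //   if (nonceManager == null || nonceManager.startOperation(nonceGroup, nonce, this)) {
  //     try {
  //       // first time this (nonceGroup, nonce) pair is seen: apply the increment/append
  //     } finally {
  //       if (nonceManager != null) nonceManager.endOperation(nonceGroup, nonce, success);
  //     }
  //   } else {
  //     // a client retry of an already-applied operation: do not re-apply it
  //   }
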
432   private UserProvider userProvider;
433 
434   protected final RSRpcServices rpcServices;
435 
436   protected BaseCoordinatedStateManager csm;
437 
438   private final boolean useZKForAssignment;
439 
440   /**
441    * The configuration manager is used to register/deregister configuration observers and to
442    * notify them when the regionserver learns that the on-disk configuration has changed.
443    */
444   protected final ConfigurationManager configurationManager;
445 
446   /**
447    * Starts an HRegionServer at the default location.
448    * @param conf
449    * @throws IOException
450    * @throws InterruptedException
451    */
452   public HRegionServer(Configuration conf) throws IOException, InterruptedException {
453     this(conf, CoordinatedStateManagerFactory.getCoordinatedStateManager(conf));
454   }
455 
456   /**
457    * Starts an HRegionServer at the default location.
458    * @param conf
459    * @param csm implementation of CoordinatedStateManager to be used
460    * @throws IOException
461    * @throws InterruptedException
462    */
463   public HRegionServer(Configuration conf, CoordinatedStateManager csm)
464       throws IOException, InterruptedException {
465     this.fsOk = true;
466     this.conf = conf;
467     checkCodecs(this.conf);
468     this.userProvider = UserProvider.instantiate(conf);
469     FSUtils.setupShortCircuitRead(this.conf);
470 
471     // Config'ed params
472     this.numRetries = this.conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
473         HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
474     this.threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
475     this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
476 
477     this.sleeper = new Sleeper(this.msgInterval, this);
478 
479     boolean isNoncesEnabled = conf.getBoolean(HConstants.HBASE_RS_NONCES_ENABLED, true);
480     this.nonceManager = isNoncesEnabled ? new ServerNonceManager(this.conf) : null;
481 
482     this.numRegionsToReport = conf.getInt(
483       "hbase.regionserver.numregionstoreport", 10);
484 
485     this.operationTimeout = conf.getInt(
486       HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY,
487       HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT);
488 
489     this.abortRequested = false;
490     this.stopped = false;
491 
492     rpcServices = createRpcServices();
493     this.startcode = System.currentTimeMillis();
494     String hostName = rpcServices.isa.getHostName();
495     serverName = ServerName.valueOf(hostName, rpcServices.isa.getPort(), startcode);
496 
497     // login the zookeeper client principal (if using security)
498     ZKUtil.loginClient(this.conf, "hbase.zookeeper.client.keytab.file",
499       "hbase.zookeeper.client.kerberos.principal", hostName);
500     // login the server principal (if using secure Hadoop)
501     login(userProvider, hostName);
502 
503     regionServerAccounting = new RegionServerAccounting();
504     uncaughtExceptionHandler = new UncaughtExceptionHandler() {
505       @Override
506       public void uncaughtException(Thread t, Throwable e) {
507         abort("Uncaught exception in service thread " + t.getName(), e);
508       }
509     };
510 
511     useZKForAssignment = ConfigUtil.useZKForAssignment(conf);
512 
513     // Set 'fs.defaultFS' to match the filesystem on hbase.rootdir else
514     // underlying hadoop hdfs accessors will be going against wrong filesystem
515     // (unless all is set to defaults).
516     FSUtils.setFsDefault(this.conf, FSUtils.getRootDir(this.conf));
517     // Get fs instance used by this RS.  Do we use checksum verification in HBase?  If HBase
518     // checksum verification is enabled, then automatically switch off hdfs checksum verification.
519     boolean useHBaseChecksum = conf.getBoolean(HConstants.HBASE_CHECKSUM_VERIFICATION, true);
520     this.fs = new HFileSystem(this.conf, useHBaseChecksum);
521     this.rootDir = FSUtils.getRootDir(this.conf);
522     this.tableDescriptors = new FSTableDescriptors(
523       this.conf, this.fs, this.rootDir, !canUpdateTableDescriptor(), false);
524 
525     service = new ExecutorService(getServerName().toShortString());
526     spanReceiverHost = SpanReceiverHost.getInstance(getConfiguration());
527 
528     // Some unit tests don't need a cluster, so no zookeeper at all
529     if (!conf.getBoolean("hbase.testing.nocluster", false)) {
530       // Open connection to zookeeper and set primary watcher
531       zooKeeper = new ZooKeeperWatcher(conf, getProcessName() + ":" +
532         rpcServices.isa.getPort(), this, canCreateBaseZNode());
533 
534       this.csm = (BaseCoordinatedStateManager) csm;
535       this.csm.initialize(this);
536       this.csm.start();
537 
538       tableLockManager = TableLockManager.createTableLockManager(
539         conf, zooKeeper, serverName);
540 
541       masterAddressTracker = new MasterAddressTracker(getZooKeeper(), this);
542       masterAddressTracker.start();
543 
544       clusterStatusTracker = new ClusterStatusTracker(zooKeeper, this);
545       clusterStatusTracker.start();
546     }
547     this.configurationManager = new ConfigurationManager();
548 
549     rpcServices.start();
550     putUpWebUI();
551     this.walRoller = new LogRoller(this, this);
552   }
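
  // A minimal, hedged usage sketch of the constructors above. A real deployment launches the
  // region server through the "hbase regionserver start" script rather than constructing it
  // directly; this only shows the Configuration/CoordinatedStateManager wiring:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   CoordinatedStateManager csm = CoordinatedStateManagerFactory.getCoordinatedStateManager(conf);
  //   HRegionServer rs = new HRegionServer(conf, csm);
  //   rs.start();  // HasThread.start(): runs the main run() loop in its own thread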
553 
554   protected void login(UserProvider user, String host) throws IOException {
555     user.login("hbase.regionserver.keytab.file",
556       "hbase.regionserver.kerberos.principal", host);
557   }
558 
559   protected void waitForMasterActive(){
560   }
561 
562   protected String getProcessName() {
563     return REGIONSERVER;
564   }
565 
566   protected boolean canCreateBaseZNode() {
567     return false;
568   }
569 
570   protected boolean canUpdateTableDescriptor() {
571     return false;
572   }
573 
574   protected RSRpcServices createRpcServices() throws IOException {
575     return new RSRpcServices(this);
576   }
577 
578   protected void configureInfoServer() {
579     infoServer.addServlet("rs-status", "/rs-status", RSStatusServlet.class);
580     infoServer.setAttribute(REGIONSERVER, this);
581   }
582 
583   protected Class<? extends HttpServlet> getDumpServlet() {
584     return RSDumpServlet.class;
585   }
586 
587   protected void doMetrics() {
588   }
589 
590   @Override
591   public boolean registerService(Service instance) {
592     /*
593      * No stacking of instances is allowed for a single service name
594      */
595     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
596     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
597       LOG.error("Coprocessor service " + serviceDesc.getFullName()
598           + " already registered, rejecting request from " + instance);
599       return false;
600     }
601 
602     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
603     if (LOG.isDebugEnabled()) {
604       LOG.debug("Registered regionserver coprocessor service: service=" + serviceDesc.getFullName());
605     }
606     return true;
607   }
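
  // A hedged sketch of calling registerService() from a region-server-scoped coprocessor.
  // MyServiceProtos (a protobuf-generated class) and the regionServerServices reference are
  // hypothetical, not part of HBase:
  //
  //   Service endpoint = new MyServiceProtos.MyService() {
  //     @Override
  //     public void doSomething(RpcController controller, MyServiceProtos.Request request,
  //         RpcCallback<MyServiceProtos.Response> done) {
  //       done.run(MyServiceProtos.Response.getDefaultInstance());
  //     }
  //   };
  //   boolean registered = regionServerServices.registerService(endpoint);
  //   // a second call with a service of the same fully-qualified name returns false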
608 
609   /**
610    * Create a 'smarter' HConnection, one that is capable of bypassing RPC if the request is to
611    * the local server.  Safe to use going to a local or remote server.
612    * Creating this instance in a method allows it to be intercepted and mocked in tests.
613    * @throws IOException
614    */
615   @VisibleForTesting
616   protected ClusterConnection createClusterConnection() throws IOException {
617     // Create a cluster connection that when appropriate, can short-circuit and go directly to the
618     // local server if the request is to the local server bypassing RPC. Can be used for both local
619     // and remote invocations.
620     return ConnectionUtils.createShortCircuitHConnection(
621       ConnectionFactory.createConnection(conf), serverName, rpcServices, rpcServices);
622   }
623 
624   /**
625    * Run test on configured codecs to make sure supporting libs are in place.
626    * @param c
627    * @throws IOException
628    */
629   private static void checkCodecs(final Configuration c) throws IOException {
630     // check to see if the codec list is available:
631     String [] codecs = c.getStrings("hbase.regionserver.codecs", (String[])null);
632     if (codecs == null) return;
633     for (String codec : codecs) {
634       if (!CompressionTest.testCompression(codec)) {
635         throw new IOException("Compression codec " + codec +
636           " not supported, aborting RS construction");
637       }
638     }
639   }
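
  // A hedged example of the configuration the check above consumes. The codec names are
  // illustrative; every codec listed under "hbase.regionserver.codecs" must pass
  // CompressionTest, otherwise region server construction fails fast with an IOException:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setStrings("hbase.regionserver.codecs", "snappy", "lzo");
  //   // new HRegionServer(conf) now throws if the snappy/lzo native libraries are missing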
640 
641   public String getClusterId() {
642     return this.clusterId;
643   }
644 
645   /**
646    * Set up our cluster connection if not already initialized.
647    * @throws IOException
648    */
649   protected synchronized void setupClusterConnection() throws IOException {
650     if (clusterConnection == null) {
651       clusterConnection = createClusterConnection();
652       metaTableLocator = new MetaTableLocator();
653     }
654   }
655 
656   /**
657    * All initialization needed before we register with the Master.
658    *
659    * @throws IOException
660    * @throws InterruptedException
661    */
662   private void preRegistrationInitialization(){
663     try {
664       setupClusterConnection();
665 
666       // Health checker thread.
667       if (isHealthCheckerConfigured()) {
668         int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
669           HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
670         healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
671       }
672       this.pauseMonitor = new JvmPauseMonitor(conf);
673       pauseMonitor.start();
674 
675       initializeZooKeeper();
676       if (!isStopped() && !isAborted()) {
677         initializeThreads();
678       }
679     } catch (Throwable t) {
680       // Call stop on error, or the process will stick around forever since the server
681       // puts up non-daemon threads.
682       this.rpcServices.stop();
683       abort("Initialization of RS failed.  Hence aborting RS.", t);
684     }
685   }
686 
687   /**
688    * Bring up the connection to the zk ensemble, then wait until a master is available for this
689    * cluster, and after that wait until the cluster 'up' flag has been set.
690    * This is the order in which the master does things.
691    * Finally, open the long-living server short-circuit connection.
692    * @throws IOException
693    * @throws InterruptedException
694    */
695   private void initializeZooKeeper() throws IOException, InterruptedException {
696     // Create the master address tracker, register with zk, and start it.  Then
697     // block until a master is available.  No point in starting up if no master
698     // running.
699     blockAndCheckIfStopped(this.masterAddressTracker);
700 
701     // Wait on cluster being up.  Master will set this flag up in zookeeper
702     // when ready.
703     blockAndCheckIfStopped(this.clusterStatusTracker);
704 
705     // Retrieve clusterId
706     // Since cluster status is now up
707     // ID should have already been set by HMaster
708     try {
709       clusterId = ZKClusterId.readClusterIdZNode(this.zooKeeper);
710       if (clusterId == null) {
711         this.abort("Cluster ID has not been set");
712       }
713       LOG.info("ClusterId : "+clusterId);
714     } catch (KeeperException e) {
715       this.abort("Failed to retrieve Cluster ID",e);
716     }
717 
718     // In case colocated master, wait here till it's active.
719     // So backup masters won't start as regionservers.
720     // This is to avoid showing backup masters as regionservers
721     // in master web UI, or assigning any region to them.
722     waitForMasterActive();
723     if (isStopped() || isAborted()) {
724       return; // No need for further initialization
725     }
726 
727     // watch for snapshots and other procedures
728     try {
729       rspmHost = new RegionServerProcedureManagerHost();
730       rspmHost.loadProcedures(conf);
731       rspmHost.initialize(this);
732     } catch (KeeperException e) {
733       this.abort("Failed to reach zk cluster when creating procedure handler.", e);
734     }
735     // register watcher for recovering regions
736     this.recoveringRegionWatcher = new RecoveringRegionWatcher(this.zooKeeper, this);
737   }
738 
739   /**
740    * Utility method to wait indefinitely for a znode to become available while checking
741    * whether the region server is shut down.
742    * @param tracker znode tracker to use
743    * @throws IOException any IO exception, plus if the RS is stopped
744    * @throws InterruptedException
745    */
746   private void blockAndCheckIfStopped(ZooKeeperNodeTracker tracker)
747       throws IOException, InterruptedException {
748     while (tracker.blockUntilAvailable(this.msgInterval, false) == null) {
749       if (this.stopped) {
750         throw new IOException("Received the shutdown message while waiting.");
751       }
752     }
753   }
754 
755   /**
756    * @return false if a cluster shutdown is in progress
757    */
758   private boolean isClusterUp() {
759     return clusterStatusTracker != null && clusterStatusTracker.isClusterUp();
760   }
761 
762   private void initializeThreads() throws IOException {
763     // Cache flushing thread.
764     this.cacheFlusher = new MemStoreFlusher(conf, this);
765 
766     // Compaction thread
767     this.compactSplitThread = new CompactSplitThread(this);
768 
769     // Background thread to check for compactions; needed if region has not gotten updates
770     // in a while. It will take care of not checking too frequently on a store-by-store basis.
771     this.compactionChecker = new CompactionChecker(this, this.threadWakeFrequency, this);
772     this.periodicFlusher = new PeriodicMemstoreFlusher(this.threadWakeFrequency, this);
773     this.leases = new Leases(this.threadWakeFrequency);
774 
775     // Create the thread to clean the moved regions list
776     movedRegionsCleaner = MovedRegionsCleaner.createAndStart(this);
777 
778     if (this.nonceManager != null) {
779       // Create the chore that cleans up nonces.
780       nonceManagerChore = this.nonceManager.createCleanupChore(this);
781     }
782 
783     // Setup RPC client for master communication
784     rpcClient = RpcClientFactory.createClient(conf, clusterId, new InetSocketAddress(
785         rpcServices.isa.getAddress(), 0));
786 
787     int storefileRefreshPeriod = conf.getInt(
788         StorefileRefresherChore.REGIONSERVER_STOREFILE_REFRESH_PERIOD
789       , StorefileRefresherChore.DEFAULT_REGIONSERVER_STOREFILE_REFRESH_PERIOD);
790     if (storefileRefreshPeriod > 0) {
791       this.storefileRefresher = new StorefileRefresherChore(storefileRefreshPeriod, this, this);
792     }
793     registerConfigurationObservers();
794   }
795 
796   private void registerConfigurationObservers() {
797     // Registering the compactSplitThread object with the ConfigurationManager.
798     configurationManager.registerObserver(this.compactSplitThread);
799   }
800 
801   /**
802    * The HRegionServer sticks in this loop until closed.
803    */
804   @Override
805   public void run() {
806     try {
807       // Do pre-registration initializations; zookeeper, lease threads, etc.
808       preRegistrationInitialization();
809     } catch (Throwable e) {
810       abort("Fatal exception during initialization", e);
811     }
812 
813     try {
814       if (!isStopped() && !isAborted()) {
815         ShutdownHook.install(conf, fs, this, Thread.currentThread());
816         // Set our ephemeral znode up in zookeeper now we have a name.
817         createMyEphemeralNode();
818         // Initialize the RegionServerCoprocessorHost now that our ephemeral
819         // node was created, in case any coprocessors want to use ZooKeeper
820         this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
821       }
822 
823       // Try and register with the Master; tell it we are here.  Break if
824       // server is stopped, the cluster-up flag is down, or hdfs went wacky.
825       while (keepLooping()) {
826         RegionServerStartupResponse w = reportForDuty();
827         if (w == null) {
828           LOG.warn("reportForDuty failed; sleeping and then retrying.");
829           this.sleeper.sleep();
830         } else {
831           handleReportForDutyResponse(w);
832           break;
833         }
834       }
835 
836       if (!isStopped() && isHealthy()){
837         // start the snapshot handler and other procedure handlers,
838         // since the server is ready to run
839         rspmHost.start();
840       }
841 
842       // We registered with the Master.  Go into run mode.
843       long lastMsg = System.currentTimeMillis();
844       long oldRequestCount = -1;
845       // The main run loop.
846       while (!isStopped() && isHealthy()) {
847         if (!isClusterUp()) {
848           if (isOnlineRegionsEmpty()) {
849             stop("Exiting; cluster shutdown set and not carrying any regions");
850           } else if (!this.stopping) {
851             this.stopping = true;
852             LOG.info("Closing user regions");
853             closeUserRegions(this.abortRequested);
854           } else if (this.stopping) {
855             boolean allUserRegionsOffline = areAllUserRegionsOffline();
856             if (allUserRegionsOffline) {
857               // Set stopped if no more write requests to meta tables
858               // since last time we went around the loop.  Any open
859               // meta regions will be closed on our way out.
860               if (oldRequestCount == getWriteRequestCount()) {
861                 stop("Stopped; only catalog regions remaining online");
862                 break;
863               }
864               oldRequestCount = getWriteRequestCount();
865             } else {
866               // Make sure all regions have been closed -- some regions may
867               // have not got it because we were splitting at the time of
868               // the call to closeUserRegions.
869               closeUserRegions(this.abortRequested);
870             }
871             LOG.debug("Waiting on " + getOnlineRegionsAsPrintableString());
872           }
873         }
874         long now = System.currentTimeMillis();
875         if ((now - lastMsg) >= msgInterval) {
876           tryRegionServerReport(lastMsg, now);
877           lastMsg = System.currentTimeMillis();
878           doMetrics();
879         }
880         if (!isStopped() && !isAborted()) {
881           this.sleeper.sleep();
882         }
883       } // while
884     } catch (Throwable t) {
885       if (!rpcServices.checkOOME(t)) {
886         String prefix = t instanceof YouAreDeadException? "": "Unhandled: ";
887         abort(prefix + t.getMessage(), t);
888       }
889     }
890     // Run shutdown.
891     if (mxBean != null) {
892       MBeanUtil.unregisterMBean(mxBean);
893       mxBean = null;
894     }
895     if (this.leases != null) this.leases.closeAfterLeasesExpire();
896     if (this.splitLogWorker != null) {
897       splitLogWorker.stop();
898     }
899     if (this.infoServer != null) {
900       LOG.info("Stopping infoServer");
901       try {
902         this.infoServer.stop();
903       } catch (Exception e) {
904         LOG.error("Failed to stop infoServer", e);
905       }
906     }
907     // Send cache a shutdown.
908     if (cacheConfig != null && cacheConfig.isBlockCacheEnabled()) {
909       cacheConfig.getBlockCache().shutdown();
910     }
911 
912     if (movedRegionsCleaner != null) {
913       movedRegionsCleaner.stop("Region Server stopping");
914     }
915 
916     // Send interrupts to wake up threads if sleeping so they notice shutdown.
917     // TODO: Should we check they are alive? If OOME could have exited already
918     if(this.hMemManager != null) this.hMemManager.stop();
919     if (this.cacheFlusher != null) this.cacheFlusher.interruptIfNecessary();
920     if (this.compactSplitThread != null) this.compactSplitThread.interruptIfNecessary();
921     if (this.compactionChecker != null)
922       this.compactionChecker.interrupt();
923     if (this.healthCheckChore != null) {
924       this.healthCheckChore.interrupt();
925     }
926     if (this.nonceManagerChore != null) {
927       this.nonceManagerChore.interrupt();
928     }
929     if (this.storefileRefresher != null) {
930       this.storefileRefresher.interrupt();
931     }
932 
933     // Stop the snapshot and other procedure handlers, forcefully killing all running tasks
934     if (rspmHost != null) {
935       rspmHost.stop(this.abortRequested || this.killed);
936     }
937 
938     if (this.killed) {
939       // Just skip out w/o closing regions.  Used when testing.
940     } else if (abortRequested) {
941       if (this.fsOk) {
942         closeUserRegions(abortRequested); // Don't leave any open file handles
943       }
944       LOG.info("aborting server " + this.serverName);
945     } else {
946       closeUserRegions(abortRequested);
947       LOG.info("stopping server " + this.serverName);
948     }
949 
950     // so callers waiting for meta without timeout can stop
951     if (this.metaTableLocator != null) this.metaTableLocator.stop();
952     if (this.clusterConnection != null && !clusterConnection.isClosed()) {
953       try {
954         this.clusterConnection.close();
955       } catch (IOException e) {
956         // Although the {@link Closeable} interface throws an {@link
957         // IOException}, in reality, the implementation would never do that.
958         LOG.warn("Attempt to close server's short circuit HConnection failed.", e);
959       }
960     }
961 
962     // Closing the compactSplit thread before closing meta regions
963     if (!this.killed && containsMetaTableRegions()) {
964       if (!abortRequested || this.fsOk) {
965         if (this.compactSplitThread != null) {
966           this.compactSplitThread.join();
967           this.compactSplitThread = null;
968         }
969         closeMetaTableRegions(abortRequested);
970       }
971     }
972 
973     if (!this.killed && this.fsOk) {
974       waitOnAllRegionsToClose(abortRequested);
975       LOG.info("stopping server " + this.serverName +
976         "; all regions closed.");
977     }
978 
979     // fsOk flag may be changed when closing regions throws an exception.
980     if (this.fsOk) {
981       shutdownWAL(!abortRequested);
982     }
983 
984     // Make sure the proxy is down.
985     if (this.rssStub != null) {
986       this.rssStub = null;
987     }
988     if (this.rpcClient != null) {
989       this.rpcClient.close();
990     }
991     if (this.leases != null) {
992       this.leases.close();
993     }
994     if (this.pauseMonitor != null) {
995       this.pauseMonitor.stop();
996     }
997 
998     if (!killed) {
999       stopServiceThreads();
1000     }
1001 
1002     if (this.rpcServices != null) {
1003       this.rpcServices.stop();
1004     }
1005 
1006     try {
1007       deleteMyEphemeralNode();
1008     } catch (KeeperException.NoNodeException nn) {
1009     } catch (KeeperException e) {
1010       LOG.warn("Failed deleting my ephemeral node", e);
1011     }
1012     // We may have failed to delete the znode at the previous step, but
1013     //  we delete the file anyway: a second attempt to delete the znode is likely to fail again.
1014     ZNodeClearer.deleteMyEphemeralNodeOnDisk();
1015 
1016     if (this.zooKeeper != null) {
1017       this.zooKeeper.close();
1018     }
1019     LOG.info("stopping server " + this.serverName +
1020       "; zookeeper connection closed.");
1021 
1022     LOG.info(Thread.currentThread().getName() + " exiting");
1023   }
1024 
1025   private boolean containsMetaTableRegions() {
1026     return onlineRegions.containsKey(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
1027   }
1028 
1029   private boolean areAllUserRegionsOffline() {
1030     if (getNumberOfOnlineRegions() > 2) return false;
1031     boolean allUserRegionsOffline = true;
1032     for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
1033       if (!e.getValue().getRegionInfo().isMetaTable()) {
1034         allUserRegionsOffline = false;
1035         break;
1036       }
1037     }
1038     return allUserRegionsOffline;
1039   }
1040 
1041   /**
1042    * @return Current write count for all online regions.
1043    */
1044   private long getWriteRequestCount() {
1045     long writeCount = 0; // long, not int, to avoid overflow when summing per-region counts
1046     for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
1047       writeCount += e.getValue().getWriteRequestsCount();
1048     }
1049     return writeCount;
1050   }
1051 
1052   @VisibleForTesting
1053   protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
1054   throws IOException {
1055     RegionServerStatusService.BlockingInterface rss = rssStub;
1056     if (rss == null) {
1057       // the current server could be stopping.
1058       return;
1059     }
1060     ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
1061     try {
1062       RegionServerReportRequest.Builder request = RegionServerReportRequest.newBuilder();
1063       ServerName sn = ServerName.parseVersionedServerName(
1064         this.serverName.getVersionedBytes());
1065       request.setServer(ProtobufUtil.toServerName(sn));
1066       request.setLoad(sl);
1067       rss.regionServerReport(null, request.build());
1068     } catch (ServiceException se) {
1069       IOException ioe = ProtobufUtil.getRemoteException(se);
1070       if (ioe instanceof YouAreDeadException) {
1071         // This will be caught and handled as a fatal error in run()
1072         throw ioe;
1073       }
1074       if (rssStub == rss) {
1075         rssStub = null;
1076       }
1077       // Couldn't connect to the master, get location from zk and reconnect
1078       // Method blocks until new master is found or we are stopped
1079       createRegionServerStatusStub();
1080     }
1081   }
1082 
1083   ClusterStatusProtos.ServerLoad buildServerLoad(long reportStartTime, long reportEndTime)
1084       throws IOException {
1085     // We're getting the MetricsRegionServerWrapper here because the wrapper computes requests
1086     // per second, and other metrics.  As long as metrics are part of ServerLoad it's best to use
1087     // the wrapper to compute those numbers in one place.
1088     // In the long term most of these should be moved off of ServerLoad and the heart beat.
1089     // Instead they should be stored in an HBase table so that external visibility into HBase is
1090     // improved; additionally, the load balancer will be able to take advantage of a more complete
1091     // history.
1092     MetricsRegionServerWrapper regionServerWrapper = this.metricsRegionServer.getRegionServerWrapper();
1093     Collection<HRegion> regions = getOnlineRegionsLocalContext();
1094     MemoryUsage memory =
1095       ManagementFactory.getMemoryMXBean().getHeapMemoryUsage();
1096 
1097     ClusterStatusProtos.ServerLoad.Builder serverLoad =
1098       ClusterStatusProtos.ServerLoad.newBuilder();
1099     serverLoad.setNumberOfRequests((int) regionServerWrapper.getRequestsPerSecond());
1100     serverLoad.setTotalNumberOfRequests((int) regionServerWrapper.getTotalRequestCount());
1101     serverLoad.setUsedHeapMB((int)(memory.getUsed() / 1024 / 1024));
1102     serverLoad.setMaxHeapMB((int) (memory.getMax() / 1024 / 1024));
1103     Set<String> coprocessors = getWAL(null).getCoprocessorHost().getCoprocessors();
1104     for (String coprocessor : coprocessors) {
1105       serverLoad.addCoprocessors(
1106         Coprocessor.newBuilder().setName(coprocessor).build());
1107     }
1108     RegionLoad.Builder regionLoadBldr = RegionLoad.newBuilder();
1109     RegionSpecifier.Builder regionSpecifier = RegionSpecifier.newBuilder();
1110     for (HRegion region : regions) {
1111       serverLoad.addRegionLoads(createRegionLoad(region, regionLoadBldr, regionSpecifier));
1112       for (String coprocessor :
1113           getWAL(region.getRegionInfo()).getCoprocessorHost().getCoprocessors()) {
1114         serverLoad.addCoprocessors(Coprocessor.newBuilder().setName(coprocessor).build());
1115       }
1116     }
1117     serverLoad.setReportStartTime(reportStartTime);
1118     serverLoad.setReportEndTime(reportEndTime);
1119     if (this.infoServer != null) {
1120       serverLoad.setInfoServerPort(this.infoServer.getPort());
1121     } else {
1122       serverLoad.setInfoServerPort(-1);
1123     }
1124 
1125     // For the replicationLoad purpose we only need to ask one service;
1126     // either source or sink will give the same info.
1127     ReplicationSourceService rsources = getReplicationSourceService();
1128 
1129     if (rsources != null) {
1130       // always refresh first to get the latest value
1131       ReplicationLoad rLoad = rsources.refreshAndGetReplicationLoad();
1132       if (rLoad != null) {
1133         serverLoad.setReplLoadSink(rLoad.getReplicationLoadSink());
1134         for (ClusterStatusProtos.ReplicationLoadSource rLS : rLoad.getReplicationLoadSourceList()) {
1135           serverLoad.addReplLoadSource(rLS);
1136         }
1137       }
1138     }
1139 
1140     return serverLoad.build();
1141   }
1142 
1143   String getOnlineRegionsAsPrintableString() {
1144     StringBuilder sb = new StringBuilder();
1145     for (HRegion r: this.onlineRegions.values()) {
1146       if (sb.length() > 0) sb.append(", ");
1147       sb.append(r.getRegionInfo().getEncodedName());
1148     }
1149     return sb.toString();
1150   }
1151 
1152   /**
1153    * Wait on regions close.
1154    */
1155   private void waitOnAllRegionsToClose(final boolean abort) {
1156     // Wait till all regions are closed before going out.
1157     int lastCount = -1;
1158     long previousLogTime = 0;
1159     Set<String> closedRegions = new HashSet<String>();
1160     boolean interrupted = false;
1161     try {
1162       while (!isOnlineRegionsEmpty()) {
1163         int count = getNumberOfOnlineRegions();
1164         // Only print a message if the count of regions has changed.
1165         if (count != lastCount) {
1166           // Log every second at most
1167           if (System.currentTimeMillis() > (previousLogTime + 1000)) {
1168             previousLogTime = System.currentTimeMillis();
1169             lastCount = count;
1170             LOG.info("Waiting on " + count + " regions to close");
1171             // Only print out the regions still closing if there is a small number, else we
1172             // will swamp the log.
1173             if (count < 10 && LOG.isDebugEnabled()) {
1174               LOG.debug(this.onlineRegions);
1175             }
1176           }
1177         }
1178         // Ensure all user regions have been sent a close. Use this to
1179         // protect against the case where an open comes in after we start the
1180         // iterator of onlineRegions to close all user regions.
1181         for (Map.Entry<String, HRegion> e : this.onlineRegions.entrySet()) {
1182           HRegionInfo hri = e.getValue().getRegionInfo();
1183           if (!this.regionsInTransitionInRS.containsKey(hri.getEncodedNameAsBytes())
1184               && !closedRegions.contains(hri.getEncodedName())) {
1185             closedRegions.add(hri.getEncodedName());
1186             // Don't update zk with this close transition; pass false.
1187             closeRegionIgnoreErrors(hri, abort);
1188           }
1189         }
1190         // No regions in RIT, we could stop waiting now.
1191         if (this.regionsInTransitionInRS.isEmpty()) {
1192           if (!isOnlineRegionsEmpty()) {
1193             LOG.info("We were exiting though online regions are not empty," +
1194                 " because some regions failed closing");
1195           }
1196           break;
1197         }
1198         if (sleep(200)) {
1199           interrupted = true;
1200         }
1201       }
1202     } finally {
1203       if (interrupted) {
1204         Thread.currentThread().interrupt();
1205       }
1206     }
1207   }
1208 
1209   private boolean sleep(long millis) {
1210     boolean interrupted = false;
1211     try {
1212       Thread.sleep(millis);
1213     } catch (InterruptedException e) {
1214       LOG.warn("Interrupted while sleeping");
1215       interrupted = true;
1216     }
1217     return interrupted;
1218   }
1219 
1220   private void shutdownWAL(final boolean close) {
1221     if (this.walFactory != null) {
1222       try {
1223         if (close) {
1224           walFactory.close();
1225         } else {
1226           walFactory.shutdown();
1227         }
1228       } catch (Throwable e) {
1229         e = RemoteExceptionHandler.checkThrowable(e);
1230         LOG.error("Shutdown / close of WAL failed: " + e);
1231         LOG.debug("Shutdown / close exception details:", e);
1232       }
1233     }
1234   }
1235 
1236   /*
1237    * Run init. Sets up wal and starts up all server threads.
1238    *
1239    * @param c Extra configuration.
1240    */
1241   protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
1242   throws IOException {
1243     try {
1244       for (NameStringPair e : c.getMapEntriesList()) {
1245         String key = e.getName();
1246         // The hostname the master sees us as.
1247         if (key.equals(HConstants.KEY_FOR_HOSTNAME_SEEN_BY_MASTER)) {
1248           String hostnameFromMasterPOV = e.getValue();
1249           this.serverName = ServerName.valueOf(hostnameFromMasterPOV,
1250             rpcServices.isa.getPort(), this.startcode);
1251           if (!hostnameFromMasterPOV.equals(rpcServices.isa.getHostName())) {
1252             LOG.info("Master passed us a different hostname to use; was=" +
1253               rpcServices.isa.getHostName() + ", but now=" + hostnameFromMasterPOV);
1254           }
1255           continue;
1256         }
1257         String value = e.getValue();
1258         if (LOG.isDebugEnabled()) {
1259           LOG.info("Config from master: " + key + "=" + value);
1260         }
1261         this.conf.set(key, value);
1262       }
1263 
1264       // hack! Maps DFSClient => RegionServer for logs.  HDFS made this
1265       // config param for task trackers, but we can piggyback off of it.
1266       if (this.conf.get("mapreduce.task.attempt.id") == null) {
1267         this.conf.set("mapreduce.task.attempt.id", "hb_rs_" +
1268           this.serverName.toString());
1269       }
1270 
1271       // Save it in a file; this will allow us to see if we crash.
1272       ZNodeClearer.writeMyEphemeralNodeOnDisk(getMyEphemeralNodePath());
1273 
1274       this.cacheConfig = new CacheConfig(conf);
1275       this.walFactory = setupWALAndReplication();
1276       // Init in here rather than in constructor after thread name has been set
1277       this.metricsRegionServer = new MetricsRegionServer(new MetricsRegionServerWrapperImpl(this));
1278 
1279       startServiceThreads();
1280       startHeapMemoryManager();
1281       LOG.info("Serving as " + this.serverName +
1282         ", RpcServer on " + rpcServices.isa +
1283         ", sessionid=0x" +
1284         Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
1285 
1286       // Wake up anyone waiting for this server to come online.
1287       synchronized (online) {
1288         online.set(true);
1289         online.notifyAll();
1290       }
1291     } catch (Throwable e) {
1292       stop("Failed initialization");
1293       throw convertThrowableToIOE(cleanup(e, "Failed init"),
1294           "Region server startup failed");
1295     } finally {
1296       sleeper.skipSleepCycle();
1297     }
1298   }
1299 
1300   private void startHeapMemoryManager() {
1301     this.hMemManager = HeapMemoryManager.create(this.conf, this.cacheFlusher, this);
1302     if (this.hMemManager != null) {
1303       this.hMemManager.start();
1304     }
1305   }
1306 
1307   private void createMyEphemeralNode() throws KeeperException, IOException {
1308     RegionServerInfo.Builder rsInfo = RegionServerInfo.newBuilder();
1309     rsInfo.setInfoPort(infoServer != null ? infoServer.getPort() : -1);
1310     byte[] data = ProtobufUtil.prependPBMagic(rsInfo.build().toByteArray());
1311     ZKUtil.createEphemeralNodeAndWatch(this.zooKeeper,
1312       getMyEphemeralNodePath(), data);
1313   }
1314 
1315   private void deleteMyEphemeralNode() throws KeeperException {
1316     ZKUtil.deleteNode(this.zooKeeper, getMyEphemeralNodePath());
1317   }
1318 
1319   @Override
1320   public RegionServerAccounting getRegionServerAccounting() {
1321     return regionServerAccounting;
1322   }
1323 
1324   @Override
1325   public TableLockManager getTableLockManager() {
1326     return tableLockManager;
1327   }
1328 
1329   /*
1330    * @param r Region to get RegionLoad for.
1331    * @param regionLoadBldr the RegionLoad.Builder, can be null
1332    * @param regionSpecifier the RegionSpecifier.Builder, can be null
1333    * @return RegionLoad instance.
1334    *
1335    * @throws IOException
1336    */
1337   private RegionLoad createRegionLoad(final HRegion r, RegionLoad.Builder regionLoadBldr,
1338       RegionSpecifier.Builder regionSpecifier) {
1339     byte[] name = r.getRegionName();
1340     int stores = 0;
1341     int storefiles = 0;
1342     int storeUncompressedSizeMB = 0;
1343     int storefileSizeMB = 0;
1344     int memstoreSizeMB = (int) (r.memstoreSize.get() / 1024 / 1024);
1345     int storefileIndexSizeMB = 0;
1346     int rootIndexSizeKB = 0;
1347     int totalStaticIndexSizeKB = 0;
1348     int totalStaticBloomSizeKB = 0;
1349     long totalCompactingKVs = 0;
1350     long currentCompactedKVs = 0;
1351     synchronized (r.stores) {
1352       stores += r.stores.size();
1353       for (Store store : r.stores.values()) {
1354         storefiles += store.getStorefilesCount();
1355         storeUncompressedSizeMB += (int) (store.getStoreSizeUncompressed()
1356             / 1024 / 1024);
1357         storefileSizeMB += (int) (store.getStorefilesSize() / 1024 / 1024);
1358         storefileIndexSizeMB += (int) (store.getStorefilesIndexSize() / 1024 / 1024);
1359         CompactionProgress progress = store.getCompactionProgress();
1360         if (progress != null) {
1361           totalCompactingKVs += progress.totalCompactingKVs;
1362           currentCompactedKVs += progress.currentCompactedKVs;
1363         }
1364 
1365         rootIndexSizeKB +=
1366             (int) (store.getStorefilesIndexSize() / 1024);
1367 
1368         totalStaticIndexSizeKB +=
1369           (int) (store.getTotalStaticIndexSize() / 1024);
1370 
1371         totalStaticBloomSizeKB +=
1372           (int) (store.getTotalStaticBloomSize() / 1024);
1373       }
1374     }
1375     float dataLocality =
1376         r.getHDFSBlocksDistribution().getBlockLocalityIndex(serverName.getHostname());
1377     if (regionLoadBldr == null) {
1378       regionLoadBldr = RegionLoad.newBuilder();
1379     }
1380     if (regionSpecifier == null) {
1381       regionSpecifier = RegionSpecifier.newBuilder();
1382     }
1383     regionSpecifier.setType(RegionSpecifierType.REGION_NAME);
1384     regionSpecifier.setValue(ByteStringer.wrap(name));
1385     regionLoadBldr.setRegionSpecifier(regionSpecifier.build())
1386       .setStores(stores)
1387       .setStorefiles(storefiles)
1388       .setStoreUncompressedSizeMB(storeUncompressedSizeMB)
1389       .setStorefileSizeMB(storefileSizeMB)
1390       .setMemstoreSizeMB(memstoreSizeMB)
1391       .setStorefileIndexSizeMB(storefileIndexSizeMB)
1392       .setRootIndexSizeKB(rootIndexSizeKB)
1393       .setTotalStaticIndexSizeKB(totalStaticIndexSizeKB)
1394       .setTotalStaticBloomSizeKB(totalStaticBloomSizeKB)
1395       .setReadRequestsCount(r.readRequestsCount.get())
1396       .setWriteRequestsCount(r.writeRequestsCount.get())
1397       .setTotalCompactingKVs(totalCompactingKVs)
1398       .setCurrentCompactedKVs(currentCompactedKVs)
1399       .setCompleteSequenceId(r.lastFlushSeqId)
1400       .setDataLocality(dataLocality);
1401 
1402     return regionLoadBldr.build();
1403   }
1404 
1405   /**
1406    * @param encodedRegionName Encoded name of the online region to report on.
1407    * @return An instance of RegionLoad, or null if the region is not online.
1408    */
1409   public RegionLoad createRegionLoad(final String encodedRegionName) {
1410     HRegion r = null;
1411     r = this.onlineRegions.get(encodedRegionName);
1412     return r != null ? createRegionLoad(r, null, null) : null;
1413   }
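  /*
   * Editorial note (hedged usage sketch): a caller running in the same JVM could combine the two
   * methods above to build a RegionLoad for every online region, e.g.:
   *
   *   for (HRegion region : getOnlineRegionsLocalContext()) {
   *     RegionLoad load = createRegionLoad(region.getRegionInfo().getEncodedName());
   *     // load is null if the region went offline between the snapshot and the lookup.
   *   }
   */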
1414 
1415   /*
1416    * Inner class that periodically checks whether regions need compaction.
1417    */
1418   private static class CompactionChecker extends Chore {
1419     private final HRegionServer instance;
1420     private final int majorCompactPriority;
1421     private final static int DEFAULT_PRIORITY = Integer.MAX_VALUE;
1422     private long iteration = 0;
1423 
1424     CompactionChecker(final HRegionServer h, final int sleepTime,
1425         final Stoppable stopper) {
1426       super("CompactionChecker", sleepTime, h);
1427       this.instance = h;
1428       LOG.info(this.getName() + " runs every " + StringUtils.formatTime(sleepTime));
1429 
1430       /* MajorCompactPriority is configurable.
1431        * If not set, the compaction will use default priority.
1432        */
1433       this.majorCompactPriority = this.instance.conf.
1434         getInt("hbase.regionserver.compactionChecker.majorCompactPriority",
1435         DEFAULT_PRIORITY);
1436     }
1437 
1438     @Override
1439     protected void chore() {
1440       for (HRegion r : this.instance.onlineRegions.values()) {
1441         if (r == null)
1442           continue;
1443         for (Store s : r.getStores().values()) {
1444           try {
1445             long multiplier = s.getCompactionCheckMultiplier();
1446             assert multiplier > 0;
1447             if (iteration % multiplier != 0) continue;
1448             if (s.needsCompaction()) {
1449               // Queue a compaction. Will recognize if major is needed.
1450               this.instance.compactSplitThread.requestSystemCompaction(r, s, getName()
1451                   + " requests compaction");
1452             } else if (s.isMajorCompaction()) {
1453               if (majorCompactPriority == DEFAULT_PRIORITY
1454                   || majorCompactPriority > r.getCompactPriority()) {
1455                 this.instance.compactSplitThread.requestCompaction(r, s, getName()
1456                     + " requests major compaction; use default priority", null);
1457               } else {
1458                 this.instance.compactSplitThread.requestCompaction(r, s, getName()
1459                     + " requests major compaction; use configured priority",
1460                   this.majorCompactPriority, null);
1461               }
1462             }
1463           } catch (IOException e) {
1464             LOG.warn("Failed major compaction check on " + r, e);
1465           }
1466         }
1467       }
1468       iteration = (iteration == Long.MAX_VALUE) ? 0 : (iteration + 1);
1469     }
1470   }
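  /*
   * Editorial note (worked example): the "iteration % multiplier" guard above means a store is only
   * examined every <multiplier> runs of the chore. Assuming, for illustration, a 10 second chore
   * period and a store whose getCompactionCheckMultiplier() returns 12, that store would be checked
   * for compaction roughly every 12 * 10 s = 2 minutes.
   */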
1471 
1472   static class PeriodicMemstoreFlusher extends Chore {
1473     final HRegionServer server;
1474     final static int RANGE_OF_DELAY = 20000; //millisec
1475     final static int MIN_DELAY_TIME = 3000; //millisec
1476     public PeriodicMemstoreFlusher(int cacheFlushInterval, final HRegionServer server) {
1477       super(server.getServerName() + "-MemstoreFlusherChore", cacheFlushInterval, server);
1478       this.server = server;
1479     }
1480 
1481     @Override
1482     protected void chore() {
1483       for (HRegion r : this.server.onlineRegions.values()) {
1484         if (r == null)
1485           continue;
1486         if (r.shouldFlush()) {
1487           FlushRequester requester = server.getFlushRequester();
1488           if (requester != null) {
1489             long randomDelay = RandomUtils.nextInt(RANGE_OF_DELAY) + MIN_DELAY_TIME;
1490             LOG.info(getName() + " requesting flush for region " + r.getRegionNameAsString() +
1491                 " after a delay of " + randomDelay);
1492             // Throttle the flushes by putting a delay. If we don't throttle, and there
1493             // is a balanced write-load on the regions in a table, we might end up
1494             // overwhelming the filesystem with too many flushes at once.
1495             requester.requestDelayedFlush(r, randomDelay);
1496           }
1497         }
1498       }
1499     }
1500   }
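  /*
   * Editorial note (worked example): RandomUtils.nextInt(RANGE_OF_DELAY) returns a value in
   * [0, 20000), so the delay handed to requestDelayedFlush() above always falls in
   * [MIN_DELAY_TIME, MIN_DELAY_TIME + RANGE_OF_DELAY) = [3000 ms, 23000 ms). This spreads the
   * periodic flushes of evenly loaded regions over roughly a 20-second window instead of issuing
   * them all at once.
   */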
1501 
1502   /**
1503    * Report the status of the server. A server is online once all of its startup has
1504    * completed (setting up the filesystem, starting service threads, etc.). This
1505    * method is designed mostly to be useful in tests.
1506    *
1507    * @return true if online, false if not.
1508    */
1509   public boolean isOnline() {
1510     return online.get();
1511   }
1512 
1513   /**
1514    * Set up the WAL and replication if enabled.
1515    * Replication setup is done in here because it wants to be hooked up to the WAL.
1516    * @return A WAL instance.
1517    * @throws IOException
1518    */
1519   private WALFactory setupWALAndReplication() throws IOException {
1520     // TODO Replication make assumptions here based on the default filesystem impl
1521     final Path oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
1522     final String logName = DefaultWALProvider.getWALDirectoryName(this.serverName.toString());
1523 
1524     Path logdir = new Path(rootDir, logName);
1525     if (LOG.isDebugEnabled()) LOG.debug("logdir=" + logdir);
1526     if (this.fs.exists(logdir)) {
1527       throw new RegionServerRunningException("Region server has already " +
1528         "created directory at " + this.serverName.toString());
1529     }
1530 
1531     // Instantiate replication manager if replication enabled.  Pass it the
1532     // log directories.
1533     createNewReplicationInstance(conf, this, this.fs, logdir, oldLogDir);
1534 
1535     // listeners the wal factory will add to wals it creates.
1536     final List<WALActionsListener> listeners = new ArrayList<WALActionsListener>();
1537     listeners.add(new MetricsWAL());
1538     if (this.replicationSourceHandler != null &&
1539         this.replicationSourceHandler.getWALActionsListener() != null) {
1540       // Replication handler is an implementation of WALActionsListener.
1541       listeners.add(this.replicationSourceHandler.getWALActionsListener());
1542     }
1543 
1544     return new WALFactory(conf, listeners, serverName.toString());
1545   }
1546 
1547   /**
1548    * We initialize the roller for the WAL that handles meta lazily
1549    * since we don't know if this regionserver will handle it. All calls to
1550    * this method return a reference to that same roller. As newly referenced
1551    * meta regions are brought online, they will be offered to the roller for maintenance.
1552    * As a part of that registration process, the roller will add itself as a
1553    * listener on the wal.
1554    */
1555   protected LogRoller ensureMetaWALRoller() {
1556     // Use a temporary roller so that once metawalRoller is non-null, it is
1557     // guaranteed to point at a live roller.
1558     LogRoller roller = metawalRoller.get();
1559     if (null == roller) {
1560       LogRoller tmpLogRoller = new LogRoller(this, this);
1561       String n = Thread.currentThread().getName();
1562       Threads.setDaemonThreadRunning(tmpLogRoller.getThread(),
1563           n + "-MetaLogRoller", uncaughtExceptionHandler);
1564       if (metawalRoller.compareAndSet(null, tmpLogRoller)) {
1565         roller = tmpLogRoller;
1566       } else {
1567         // Another thread won starting the roller
1568         Threads.shutdown(tmpLogRoller.getThread());
1569         roller = metawalRoller.get();
1570       }
1571     }
1572     return roller;
1573   }
1574 
1575   public MetricsRegionServer getRegionServerMetrics() {
1576     return this.metricsRegionServer;
1577   }
1578 
1579   /**
1580    * @return Master address tracker instance.
1581    */
1582   public MasterAddressTracker getMasterAddressTracker() {
1583     return this.masterAddressTracker;
1584   }
1585 
1586   /*
1587    * Start maintenance Threads, Server, Worker and lease checker threads.
1588    * Install an UncaughtExceptionHandler that calls abort of RegionServer if we
1589    * get an unhandled exception. We cannot set the handler on all threads.
1590    * Server's internal Listener thread is off limits. For Server, if an OOME, it
1591    * waits a while then retries. Meantime, a flush or a compaction that tries to
1592    * run should trigger same critical condition and the shutdown will run. On
1593    * its way out, this server will shut down Server. Leases are sort of
1594    * in between. Leases runs an internal thread; although it inherits from Chore, it
1595    * keeps its own internal stop mechanism and so needs to be stopped by this
1596    * hosting server. Worker logs the exception and exits.
1597    */
1598   private void startServiceThreads() throws IOException {
1599     // Start executor services
1600     this.service.startExecutorService(ExecutorType.RS_OPEN_REGION,
1601       conf.getInt("hbase.regionserver.executor.openregion.threads", 3));
1602     this.service.startExecutorService(ExecutorType.RS_OPEN_META,
1603       conf.getInt("hbase.regionserver.executor.openmeta.threads", 1));
1604     this.service.startExecutorService(ExecutorType.RS_CLOSE_REGION,
1605       conf.getInt("hbase.regionserver.executor.closeregion.threads", 3));
1606     this.service.startExecutorService(ExecutorType.RS_CLOSE_META,
1607       conf.getInt("hbase.regionserver.executor.closemeta.threads", 1));
1608     if (conf.getBoolean(StoreScanner.STORESCANNER_PARALLEL_SEEK_ENABLE, false)) {
1609       this.service.startExecutorService(ExecutorType.RS_PARALLEL_SEEK,
1610         conf.getInt("hbase.storescanner.parallel.seek.threads", 10));
1611     }
1612     this.service.startExecutorService(ExecutorType.RS_LOG_REPLAY_OPS, conf.getInt(
1613        "hbase.regionserver.wal.max.splitters", SplitLogWorkerCoordination.DEFAULT_MAX_SPLITTERS));
1614 
1615     Threads.setDaemonThreadRunning(this.walRoller.getThread(), getName() + ".logRoller",
1616         uncaughtExceptionHandler);
1617     this.cacheFlusher.start(uncaughtExceptionHandler);
1618     Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), getName() +
1619       ".compactionChecker", uncaughtExceptionHandler);
1620     Threads.setDaemonThreadRunning(this.periodicFlusher.getThread(), getName() +
1621         ".periodicFlusher", uncaughtExceptionHandler);
1622     if (this.healthCheckChore != null) {
1623       Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), getName() + ".healthChecker",
1624             uncaughtExceptionHandler);
1625     }
1626     if (this.nonceManagerChore != null) {
1627       Threads.setDaemonThreadRunning(this.nonceManagerChore.getThread(), getName() + ".nonceCleaner",
1628             uncaughtExceptionHandler);
1629     }
1630     if (this.storefileRefresher != null) {
1631       Threads.setDaemonThreadRunning(this.storefileRefresher.getThread(), getName() + ".storefileRefresher",
1632             uncaughtExceptionHandler);
1633     }
1634 
1635     // Leases is not a Thread. Internally it runs a daemon thread. If it gets
1636     // an unhandled exception, it will just exit.
1637     this.leases.setName(getName() + ".leaseChecker");
1638     this.leases.start();
1639 
1640     if (this.replicationSourceHandler == this.replicationSinkHandler &&
1641         this.replicationSourceHandler != null) {
1642       this.replicationSourceHandler.startReplicationService();
1643     } else {
1644       if (this.replicationSourceHandler != null) {
1645         this.replicationSourceHandler.startReplicationService();
1646       }
1647       if (this.replicationSinkHandler != null) {
1648         this.replicationSinkHandler.startReplicationService();
1649       }
1650     }
1651 
1652     // Create the log splitting worker and start it
1653     // Set a smaller retry count so we fail fast; otherwise the SplitLogWorker could be blocked
1654     // for quite a while inside the HConnection layer. The worker would not be available for
1655     // other tasks even after the current task is preempted when a split task times out.
1656     Configuration sinkConf = HBaseConfiguration.create(conf);
1657     sinkConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
1658       conf.getInt("hbase.log.replay.retries.number", 8)); // 8 retries take about 23 seconds
1659     sinkConf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY,
1660       conf.getInt("hbase.log.replay.rpc.timeout", 30000)); // default 30 seconds
1661     sinkConf.setInt("hbase.client.serverside.retries.multiplier", 1);
1662     this.splitLogWorker = new SplitLogWorker(this, sinkConf, this, this, walFactory);
1663     splitLogWorker.start();
1664   }
1665 
1666   /**
1667    * Puts up the webui.
1668    * @return The final port -- may be different from what we started with.
1669    * @throws IOException
1670    */
1671   private int putUpWebUI() throws IOException {
1672     int port = this.conf.getInt(HConstants.REGIONSERVER_INFO_PORT,
1673       HConstants.DEFAULT_REGIONSERVER_INFOPORT);
1674     String addr = this.conf.get("hbase.regionserver.info.bindAddress", "0.0.0.0");
1675 
1676     if (this instanceof HMaster) {
1677       port = conf.getInt(HConstants.MASTER_INFO_PORT,
1678           HConstants.DEFAULT_MASTER_INFOPORT);
1679       addr = this.conf.get("hbase.master.info.bindAddress", "0.0.0.0");
1680     }
1681     // -1 is for disabling info server
1682     if (port < 0) return port;
1683 
1684     if (!Addressing.isLocalAddress(InetAddress.getByName(addr))) {
1685       String msg =
1686           "Failed to start http info server. Address " + addr
1687               + " does not belong to this host. Correct configuration parameter: "
1688               + "hbase.regionserver.info.bindAddress";
1689       LOG.error(msg);
1690       throw new IOException(msg);
1691     }
1692     // check if auto port bind enabled
1693     boolean auto = this.conf.getBoolean(HConstants.REGIONSERVER_INFO_PORT_AUTO,
1694         false);
1695     while (true) {
1696       try {
1697         this.infoServer = new InfoServer(getProcessName(), addr, port, false, this.conf);
1698         infoServer.addServlet("dump", "/dump", getDumpServlet());
1699         configureInfoServer();
1700         this.infoServer.start();
1701         break;
1702       } catch (BindException e) {
1703         if (!auto) {
1704           // auto bind disabled throw BindException
1705           LOG.error("Failed binding http info server to port: " + port);
1706           throw e;
1707         }
1708         // auto bind enabled, try to use another port
1709         LOG.info("Failed binding http info server to port: " + port);
1710         port++;
1711       }
1712     }
1713     port = this.infoServer.getPort();
1714     conf.setInt(HConstants.REGIONSERVER_INFO_PORT, port);
1715     int masterInfoPort = conf.getInt(HConstants.MASTER_INFO_PORT,
1716       HConstants.DEFAULT_MASTER_INFOPORT);
1717     conf.setInt("hbase.master.info.port.orig", masterInfoPort);
1718     conf.setInt(HConstants.MASTER_INFO_PORT, port);
1719     return port;
1720   }
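  /*
   * Editorial note (hedged configuration sketch): the behavior of putUpWebUI() above is driven by
   * the HConstants keys it reads. For example, a deployment might disable the info server entirely,
   * or keep the default port but allow fallback to the next free port on a BindException:
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   conf.setInt(HConstants.REGIONSERVER_INFO_PORT, -1);             // -1 disables the info server
   *   // or:
   *   conf.setBoolean(HConstants.REGIONSERVER_INFO_PORT_AUTO, true);  // retry higher ports on bind failure
   */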
1721 
1722   /*
1723    * Verify that server is healthy
1724    */
1725   private boolean isHealthy() {
1726     if (!fsOk) {
1727       // File system problem
1728       return false;
1729     }
1730     // Verify that all threads are alive
1731     if (!(leases.isAlive()
1732         && cacheFlusher.isAlive() && walRoller.isAlive()
1733         && this.compactionChecker.isAlive()
1734         && this.periodicFlusher.isAlive())) {
1735       stop("One or more threads are no longer alive -- stop");
1736       return false;
1737     }
1738     final LogRoller metawalRoller = this.metawalRoller.get();
1739     if (metawalRoller != null && !metawalRoller.isAlive()) {
1740       stop("Meta WAL roller thread is no longer alive -- stop");
1741       return false;
1742     }
1743     return true;
1744   }
1745 
1746   private static final byte[] UNSPECIFIED_REGION = new byte[]{};
1747 
1748   @Override
1749   public WAL getWAL(HRegionInfo regionInfo) throws IOException {
1750     WAL wal;
1751     LogRoller roller = walRoller;
1752     // hbase:meta regions (and formerly _ROOT_) have a separate WAL.
1753     if (regionInfo != null && regionInfo.isMetaTable()) {
1754       roller = ensureMetaWALRoller();
1755       wal = walFactory.getMetaWAL(regionInfo.getEncodedNameAsBytes());
1756     } else if (regionInfo == null) {
1757       wal = walFactory.getWAL(UNSPECIFIED_REGION);
1758     } else {
1759       wal = walFactory.getWAL(regionInfo.getEncodedNameAsBytes());
1760     }
1761     roller.addWAL(wal);
1762     return wal;
1763   }
1764 
1765   @Override
1766   public ClusterConnection getConnection() {
1767     return this.clusterConnection;
1768   }
1769 
1770   @Override
1771   public MetaTableLocator getMetaTableLocator() {
1772     return this.metaTableLocator;
1773   }
1774 
1775   @Override
1776   public void stop(final String msg) {
1777     if (!this.stopped) {
1778       try {
1779         if (this.rsHost != null) {
1780           this.rsHost.preStop(msg);
1781         }
1782         this.stopped = true;
1783         LOG.info("STOPPED: " + msg);
1784         // Wakes run() if it is sleeping
1785         sleeper.skipSleepCycle();
1786       } catch (IOException exp) {
1787         LOG.warn("The region server did not stop", exp);
1788       }
1789     }
1790   }
1791 
1792   public void waitForServerOnline(){
1793     while (!isStopped() && !isOnline()) {
1794       synchronized (online) {
1795         try {
1796           online.wait(msgInterval);
1797         } catch (InterruptedException ie) {
1798           Thread.currentThread().interrupt();
1799           break;
1800         }
1801       }
1802     }
1803   }
1804 
1805   @Override
1806   public void postOpenDeployTasks(final HRegion r)
1807   throws KeeperException, IOException {
1808     rpcServices.checkOpen();
1809     LOG.info("Post open deploy tasks for " + r.getRegionNameAsString());
1810     // Do checks to see if we need to compact (references or too many files)
1811     for (Store s : r.getStores().values()) {
1812       if (s.hasReferences() || s.needsCompaction()) {
1813        this.compactSplitThread.requestSystemCompaction(r, s, "Opening Region");
1814       }
1815     }
1816     long openSeqNum = r.getOpenSeqNum();
1817     if (openSeqNum == HConstants.NO_SEQNUM) {
1818       // If we opened a region, we should have read some sequence number from it.
1819       LOG.error("No sequence number found when opening " + r.getRegionNameAsString());
1820       openSeqNum = 0;
1821     }
1822 
1823     // Update flushed sequence id of a recovering region in ZK
1824     updateRecoveringRegionLastFlushedSequenceId(r);
1825 
1826     // Update ZK, or META
1827     if (r.getRegionInfo().isMetaRegion()) {
1828       MetaTableLocator.setMetaLocation(getZooKeeper(), serverName, State.OPEN);
1829     } else if (useZKForAssignment) {
1830       MetaTableAccessor.updateRegionLocation(getConnection(), r.getRegionInfo(),
1831         this.serverName, openSeqNum);
1832     }
1833     if (!useZKForAssignment && !reportRegionStateTransition(
1834         TransitionCode.OPENED, openSeqNum, r.getRegionInfo())) {
1835       throw new IOException("Failed to report opened region to master: "
1836         + r.getRegionNameAsString());
1837     }
1838 
1839     LOG.debug("Finished post open deploy task for " + r.getRegionNameAsString());
1840   }
1841 
1842   @Override
1843   public boolean reportRegionStateTransition(TransitionCode code, HRegionInfo... hris) {
1844     return reportRegionStateTransition(code, HConstants.NO_SEQNUM, hris);
1845   }
1846 
1847   @Override
1848   public boolean reportRegionStateTransition(
1849       TransitionCode code, long openSeqNum, HRegionInfo... hris) {
1850     ReportRegionStateTransitionRequest.Builder builder =
1851       ReportRegionStateTransitionRequest.newBuilder();
1852     builder.setServer(ProtobufUtil.toServerName(serverName));
1853     RegionStateTransition.Builder transition = builder.addTransitionBuilder();
1854     transition.setTransitionCode(code);
1855     if (code == TransitionCode.OPENED && openSeqNum >= 0) {
1856       transition.setOpenSeqNum(openSeqNum);
1857     }
1858     for (HRegionInfo hri: hris) {
1859       transition.addRegionInfo(HRegionInfo.convert(hri));
1860     }
1861     ReportRegionStateTransitionRequest request = builder.build();
1862     while (keepLooping()) {
1863       RegionServerStatusService.BlockingInterface rss = rssStub;
1864       try {
1865         if (rss == null) {
1866           createRegionServerStatusStub();
1867           continue;
1868         }
1869         ReportRegionStateTransitionResponse response =
1870           rss.reportRegionStateTransition(null, request);
1871         if (response.hasErrorMessage()) {
1872           LOG.info("Failed to transition " + hris[0]
1873             + " to " + code + ": " + response.getErrorMessage());
1874           return false;
1875         }
1876         return true;
1877       } catch (ServiceException se) {
1878         IOException ioe = ProtobufUtil.getRemoteException(se);
1879         LOG.info("Failed to report region transition, will retry", ioe);
1880         if (rssStub == rss) {
1881           rssStub = null;
1882         }
1883       }
1884     }
1885     return false;
1886   }
1887 
1888   @Override
1889   public RpcServerInterface getRpcServer() {
1890     return rpcServices.rpcServer;
1891   }
1892 
1893   @VisibleForTesting
1894   public RSRpcServices getRSRpcServices() {
1895     return rpcServices;
1896   }
1897 
1898   /**
1899    * Cause the server to exit without closing the regions it is serving or the log
1900    * it is using, and without notifying the master. Used in unit testing and on
1901    * catastrophic events such as HDFS being yanked out from under hbase or an OOME.
1902    *
1903    * @param reason
1904    *          the reason we are aborting
1905    * @param cause
1906    *          the exception that caused the abort, or null
1907    */
1908   @Override
1909   public void abort(String reason, Throwable cause) {
1910     String msg = "ABORTING region server " + this + ": " + reason;
1911     if (cause != null) {
1912       LOG.fatal(msg, cause);
1913     } else {
1914       LOG.fatal(msg);
1915     }
1916     this.abortRequested = true;
1917     // HBASE-4014: show list of coprocessors that were loaded to help debug
1918     // regionserver crashes. Note that we're implicitly using
1919     // java.util.HashSet's toString() method to print the coprocessor names.
1920     LOG.fatal("RegionServer abort: loaded coprocessors are: " +
1921         CoprocessorHost.getLoadedCoprocessors());
1922     // Try to dump metrics on abort -- might give a clue as to how the fatal error came about....
1923     try {
1924       LOG.info("Dump of metrics as JSON on abort: " + JSONBean.dumpRegionServerMetrics());
1925     } catch (MalformedObjectNameException | IOException e) {
1926       LOG.warn("Failed dumping metrics", e);
1927     }
1928 
1929     // Do our best to report our abort to the master, but this may not work
1930     try {
1931       if (cause != null) {
1932         msg += "\nCause:\n" + StringUtils.stringifyException(cause);
1933       }
1934       // Report to the master but only if we have already registered with the master.
1935       if (rssStub != null && this.serverName != null) {
1936         ReportRSFatalErrorRequest.Builder builder =
1937           ReportRSFatalErrorRequest.newBuilder();
1938         ServerName sn =
1939           ServerName.parseVersionedServerName(this.serverName.getVersionedBytes());
1940         builder.setServer(ProtobufUtil.toServerName(sn));
1941         builder.setErrorMessage(msg);
1942         rssStub.reportRSFatalError(null, builder.build());
1943       }
1944     } catch (Throwable t) {
1945       LOG.warn("Unable to report fatal error to master", t);
1946     }
1947     stop(reason);
1948   }
1949 
1950   /**
1951    * @see HRegionServer#abort(String, Throwable)
1952    */
1953   public void abort(String reason) {
1954     abort(reason, null);
1955   }
1956 
1957   @Override
1958   public boolean isAborted() {
1959     return this.abortRequested;
1960   }
1961 
1962   /*
1963    * Simulate a kill -9 of this server. Exits w/o closing regions or cleaning up
1964    * logs, but it does close the socket in case we want to bring up a server on the old
1965    * hostname+port immediately.
1966    */
1967   protected void kill() {
1968     this.killed = true;
1969     abort("Simulated kill");
1970   }
1971 
1972   /**
1973    * Wait on all threads to finish. Presumption is that all closes and stops
1974    * have already been called.
1975    */
1976   protected void stopServiceThreads() {
1977     if (this.nonceManagerChore != null) {
1978       Threads.shutdown(this.nonceManagerChore.getThread());
1979     }
1980     if (this.compactionChecker != null) {
1981       Threads.shutdown(this.compactionChecker.getThread());
1982     }
1983     if (this.periodicFlusher != null) {
1984       Threads.shutdown(this.periodicFlusher.getThread());
1985     }
1986     if (this.cacheFlusher != null) {
1987       this.cacheFlusher.join();
1988     }
1989     if (this.healthCheckChore != null) {
1990       Threads.shutdown(this.healthCheckChore.getThread());
1991     }
1992     if (this.spanReceiverHost != null) {
1993       this.spanReceiverHost.closeReceivers();
1994     }
1995     if (this.walRoller != null) {
1996       Threads.shutdown(this.walRoller.getThread());
1997     }
1998     final LogRoller metawalRoller = this.metawalRoller.get();
1999     if (metawalRoller != null) {
2000       Threads.shutdown(metawalRoller.getThread());
2001     }
2002     if (this.compactSplitThread != null) {
2003       this.compactSplitThread.join();
2004     }
2005     if (this.service != null) this.service.shutdown();
2006     if (this.replicationSourceHandler != null &&
2007         this.replicationSourceHandler == this.replicationSinkHandler) {
2008       this.replicationSourceHandler.stopReplicationService();
2009     } else {
2010       if (this.replicationSourceHandler != null) {
2011         this.replicationSourceHandler.stopReplicationService();
2012       }
2013       if (this.replicationSinkHandler != null) {
2014         this.replicationSinkHandler.stopReplicationService();
2015       }
2016     }
2017     if (this.storefileRefresher != null) {
2018       Threads.shutdown(this.storefileRefresher.getThread());
2019     }
2020   }
2021 
2022   /**
2023    * @return Return the object that implements the replication
2024    * source service.
2025    */
2026   ReplicationSourceService getReplicationSourceService() {
2027     return replicationSourceHandler;
2028   }
2029 
2030   /**
2031    * @return Return the object that implements the replication
2032    * sink service.
2033    */
2034   ReplicationSinkService getReplicationSinkService() {
2035     return replicationSinkHandler;
2036   }
2037 
2038   /**
2039    * Get the current master from ZooKeeper and open the RPC connection to it.
2040    * To get a fresh connection, the current rssStub must be null.
2041    * Method will block until a master is available. You can break from this
2042    * block by requesting the server stop.
2043    *
2044    * @return master + port, or null if server has been stopped
2045    */
2046   @VisibleForTesting
2047   protected synchronized ServerName createRegionServerStatusStub() {
2048     if (rssStub != null) {
2049       return masterAddressTracker.getMasterAddress();
2050     }
2051     ServerName sn = null;
2052     long previousLogTime = 0;
2053     boolean refresh = false; // for the first time, use cached data
2054     RegionServerStatusService.BlockingInterface intf = null;
2055     boolean interrupted = false;
2056     try {
2057       while (keepLooping()) {
2058         sn = this.masterAddressTracker.getMasterAddress(refresh);
2059         if (sn == null) {
2060           if (!keepLooping()) {
2061             // give up with no connection.
2062             LOG.debug("No master found and cluster is stopped; bailing out");
2063             return null;
2064           }
2065           if (System.currentTimeMillis() > (previousLogTime + 1000)) {
2066             LOG.debug("No master found; retry");
2067             previousLogTime = System.currentTimeMillis();
2068           }
2069           refresh = true; // let's try to pull it from ZK directly
2070           if (sleep(200)) {
2071             interrupted = true;
2072           }
2073           continue;
2074         }
2075 
2076         // If we are on the active master, use the shortcut
2077         if (this instanceof HMaster && sn.equals(getServerName())) {
2078           intf = ((HMaster)this).getMasterRpcServices();
2079           break;
2080         }
2081         try {
2082           BlockingRpcChannel channel =
2083             this.rpcClient.createBlockingRpcChannel(sn, userProvider.getCurrent(), operationTimeout);
2084           intf = RegionServerStatusService.newBlockingStub(channel);
2085           break;
2086         } catch (IOException e) {
2087           if (System.currentTimeMillis() > (previousLogTime + 1000)) {
2088             e = e instanceof RemoteException ?
2089               ((RemoteException)e).unwrapRemoteException() : e;
2090             if (e instanceof ServerNotRunningYetException) {
2091               LOG.info("Master isn't available yet, retrying");
2092             } else {
2093               LOG.warn("Unable to connect to master. Retrying. Error was:", e);
2094             }
2095             previousLogTime = System.currentTimeMillis();
2096           }
2097           if (sleep(200)) {
2098             interrupted = true;
2099           }
2100         }
2101       }
2102     } finally {
2103       if (interrupted) {
2104         Thread.currentThread().interrupt();
2105       }
2106     }
2107     rssStub = intf;
2108     return sn;
2109   }
2110 
2111   /**
2112    * @return True if we should break loop because cluster is going down or
2113    * this server has been stopped or hdfs has gone bad.
2114    */
2115   private boolean keepLooping() {
2116     return !this.stopped && isClusterUp();
2117   }
2118 
2119   /*
2120    * Let the master know we're here. Run initialization using parameters passed
2121    * to us by the master.
2122    * @return A Map of key/value configurations we got from the Master, else
2123    * null if we failed to register.
2124    * @throws IOException
2125    */
2126   private RegionServerStartupResponse reportForDuty() throws IOException {
2127     ServerName masterServerName = createRegionServerStatusStub();
2128     if (masterServerName == null) return null;
2129     RegionServerStartupResponse result = null;
2130     try {
2131       rpcServices.requestCount.set(0);
2132       LOG.info("reportForDuty to master=" + masterServerName + " with port="
2133         + rpcServices.isa.getPort() + ", startcode=" + this.startcode);
2134       long now = EnvironmentEdgeManager.currentTime();
2135       int port = rpcServices.isa.getPort();
2136       RegionServerStartupRequest.Builder request = RegionServerStartupRequest.newBuilder();
2137       request.setPort(port);
2138       request.setServerStartCode(this.startcode);
2139       request.setServerCurrentTime(now);
2140       result = this.rssStub.regionServerStartup(null, request.build());
2141     } catch (ServiceException se) {
2142       IOException ioe = ProtobufUtil.getRemoteException(se);
2143       if (ioe instanceof ClockOutOfSyncException) {
2144         LOG.fatal("Master rejected startup because clock is out of sync", ioe);
2145         // Re-throw IOE will cause RS to abort
2146         throw ioe;
2147       } else if (ioe instanceof ServerNotRunningYetException) {
2148         LOG.debug("Master is not running yet");
2149       } else {
2150         LOG.warn("error telling master we are up", se);
2151         rssStub = null;
2152       }
2153     }
2154     return result;
2155   }
2156 
2157   @Override
2158   public long getLastSequenceId(byte[] encodedRegionName) {
2159     long lastFlushedSequenceId = -1L;
2160     try {
2161       GetLastFlushedSequenceIdRequest req = RequestConverter
2162           .buildGetLastFlushedSequenceIdRequest(encodedRegionName);
2163       RegionServerStatusService.BlockingInterface rss = rssStub;
2164       if (rss == null) { // Try to connect one more time
2165         createRegionServerStatusStub();
2166         rss = rssStub;
2167         if (rss == null) {
2168           // Still no luck, we tried
2169           LOG.warn("Unable to connect to the master to check "
2170             + "the last flushed sequence id");
2171           return -1L;
2172         }
2173       }
2174       lastFlushedSequenceId = rss.getLastFlushedSequenceId(null, req)
2175           .getLastFlushedSequenceId();
2176     } catch (ServiceException e) {
2177       lastFlushedSequenceId = -1L;
2178       LOG.warn("Unable to connect to the master to check "
2179         + "the last flushed sequence id", e);
2180     }
2181     return lastFlushedSequenceId;
2182   }
2183 
2184   /**
2185    * Closes all regions.  Called on our way out.
2186    * Assumes that it's not possible for new regions to be added to onlineRegions
2187    * while this method runs.
2188    */
2189   protected void closeAllRegions(final boolean abort) {
2190     closeUserRegions(abort);
2191     closeMetaTableRegions(abort);
2192   }
2193 
2194   /**
2195    * Close meta region if we carry it
2196    * @param abort Whether we're running an abort.
2197    */
2198   void closeMetaTableRegions(final boolean abort) {
2199     HRegion meta = null;
2200     this.lock.writeLock().lock();
2201     try {
2202       for (Map.Entry<String, HRegion> e: onlineRegions.entrySet()) {
2203         HRegionInfo hri = e.getValue().getRegionInfo();
2204         if (hri.isMetaRegion()) {
2205           meta = e.getValue();
2206         }
2207         if (meta != null) break;
2208       }
2209     } finally {
2210       this.lock.writeLock().unlock();
2211     }
2212     if (meta != null) closeRegionIgnoreErrors(meta.getRegionInfo(), abort);
2213   }
2214 
2215   /**
2216    * Schedule closes on all user regions.
2217    * Should be safe calling multiple times because it won't close regions
2218    * that are already closed or that are closing.
2219    * @param abort Whether we're running an abort.
2220    */
2221   void closeUserRegions(final boolean abort) {
2222     this.lock.writeLock().lock();
2223     try {
2224       for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
2225         HRegion r = e.getValue();
2226         if (!r.getRegionInfo().isMetaTable() && r.isAvailable()) {
2227           // Don't update zk with this close transition; pass false.
2228           closeRegionIgnoreErrors(r.getRegionInfo(), abort);
2229         }
2230       }
2231     } finally {
2232       this.lock.writeLock().unlock();
2233     }
2234   }
2235 
2236   /** @return the info server */
2237   public InfoServer getInfoServer() {
2238     return infoServer;
2239   }
2240 
2241   /**
2242    * @return true if a stop has been requested.
2243    */
2244   @Override
2245   public boolean isStopped() {
2246     return this.stopped;
2247   }
2248 
2249   @Override
2250   public boolean isStopping() {
2251     return this.stopping;
2252   }
2253 
2254   @Override
2255   public Map<String, HRegion> getRecoveringRegions() {
2256     return this.recoveringRegions;
2257   }
2258 
2259   /**
2260    *
2261    * @return the configuration
2262    */
2263   @Override
2264   public Configuration getConfiguration() {
2265     return conf;
2266   }
2267 
2268   /** @return the write lock for the server */
2269   ReentrantReadWriteLock.WriteLock getWriteLock() {
2270     return lock.writeLock();
2271   }
2272 
2273   public int getNumberOfOnlineRegions() {
2274     return this.onlineRegions.size();
2275   }
2276 
2277   boolean isOnlineRegionsEmpty() {
2278     return this.onlineRegions.isEmpty();
2279   }
2280 
2281   /**
2282    * For tests, web ui and metrics.
2283    * This method will only work if HRegionServer is in the same JVM as client;
2284    * HRegion cannot be serialized to cross an rpc.
2285    */
2286   public Collection<HRegion> getOnlineRegionsLocalContext() {
2287     Collection<HRegion> regions = this.onlineRegions.values();
2288     return Collections.unmodifiableCollection(regions);
2289   }
2290 
2291   @Override
2292   public void addToOnlineRegions(HRegion region) {
2293     this.onlineRegions.put(region.getRegionInfo().getEncodedName(), region);
2294     configurationManager.registerObserver(region);
2295   }
2296 
2297   /**
2298    * @return A new Map of online regions sorted by region size with the first entry being the
2299    * biggest.  If two regions are the same size, then the last one found wins; i.e. this method
2300    * may NOT return all regions.
2301    */
2302   SortedMap<Long, HRegion> getCopyOfOnlineRegionsSortedBySize() {
2303     // we'll sort the regions in reverse
2304     SortedMap<Long, HRegion> sortedRegions = new TreeMap<Long, HRegion>(
2305         new Comparator<Long>() {
2306           @Override
2307           public int compare(Long a, Long b) {
2308             return -1 * a.compareTo(b);
2309           }
2310         });
2311     // Copy over all regions. Regions are sorted by size with biggest first.
2312     for (HRegion region : this.onlineRegions.values()) {
2313       sortedRegions.put(region.memstoreSize.get(), region);
2314     }
2315     return sortedRegions;
2316   }
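  /*
   * Editorial note (worked example): because the map above is keyed by memstore size, regions with
   * identical sizes collide and only the last one inserted survives. Two freshly opened regions that
   * both report memstoreSize == 0 therefore produce a single entry, which is why the Javadoc warns
   * that the result may not contain every online region.
   */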
2317 
2318   /**
2319    * @return time stamp in millis of when this region server was started
2320    */
2321   public long getStartcode() {
2322     return this.startcode;
2323   }
2324 
2325   /** @return reference to FlushRequester */
2326   @Override
2327   public FlushRequester getFlushRequester() {
2328     return this.cacheFlusher;
2329   }
2330 
2331   /**
2332    * Get the top N most loaded regions this server is serving so we can tell the
2333    * master which regions it can reallocate if we're overloaded. TODO: actually
2334    * calculate which regions are most loaded. (Right now, we're just grabbing
2335    * the first N regions being served regardless of load.)
2336    */
2337   protected HRegionInfo[] getMostLoadedRegions() {
2338     ArrayList<HRegionInfo> regions = new ArrayList<HRegionInfo>();
2339     for (HRegion r : onlineRegions.values()) {
2340       if (!r.isAvailable()) {
2341         continue;
2342       }
2343       if (regions.size() < numRegionsToReport) {
2344         regions.add(r.getRegionInfo());
2345       } else {
2346         break;
2347       }
2348     }
2349     return regions.toArray(new HRegionInfo[regions.size()]);
2350   }
2351 
2352   @Override
2353   public Leases getLeases() {
2354     return leases;
2355   }
2356 
2357   /**
2358    * @return Return the rootDir.
2359    */
2360   protected Path getRootDir() {
2361     return rootDir;
2362   }
2363 
2364   /**
2365    * @return Return the fs.
2366    */
2367   @Override
2368   public FileSystem getFileSystem() {
2369     return fs;
2370   }
2371 
2372   @Override
2373   public String toString() {
2374     return getServerName().toString();
2375   }
2376 
2377   /**
2378    * Interval at which threads should run
2379    *
2380    * @return the interval
2381    */
2382   public int getThreadWakeFrequency() {
2383     return threadWakeFrequency;
2384   }
2385 
2386   @Override
2387   public ZooKeeperWatcher getZooKeeper() {
2388     return zooKeeper;
2389   }
2390 
2391   @Override
2392   public BaseCoordinatedStateManager getCoordinatedStateManager() {
2393     return csm;
2394   }
2395 
2396   @Override
2397   public ServerName getServerName() {
2398     return serverName;
2399   }
2400 
2401   @Override
2402   public CompactionRequestor getCompactionRequester() {
2403     return this.compactSplitThread;
2404   }
2405 
2406   public RegionServerCoprocessorHost getRegionServerCoprocessorHost(){
2407     return this.rsHost;
2408   }
2409 
2410   @Override
2411   public ConcurrentMap<byte[], Boolean> getRegionsInTransitionInRS() {
2412     return this.regionsInTransitionInRS;
2413   }
2414 
2415   @Override
2416   public ExecutorService getExecutorService() {
2417     return service;
2418   }
2419 
2420   //
2421   // Main program and support routines
2422   //
2423 
2424   /**
2425    * Load the replication service objects, if any
2426    */
2427   private static void createNewReplicationInstance(Configuration conf,
2428     HRegionServer server, FileSystem fs, Path logDir, Path oldLogDir) throws IOException{
2429 
2430     // If replication is not enabled, then return immediately.
2431     if (!conf.getBoolean(HConstants.REPLICATION_ENABLE_KEY,
2432         HConstants.REPLICATION_ENABLE_DEFAULT)) {
2433       return;
2434     }
2435 
2436     // read in the name of the source replication class from the config file.
2437     String sourceClassname = conf.get(HConstants.REPLICATION_SOURCE_SERVICE_CLASSNAME,
2438                                HConstants.REPLICATION_SERVICE_CLASSNAME_DEFAULT);
2439 
2440     // read in the name of the sink replication class from the config file.
2441     String sinkClassname = conf.get(HConstants.REPLICATION_SINK_SERVICE_CLASSNAME,
2442                              HConstants.REPLICATION_SERVICE_CLASSNAME_DEFAULT);
2443 
2444     // If both the sink and the source class names are the same, then instantiate
2445     // only one object.
2446     if (sourceClassname.equals(sinkClassname)) {
2447       server.replicationSourceHandler = (ReplicationSourceService)
2448                                          newReplicationInstance(sourceClassname,
2449                                          conf, server, fs, logDir, oldLogDir);
2450       server.replicationSinkHandler = (ReplicationSinkService)
2451                                          server.replicationSourceHandler;
2452     } else {
2453       server.replicationSourceHandler = (ReplicationSourceService)
2454                                          newReplicationInstance(sourceClassname,
2455                                          conf, server, fs, logDir, oldLogDir);
2456       server.replicationSinkHandler = (ReplicationSinkService)
2457                                          newReplicationInstance(sinkClassname,
2458                                          conf, server, fs, logDir, oldLogDir);
2459     }
2460   }
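  /*
   * Editorial note (hedged configuration sketch): the class names read above make the replication
   * implementation pluggable. MyReplicationEndpointService below is hypothetical; to be shared as a
   * single instance (the equals() branch above), it would have to implement both
   * ReplicationSourceService and ReplicationSinkService.
   *
   *   conf.set(HConstants.REPLICATION_SOURCE_SERVICE_CLASSNAME,
   *       MyReplicationEndpointService.class.getName());
   *   conf.set(HConstants.REPLICATION_SINK_SERVICE_CLASSNAME,
   *       MyReplicationEndpointService.class.getName());
   */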
2461 
2462   static private ReplicationService newReplicationInstance(String classname,
2463     Configuration conf, HRegionServer server, FileSystem fs, Path logDir,
2464     Path oldLogDir) throws IOException{
2465 
2466     Class<?> clazz = null;
2467     try {
2468       ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
2469       clazz = Class.forName(classname, true, classLoader);
2470     } catch (java.lang.ClassNotFoundException nfe) {
2471       throw new IOException("Could not find class for " + classname, nfe);
2472     }
2473 
2474     // create an instance of the replication object.
2475     ReplicationService service = (ReplicationService)
2476                               ReflectionUtils.newInstance(clazz, conf);
2477     service.initialize(server, fs, logDir, oldLogDir);
2478     return service;
2479   }
2480 
2481   /**
2482    * Utility for constructing an instance of the passed HRegionServer class.
2483    *
2484    * @param regionServerClass Class of the HRegionServer implementation to construct.
2485    * @param conf2 Configuration to pass to the constructor.
2486    * @return HRegionServer instance.
2487    */
2488   public static HRegionServer constructRegionServer(
2489       Class<? extends HRegionServer> regionServerClass,
2490       final Configuration conf2, CoordinatedStateManager cp) {
2491     try {
2492       Constructor<? extends HRegionServer> c = regionServerClass
2493           .getConstructor(Configuration.class, CoordinatedStateManager.class);
2494       return c.newInstance(conf2, cp);
2495     } catch (Exception e) {
2496       throw new RuntimeException("Failed construction of RegionServer: "
2497           + regionServerClass.toString(), e);
2498     }
2499   }
2500 
2501   /**
2502    * @see org.apache.hadoop.hbase.regionserver.HRegionServerCommandLine
2503    */
2504   public static void main(String[] args) throws Exception {
2505     VersionInfo.logVersion();
2506     Configuration conf = HBaseConfiguration.create();
2507     @SuppressWarnings("unchecked")
2508     Class<? extends HRegionServer> regionServerClass = (Class<? extends HRegionServer>) conf
2509         .getClass(HConstants.REGION_SERVER_IMPL, HRegionServer.class);
2510 
2511     new HRegionServerCommandLine(regionServerClass).doMain(args);
2512   }
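  /*
   * Editorial note (hedged usage sketch): main() above resolves the server class from configuration,
   * so a subclass can be swapped in without touching this file. MyRegionServer is hypothetical and
   * must extend HRegionServer and expose the (Configuration, CoordinatedStateManager) constructor
   * that constructRegionServer() reflects on:
   *
   *   conf.setClass(HConstants.REGION_SERVER_IMPL, MyRegionServer.class, HRegionServer.class);
   */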
2513 
2514   /**
2515    * Gets the online regions of the specified table.
2516    * This method looks at the in-memory onlineRegions.  It does not go to <code>hbase:meta</code>.
2517    * Only returns <em>online</em> regions.  If a region on this table has been
2518    * closed during a disable, etc., it will not be included in the returned list.
2519    * So, the returned list may not necessarily be ALL regions in this table, it's
2520    * all the ONLINE regions in the table.
2521    * @param tableName Table to get the online regions of.
2522    * @return Online regions from <code>tableName</code>
2523    */
2524   @Override
2525   public List<HRegion> getOnlineRegions(TableName tableName) {
2526      List<HRegion> tableRegions = new ArrayList<HRegion>();
2527      synchronized (this.onlineRegions) {
2528        for (HRegion region: this.onlineRegions.values()) {
2529          HRegionInfo regionInfo = region.getRegionInfo();
2530          if(regionInfo.getTable().equals(tableName)) {
2531            tableRegions.add(region);
2532          }
2533        }
2534      }
2535      return tableRegions;
2536    }
2537 
2538   // used by org/apache/hbase/tmpl/regionserver/RSStatusTmpl.jamon (HBASE-4070).
2539   public String[] getRegionServerCoprocessors() {
2540     TreeSet<String> coprocessors = new TreeSet<String>();
2541     try {
2542       coprocessors.addAll(getWAL(null).getCoprocessorHost().getCoprocessors());
2543     } catch (IOException exception) {
2544       LOG.warn("Exception attempting to fetch wal coprocessor information for the common wal; " +
2545           "skipping.");
2546       LOG.debug("Exception details for failure to fetch wal coprocessor information.", exception);
2547     }
2548     Collection<HRegion> regions = getOnlineRegionsLocalContext();
2549     for (HRegion region: regions) {
2550       coprocessors.addAll(region.getCoprocessorHost().getCoprocessors());
2551       try {
2552         coprocessors.addAll(getWAL(region.getRegionInfo()).getCoprocessorHost().getCoprocessors());
2553       } catch (IOException exception) {
2554         LOG.warn("Exception attempting to fetch wal coprocessor information for region " + region +
2555             "; skipping.");
2556         LOG.debug("Exception details for failure to fetch wal coprocessor information.", exception);
2557       }
2558     }
2559     return coprocessors.toArray(new String[coprocessors.size()]);
2560   }
2561 
2562   /**
2563    * Try to close the region, logs a warning on failure but continues.
2564    * @param region Region to close
2565    */
2566   private void closeRegionIgnoreErrors(HRegionInfo region, final boolean abort) {
2567     try {
2568       CloseRegionCoordination.CloseRegionDetails details =
2569         csm.getCloseRegionCoordination().getDetaultDetails();
2570       if (!closeRegion(region.getEncodedName(), abort, details, null)) {
2571         LOG.warn("Failed to close " + region.getRegionNameAsString() +
2572             " - ignoring and continuing");
2573       }
2574     } catch (IOException e) {
2575       LOG.warn("Failed to close " + region.getRegionNameAsString() +
2576           " - ignoring and continuing", e);
2577     }
2578   }
2579 
2580   /**
2581    * Asynchronously close a region; can be called from the master or internally by the regionserver
2582    * when stopping. If called from the master, the region will update the znode status.
2583    *
2584    * <p>
2585    * If an opening was in progress, this method will cancel it, but will not start a new close. The
2586    * coprocessors are not called in this case. A NotServingRegionException is thrown.
2587    * </p>
2588    *
2589    * <p>
2590    *   If a close was in progress, this new request will be ignored, and an exception thrown.
2591    * </p>
2592    *
2593    * @param encodedName Region to close
2594    * @param abort True if we are aborting
2595    * @param crd details about closing region coordination-coordinated task
2596    * @return True if closed a region.
2597    * @throws NotServingRegionException if the region is not online
2598    * @throws RegionAlreadyInTransitionException if the region is already closing
2599    */
2600   protected boolean closeRegion(String encodedName, final boolean abort,
2601       CloseRegionCoordination.CloseRegionDetails crd, final ServerName sn)
2602       throws NotServingRegionException, RegionAlreadyInTransitionException {
2603     //Check for permissions to close.
2604     HRegion actualRegion = this.getFromOnlineRegions(encodedName);
2605     if ((actualRegion != null) && (actualRegion.getCoprocessorHost() != null)) {
2606       try {
2607         actualRegion.getCoprocessorHost().preClose(false);
2608       } catch (IOException exp) {
2609         LOG.warn("Unable to close region: the coprocessor launched an error ", exp);
2610         return false;
2611       }
2612     }
2613 
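    // In regionsInTransitionInRS, Boolean.TRUE marks a region that is currently opening and
    // Boolean.FALSE marks one that is closing; putIfAbsent below claims the "closing" slot.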
2614     final Boolean previous = this.regionsInTransitionInRS.putIfAbsent(encodedName.getBytes(),
2615         Boolean.FALSE);
2616 
2617     if (Boolean.TRUE.equals(previous)) {
2618       LOG.info("Received CLOSE for the region:" + encodedName + " , which we are already " +
2619           "trying to OPEN. Cancelling OPENING.");
2620       if (!regionsInTransitionInRS.replace(encodedName.getBytes(), previous, Boolean.FALSE)){
2621         // The replace failed. That should be an exceptional case, but theoretically it can happen.
2622         // We're going to try to do a standard close then.
2623         LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." +
2624             " Doing a standard close now");
2625         return closeRegion(encodedName, abort, crd, sn);
2626       }
2627       // Let's get the region from the online region list again
2628       actualRegion = this.getFromOnlineRegions(encodedName);
2629       if (actualRegion == null) { // If already online, we still need to close it.
2630         LOG.info("The opening previously in progress has been cancelled by a CLOSE request.");
2631         // The master deletes the znode when it receives this exception.
2632         throw new RegionAlreadyInTransitionException("The region " + encodedName +
2633           " was opening but not yet served. Opening is cancelled.");
2634       }
2635     } else if (Boolean.FALSE.equals(previous)) {
2636       LOG.info("Received CLOSE for the region: " + encodedName +
2637         ", which we are already trying to CLOSE, but not completed yet");
2638       // The master will retry till the region is closed. We need to do this since
2639       // the region could fail to close somehow. If we mark the region closed in master
2640       // while it is not, there could be data loss.
2641       // If the region stuck in closing for a while, and master runs out of retries,
2642       // master will move the region to failed_to_close. Later on, if the region
2643       // is indeed closed, master can properly re-assign it.
2644       throw new RegionAlreadyInTransitionException("The region " + encodedName +
2645         " was already closing. New CLOSE request is ignored.");
2646     }
2647 
2648     if (actualRegion == null) {
2649       LOG.error("Received CLOSE for a region which is not online, and we're not opening.");
2650       this.regionsInTransitionInRS.remove(encodedName.getBytes());
2651       // The master deletes the znode when it receives this exception.
2652       throw new NotServingRegionException("The region " + encodedName +
2653           " is not online, and is not opening.");
2654     }
2655 
2656     CloseRegionHandler crh;
2657     final HRegionInfo hri = actualRegion.getRegionInfo();
2658     if (hri.isMetaRegion()) {
2659       crh = new CloseMetaHandler(this, this, hri, abort,
2660         csm.getCloseRegionCoordination(), crd);
2661     } else {
2662       crh = new CloseRegionHandler(this, this, hri, abort,
2663         csm.getCloseRegionCoordination(), crd, sn);
2664     }
2665     this.service.submit(crh);
2666     return true;
2667   }
2668 
2669   /**
2670    * @param regionName Binary name of the region to look up.
2671    * @return HRegion for the passed binary <code>regionName</code> or null if
2672    *         named region is not a member of the online regions.
2673    */
2674   public HRegion getOnlineRegion(final byte[] regionName) {
2675     String encodedRegionName = HRegionInfo.encodeRegionName(regionName);
2676     return this.onlineRegions.get(encodedRegionName);
2677   }
2678 
2679   public InetSocketAddress[] getRegionBlockLocations(final String encodedRegionName) {
2680     return this.regionFavoredNodesMap.get(encodedRegionName);
2681   }
2682 
2683   @Override
2684   public HRegion getFromOnlineRegions(final String encodedRegionName) {
2685     return this.onlineRegions.get(encodedRegionName);
2686   }
2687 
2688 
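       /**
        * Remove the region from the online list. If a destination server is given, the close
        * sequence number (taken from the WAL, or the open sequence number if there are no edits)
        * is recorded in the moved regions map so clients can be redirected.
        */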
2689   @Override
2690   public boolean removeFromOnlineRegions(final HRegion r, ServerName destination) {
2691     HRegion toReturn = this.onlineRegions.remove(r.getRegionInfo().getEncodedName());
2692 
2693     if (destination != null) {
2694       try {
2695         WAL wal = getWAL(r.getRegionInfo());
2696         long closeSeqNum = wal.getEarliestMemstoreSeqNum(r.getRegionInfo().getEncodedNameAsBytes());
2697         if (closeSeqNum == HConstants.NO_SEQNUM) {
2698           // No edits in WAL for this region; get the sequence number when the region was opened.
2699           closeSeqNum = r.getOpenSeqNum();
2700           if (closeSeqNum == HConstants.NO_SEQNUM) {
2701             closeSeqNum = 0;
2702           }
2703         }
2704         addToMovedRegions(r.getRegionInfo().getEncodedName(), destination, closeSeqNum);
2705       } catch (IOException exception) {
2706         LOG.error("Could not retrieve WAL information for region " + r.getRegionInfo() +
2707             "; not adding to moved regions.");
2708         LOG.debug("Exception details for failure to get wal", exception);
2709       }
2710     }
2711     this.regionFavoredNodesMap.remove(r.getRegionInfo().getEncodedName());
2712     return toReturn != null;
2713   }
2714 
2715   /**
2716    * Protected utility method for safely obtaining an HRegion handle.
2717    *
2718    * @param regionName
2719    *          Name of online {@link HRegion} to return
2720    * @return {@link HRegion} for <code>regionName</code>
2721    * @throws NotServingRegionException
2722    */
2723   protected HRegion getRegion(final byte[] regionName)
2724       throws NotServingRegionException {
2725     String encodedRegionName = HRegionInfo.encodeRegionName(regionName);
2726     return getRegionByEncodedName(regionName, encodedRegionName);
2727   }
2728 
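       /**
        * @param encodedRegionName the encoded name of the region to fetch
        * @return the online {@link HRegion} for <code>encodedRegionName</code>
        * @throws NotServingRegionException if the region is not online on this server
        */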
2729   public HRegion getRegionByEncodedName(String encodedRegionName)
2730       throws NotServingRegionException {
2731     return getRegionByEncodedName(null, encodedRegionName);
2732   }
2733 
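       /**
        * Look up an online region by its encoded name. If the region is not online, throw an
        * exception that tells the client why: {@link RegionMovedException} if we recently moved it,
        * {@link RegionOpeningException} if it is still opening, otherwise
        * {@link NotServingRegionException}.
        */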
2734   protected HRegion getRegionByEncodedName(byte[] regionName, String encodedRegionName)
2735     throws NotServingRegionException {
2736     HRegion region = this.onlineRegions.get(encodedRegionName);
2737     if (region == null) {
2738       MovedRegionInfo moveInfo = getMovedRegion(encodedRegionName);
2739       if (moveInfo != null) {
2740         throw new RegionMovedException(moveInfo.getServerName(), moveInfo.getSeqNum());
2741       }
2742       Boolean isOpening = this.regionsInTransitionInRS.get(Bytes.toBytes(encodedRegionName));
2743       String regionNameStr = regionName == null ?
2744         encodedRegionName : Bytes.toStringBinary(regionName);
2745       if (isOpening != null && isOpening.booleanValue()) {
2746         throw new RegionOpeningException("Region " + regionNameStr +
2747           " is opening on " + this.serverName);
2748       }
2749       throw new NotServingRegionException("Region " + regionNameStr +
2750         " is not online on " + this.serverName);
2751     }
2752     return region;
2753   }
2754 
2755   /*
2756    * Cleanup after Throwable caught invoking method. Logs the Throwable, checks for
2757    * OOME and verifies that the file system is still available.
2758    *
2759    * @param t Throwable
2760    *
2761    * @param msg Message to log in error. Can be null.
2762    *
2763    * @return The passed <code>t</code> unchanged; callers convert it to an IOE since methods can only let out IOEs.
2764    */
2765   private Throwable cleanup(final Throwable t, final String msg) {
2766     // Don't log as error if NSRE; NSRE is 'normal' operation.
2767     if (t instanceof NotServingRegionException) {
2768       LOG.debug("NotServingRegionException; " + t.getMessage());
2769       return t;
2770     }
2771     if (msg == null) {
2772       LOG.error("", RemoteExceptionHandler.checkThrowable(t));
2773     } else {
2774       LOG.error(msg, RemoteExceptionHandler.checkThrowable(t));
2775     }
2776     if (!rpcServices.checkOOME(t)) {
2777       checkFileSystem();
2778     }
2779     return t;
2780   }
2781 
2782   /*
2783    * @param t Throwable to convert
2784    *
2785    * @param msg Message to put in new IOE if passed <code>t</code> is not an IOE
2786    *
2787    * @return Make <code>t</code> an IOE if it isn't already.
2788    */
2789   protected IOException convertThrowableToIOE(final Throwable t, final String msg) {
2790     return (t instanceof IOException ? (IOException) t : msg == null
2791         || msg.length() == 0 ? new IOException(t) : new IOException(msg, t));
2792   }
2793 
2794   /**
2795    * Checks to see if the file system is still accessible. If not, sets
2796    * abortRequested and stopRequested
2797    *
2798    * @return false if file system is not available
2799    */
2800   public boolean checkFileSystem() {
2801     if (this.fsOk && this.fs != null) {
2802       try {
2803         FSUtils.checkFileSystemAvailable(this.fs);
2804       } catch (IOException e) {
2805         abort("File System not available", e);
2806         this.fsOk = false;
2807       }
2808     }
2809     return this.fsOk;
2810   }
2811 
2812   @Override
2813   public void updateRegionFavoredNodesMapping(String encodedRegionName,
2814       List<org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.ServerName> favoredNodes) {
2815     InetSocketAddress[] addr = new InetSocketAddress[favoredNodes.size()];
2816     // Refer to the comment on the declaration of regionFavoredNodesMap on why
2817     // it is a map of region name to InetSocketAddress[]
2818     for (int i = 0; i < favoredNodes.size(); i++) {
2819       addr[i] = InetSocketAddress.createUnresolved(favoredNodes.get(i).getHostName(),
2820           favoredNodes.get(i).getPort());
2821     }
2822     regionFavoredNodesMap.put(encodedRegionName, addr);
2823   }
2824 
2825   /**
2826    * Return the favored nodes for a region given its encoded name. Look at the
2827    * comment around {@link #regionFavoredNodesMap} for why it is an InetSocketAddress[].
2828    * @param encodedRegionName the encoded name of the region
2829    * @return array of favored locations
2830    */
2831   @Override
2832   public InetSocketAddress[] getFavoredNodesForRegion(String encodedRegionName) {
2833     return regionFavoredNodesMap.get(encodedRegionName);
2834   }
2835 
2836   @Override
2837   public ServerNonceManager getNonceManager() {
2838     return this.nonceManager;
2839   }
2840 
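       /**
        * Destination server, close sequence number and move time for a region that this server
        * closed because of a move. Used to answer clients with a {@link RegionMovedException}.
        */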
2841   private static class MovedRegionInfo {
2842     private final ServerName serverName;
2843     private final long seqNum;
2844     private final long ts;
2845 
2846     public MovedRegionInfo(ServerName serverName, long closeSeqNum) {
2847       this.serverName = serverName;
2848       this.seqNum = closeSeqNum;
2849       ts = EnvironmentEdgeManager.currentTime();
2850     }
2851 
2852     public ServerName getServerName() {
2853       return serverName;
2854     }
2855 
2856     public long getSeqNum() {
2857       return seqNum;
2858     }
2859 
2860     public long getMoveTime() {
2861       return ts;
2862     }
2863   }
2864 
2865   // This map contains all the regions that we closed because of a move.
2866   //  We record the time of the move so that we do not keep stale entries around.
2867   protected Map<String, MovedRegionInfo> movedRegions =
2868       new ConcurrentHashMap<String, MovedRegionInfo>(3000);
2869 
2870   // We need a timeout. Without one we risk handing out stale information, which would double
2871   //  the number of network calls instead of reducing them.
2872   private static final int TIMEOUT_REGION_MOVED = (2 * 60 * 1000);
2873 
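       /**
        * Record that a region was closed because it is moving to <code>destination</code>.
        * Does nothing if the destination is this server itself.
        */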
2874   protected void addToMovedRegions(String encodedName, ServerName destination, long closeSeqNum) {
2875     if (ServerName.isSameHostnameAndPort(destination, this.getServerName())) {
2876       LOG.warn("Not adding moved region record: " + encodedName + " to self.");
2877       return;
2878     }
2879     LOG.info("Adding moved region record: "
2880       + encodedName + " to " + destination + " as of " + closeSeqNum);
2881     movedRegions.put(encodedName, new MovedRegionInfo(destination, closeSeqNum));
2882   }
2883 
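       /** Forget that the given region was moved away from this server. */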
2884   void removeFromMovedRegions(String encodedName) {
2885     movedRegions.remove(encodedName);
2886   }
2887 
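       /**
        * @return the move record for the region if it is still within {@link #TIMEOUT_REGION_MOVED},
        *         otherwise null (and the expired record is dropped).
        */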
2888   private MovedRegionInfo getMovedRegion(final String encodedRegionName) {
2889     MovedRegionInfo dest = movedRegions.get(encodedRegionName);
2890 
2891     long now = EnvironmentEdgeManager.currentTime();
2892     if (dest != null) {
2893       if (dest.getMoveTime() > (now - TIMEOUT_REGION_MOVED)) {
2894         return dest;
2895       } else {
2896         movedRegions.remove(encodedRegionName);
2897       }
2898     }
2899 
2900     return null;
2901   }
2902 
2903   /**
2904    * Remove the expired entries from the moved regions list.
2905    */
2906   protected void cleanMovedRegions() {
2907     final long cutOff = EnvironmentEdgeManager.currentTime() - TIMEOUT_REGION_MOVED;
2908     Iterator<Entry<String, MovedRegionInfo>> it = movedRegions.entrySet().iterator();
2909 
2910     while (it.hasNext()){
2911       Map.Entry<String, MovedRegionInfo> e = it.next();
2912       if (e.getValue().getMoveTime() < cutOff) {
2913         it.remove();
2914       }
2915     }
2916   }
2917 
2918   /**
2919    * Chore that periodically cleans up the moved regions cache.
2920    */
2921   protected static class MovedRegionsCleaner extends Chore implements Stoppable {
2922     private HRegionServer regionServer;
2923     Stoppable stoppable;
2924 
2925     private MovedRegionsCleaner(
2926       HRegionServer regionServer, Stoppable stoppable) {
2927       super("MovedRegionsCleaner for region " + regionServer, TIMEOUT_REGION_MOVED, stoppable);
2928       this.regionServer = regionServer;
2929       this.stoppable = stoppable;
2930     }
2931 
2932     static MovedRegionsCleaner createAndStart(HRegionServer rs){
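           // Give the cleaner its own Stoppable so it can be stopped independently of the
           // region server; stop(String) below simply flips this flag.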
2933       Stoppable stoppable = new Stoppable() {
2934         private volatile boolean isStopped = false;
2935         @Override public void stop(String why) { isStopped = true;}
2936         @Override public boolean isStopped() {return isStopped;}
2937       };
2938 
2939       return new MovedRegionsCleaner(rs, stoppable);
2940     }
2941 
2942     @Override
2943     protected void chore() {
2944       regionServer.cleanMovedRegions();
2945     }
2946 
2947     @Override
2948     public void stop(String why) {
2949       stoppable.stop(why);
2950     }
2951 
2952     @Override
2953     public boolean isStopped() {
2954       return stoppable.isStopped();
2955     }
2956   }
2957 
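       /** @return the path of this region server's ephemeral znode under the rs znode. */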
2958   private String getMyEphemeralNodePath() {
2959     return ZKUtil.joinZNode(this.zooKeeper.rsZNode, getServerName().toString());
2960   }
2961 
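       /** @return true if a health check script location has been configured. */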
2962   private boolean isHealthCheckerConfigured() {
2963     String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
2964     return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
2965   }
2966 
2967   /**
2968    * @return the underlying {@link CompactSplitThread} for the server
2969    */
2970   public CompactSplitThread getCompactSplitThread() {
2971     return this.compactSplitThread;
2972   }
2973 
2974   /**
2975    * A helper function to store the last flushed sequence id with the previous failed RS for a
2976    * recovering region. The id is used to skip WAL edits which have already been flushed. Since a
2977    * flushed sequence id is only valid per RS, we associate the id with the corresponding failed RS.
2978    * @throws KeeperException
2979    * @throws IOException
2980    */
2981   private void updateRecoveringRegionLastFlushedSequenceId(HRegion r) throws KeeperException,
2982       IOException {
2983     if (!r.isRecovering()) {
2984       // return immediately for non-recovering regions
2985       return;
2986     }
2987 
2988     HRegionInfo region = r.getRegionInfo();
2989     ZooKeeperWatcher zkw = getZooKeeper();
2990     String previousRSName = this.getLastFailedRSFromZK(region.getEncodedName());
2991     Map<byte[], Long> maxSeqIdInStores = r.getMaxStoreSeqIdForLogReplay();
2992     long minSeqIdForLogReplay = -1;
2993     for (Long storeSeqIdForReplay : maxSeqIdInStores.values()) {
2994       if (minSeqIdForLogReplay == -1 || storeSeqIdForReplay < minSeqIdForLogReplay) {
2995         minSeqIdForLogReplay = storeSeqIdForReplay;
2996       }
2997     }
2998 
2999     try {
3000       long lastRecordedFlushedSequenceId = -1;
3001       String nodePath = ZKUtil.joinZNode(this.zooKeeper.recoveringRegionsZNode,
3002         region.getEncodedName());
3003       // recovering-region level
3004       byte[] data;
3005       try {
3006         data = ZKUtil.getData(zkw, nodePath);
3007       } catch (InterruptedException e) {
3008         throw new InterruptedIOException();
3009       }
3010       if (data != null) {
3011         lastRecordedFlushedSequenceId = ZKSplitLog.parseLastFlushedSequenceIdFrom(data);
3012       }
3013       if (data == null || lastRecordedFlushedSequenceId < minSeqIdForLogReplay) {
3014         ZKUtil.setData(zkw, nodePath, ZKUtil.positionToByteArray(minSeqIdForLogReplay));
3015       }
3016       if (previousRSName != null) {
3017         // one level deeper for the failed RS
3018         nodePath = ZKUtil.joinZNode(nodePath, previousRSName);
3019         ZKUtil.setData(zkw, nodePath,
3020           ZKUtil.regionSequenceIdsToByteArray(minSeqIdForLogReplay, maxSeqIdInStores));
3021         LOG.debug("Update last flushed sequence id of region " + region.getEncodedName() + " for "
3022             + previousRSName);
3023       } else {
3024         LOG.warn("Can't find failed region server for recovering region " +
3025           region.getEncodedName());
3026       }
3027     } catch (NoNodeException ignore) {
3028       LOG.debug("Region " + region.getEncodedName() +
3029         " must have completed recovery because its recovery znode has been removed", ignore);
3030     }
3031   }
3032 
3033   /**
3034    * Return the name of the last failed RS under /hbase/recovering-regions/encodedRegionName.
3035    * @param encodedRegionName the encoded name of the recovering region
3036    * @throws KeeperException
3037    */
3038   private String getLastFailedRSFromZK(String encodedRegionName) throws KeeperException {
3039     String result = null;
3040     long maxZxid = 0;
3041     ZooKeeperWatcher zkw = this.getZooKeeper();
3042     String nodePath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, encodedRegionName);
3043     List<String> failedServers = ZKUtil.listChildrenNoWatch(zkw, nodePath);
3044     if (failedServers == null || failedServers.isEmpty()) {
3045       return result;
3046     }
3047     for (String failedServer : failedServers) {
3048       String rsPath = ZKUtil.joinZNode(nodePath, failedServer);
3049       Stat stat = new Stat();
3050       ZKUtil.getDataNoWatch(zkw, rsPath, stat);
3051       if (maxZxid < stat.getCzxid()) {
3052         maxZxid = stat.getCzxid();
3053         result = failedServer;
3054       }
3055     }
3056     return result;
3057   }
3058 
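       /**
        * Execute a region-server-level coprocessor endpoint call. Looks up the registered
        * {@link Service} by name, dispatches the requested method and wraps the result in a
        * {@link CoprocessorServiceResponse}.
        * @throws ServiceException if the service or method is unknown, or the call fails
        */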
3059   public CoprocessorServiceResponse execRegionServerService(final RpcController controller,
3060       final CoprocessorServiceRequest serviceRequest) throws ServiceException {
3061     try {
3062       ServerRpcController execController = new ServerRpcController();
3063       CoprocessorServiceCall call = serviceRequest.getCall();
3064       String serviceName = call.getServiceName();
3065       String methodName = call.getMethodName();
3066       if (!coprocessorServiceHandlers.containsKey(serviceName)) {
3067         throw new UnknownProtocolException(null,
3068             "No registered coprocessor service found for name " + serviceName);
3069       }
3070       Service service = coprocessorServiceHandlers.get(serviceName);
3071       Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
3072       Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
3073       if (methodDesc == null) {
3074         throw new UnknownProtocolException(service.getClass(), "Unknown method " + methodName
3075             + " called on service " + serviceName);
3076       }
3077       Message request =
3078           service.getRequestPrototype(methodDesc).newBuilderForType().mergeFrom(call.getRequest())
3079               .build();
3080       final Message.Builder responseBuilder =
3081           service.getResponsePrototype(methodDesc).newBuilderForType();
3082       service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
3083         @Override
3084         public void run(Message message) {
3085           if (message != null) {
3086             responseBuilder.mergeFrom(message);
3087           }
3088         }
3089       });
3090       Message execResult = responseBuilder.build();
3091       if (execController.getFailedOn() != null) {
3092         throw execController.getFailedOn();
3093       }
3094       ClientProtos.CoprocessorServiceResponse.Builder builder =
3095           ClientProtos.CoprocessorServiceResponse.newBuilder();
3096       builder.setRegion(RequestConverter.buildRegionSpecifier(RegionSpecifierType.REGION_NAME,
3097         HConstants.EMPTY_BYTE_ARRAY));
3098       builder.setValue(builder.getValueBuilder().setName(execResult.getClass().getName())
3099           .setValue(execResult.toByteString()));
3100       return builder.build();
3101     } catch (IOException ie) {
3102       throw new ServiceException(ie);
3103     }
3104   }
3105 
3106   /**
3107    * @return The cache config instance used by the regionserver.
3108    */
3109   public CacheConfig getCacheConfig() {
3110     return this.cacheConfig;
3111   }
3112 
3113   /**
3114    * @return the ConfigurationManager object, exposed for testing purposes.
3115    */
3116   protected ConfigurationManager getConfigurationManager() {
3117     return configurationManager;
3118   }
3119 
3120   /**
3121    * Reload the configuration from disk.
3122    */
3123   public void updateConfiguration() {
3124     LOG.info("Reloading the configuration from disk.");
3125     // Reload the configuration from disk.
3126     conf.reloadConfiguration();
3127     configurationManager.notifyAllObservers(conf);
3128   }
3129 
3130   @Override
3131   public HeapMemoryManager getHeapMemoryManager() {
3132     return hMemManager;
3133   }
3134 }