1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.regionserver.handler;
20
21 import java.io.IOException;
22 import java.util.Map;
23 import java.util.concurrent.atomic.AtomicBoolean;
24
25 import org.apache.commons.logging.Log;
26 import org.apache.commons.logging.LogFactory;
27 import org.apache.hadoop.classification.InterfaceAudience;
28 import org.apache.hadoop.hbase.HRegionInfo;
29 import org.apache.hadoop.hbase.HTableDescriptor;
30 import org.apache.hadoop.hbase.Server;
31 import org.apache.hadoop.hbase.executor.EventHandler;
32 import org.apache.hadoop.hbase.executor.EventType;
33 import org.apache.hadoop.hbase.master.AssignmentManager;
34 import org.apache.hadoop.hbase.regionserver.HRegion;
35 import org.apache.hadoop.hbase.regionserver.RegionServerAccounting;
36 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
37 import org.apache.hadoop.hbase.util.CancelableProgressable;
38 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
39 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
40 import org.apache.zookeeper.KeeperException;
41
42
43
44
45
46 @InterfaceAudience.Private
47 public class OpenRegionHandler extends EventHandler {
48 private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);
49
50 protected final RegionServerServices rsServices;
51
52 private final HRegionInfo regionInfo;
53 private final HTableDescriptor htd;
54
55 private boolean tomActivated;
56 private int assignmentTimeout;
57
58
59
60
61 private volatile int version = -1;
62
63 private volatile int versionOfOfflineNode = -1;
64
/**
 * Creates a handler for event type {@code M_RS_OPEN_REGION} with an expected
 * OFFLINE-node version of {@code -1}.
 *
 * @param server the hosting server
 * @param rsServices region server services
 * @param regionInfo the region to open
 * @param htd the descriptor of the region's table
 */
public OpenRegionHandler(final Server server, final RegionServerServices rsServices,
    HRegionInfo regionInfo, HTableDescriptor htd) {
  this(server, rsServices, regionInfo, htd, EventType.M_RS_OPEN_REGION, -1);
}
/**
 * Creates a handler for event type {@code M_RS_OPEN_REGION} that expects the
 * OFFLINE znode to be at the given version.
 *
 * @param server the hosting server
 * @param rsServices region server services
 * @param regionInfo the region to open
 * @param htd the descriptor of the region's table
 * @param versionOfOfflineNode expected version of the OFFLINE znode
 */
public OpenRegionHandler(final Server server, final RegionServerServices rsServices,
    HRegionInfo regionInfo, HTableDescriptor htd, int versionOfOfflineNode) {
  this(server, rsServices, regionInfo, htd, EventType.M_RS_OPEN_REGION, versionOfOfflineNode);
}
76
/**
 * Full constructor; the public constructors delegate here. Stores the
 * collaborators and reads the assignment-timeout settings from the server
 * configuration.
 *
 * @param server the hosting server
 * @param rsServices region server services
 * @param regionInfo the region to open
 * @param htd the descriptor of the region's table
 * @param eventType the event type passed to the superclass
 * @param versionOfOfflineNode expected version of the OFFLINE znode
 */
protected OpenRegionHandler(final Server server, final RegionServerServices rsServices,
    final HRegionInfo regionInfo, final HTableDescriptor htd, EventType eventType,
    final int versionOfOfflineNode) {
  super(server, eventType);
  this.rsServices = rsServices;
  this.regionInfo = regionInfo;
  this.htd = htd;
  this.versionOfOfflineNode = versionOfOfflineNode;
  this.tomActivated = this.server.getConfiguration().getBoolean(
      AssignmentManager.ASSIGNMENT_TIMEOUT_MANAGEMENT,
      AssignmentManager.DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT);
  this.assignmentTimeout = this.server.getConfiguration().getInt(
      AssignmentManager.ASSIGNMENT_TIMEOUT,
      AssignmentManager.DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT);
}
93
94 public HRegionInfo getRegionInfo() {
95 return regionInfo;
96 }
97
98 @Override
99 public void process() throws IOException {
100 boolean openSuccessful = false;
101 boolean transitionedToOpening = false;
102 final String regionName = regionInfo.getRegionNameAsString();
103 HRegion region = null;
104
105 try {
106 if (this.server.isStopped() || this.rsServices.isStopping()) {
107 return;
108 }
109 final String encodedName = regionInfo.getEncodedName();
110
111
112
113
114
115
116
117 if (this.rsServices.getFromOnlineRegions(encodedName) != null) {
118 LOG.error("Region " + encodedName +
119 " was already online when we started processing the opening. " +
120 "Marking this new attempt as failed");
121 return;
122 }
123
124
125
126
127 if (!isRegionStillOpening()){
128 LOG.error("Region " + encodedName + " opening cancelled");
129 return;
130 }
131
132 if (!transitionZookeeperOfflineToOpening(encodedName, versionOfOfflineNode)) {
133 LOG.warn("Region was hijacked? Opening cancelled for encodedName=" + encodedName);
134
135 return;
136 }
137 transitionedToOpening = true;
138
139
140 region = openRegion();
141 if (region == null) {
142 return;
143 }
144
145
146 region.setRecovering(false);
147 Map<String, HRegion> recoveringRegions = this.rsServices.getRecoveringRegions();
148 if (recoveringRegions != null && !recoveringRegions.isEmpty()
149 && recoveringRegions.containsKey(region.getRegionInfo().getEncodedName())) {
150 region.setRecovering(true);
151 recoveringRegions.put(region.getRegionInfo().getEncodedName(), region);
152 }
153
154 boolean failed = true;
155 if (tickleOpening("post_region_open")) {
156 if (updateMeta(region)) {
157 failed = false;
158 }
159 }
160 if (failed || this.server.isStopped() ||
161 this.rsServices.isStopping()) {
162 return;
163 }
164
165
166 if (!isRegionStillOpening() || !transitionToOpened(region)) {
167
168
169
170
171
172 return;
173 }
174
175
176
177
178
179
180
181
182
183
184
185 this.rsServices.addToOnlineRegions(region);
186 openSuccessful = true;
187
188
189 LOG.debug("Opened " + regionName + " on " +
190 this.server.getServerName());
191
192
193 } finally {
194
195 if (!openSuccessful) {
196 doCleanUpOnFailedOpen(region, transitionedToOpening);
197 }
198 final Boolean current = this.rsServices.getRegionsInTransitionInRS().
199 remove(this.regionInfo.getEncodedNameAsBytes());
200
201
202
203
204
205
206
207
208 if (openSuccessful) {
209 if (current == null) {
210 LOG.error("Bad state: we've just opened a region that was NOT in transition. Region="
211 + regionName);
212 } else if (Boolean.FALSE.equals(current)) {
213
214 LOG.error("Race condition: we've finished to open a region, while a close was requested "
215 + " on region=" + regionName + ". It can be a critical error, as a region that"
216 + " should be closed is now opened. Closing it now");
217 cleanupFailedOpen(region);
218 }
219 }
220 }
221 }
222
223 private void doCleanUpOnFailedOpen(HRegion region, boolean transitionedToOpening)
224 throws IOException {
225 if (transitionedToOpening) {
226 try {
227 if (region != null) {
228 cleanupFailedOpen(region);
229 }
230 } finally {
231
232
233 tryTransitionFromOpeningToFailedOpen(regionInfo);
234 }
235 } else {
236
237
238 tryTransitionFromOfflineToFailedOpen(this.rsServices, regionInfo, versionOfOfflineNode);
239 }
240 }
241
242
243
244
245
246
247
248
249 boolean updateMeta(final HRegion r) {
250 if (this.server.isStopped() || this.rsServices.isStopping()) {
251 return false;
252 }
253
254
255 final AtomicBoolean signaller = new AtomicBoolean(false);
256 PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
257 this.server, this.rsServices, signaller);
258 t.start();
259
260
261 long timeout = assignmentTimeout * 10;
262 long now = System.currentTimeMillis();
263 long endTime = now + timeout;
264
265
266 long period = Math.max(1, assignmentTimeout/ 3);
267 long lastUpdate = now;
268 boolean tickleOpening = true;
269 while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
270 !this.rsServices.isStopping() && (endTime > now)) {
271 long elapsed = now - lastUpdate;
272 if (elapsed > period) {
273
274 lastUpdate = now;
275 tickleOpening = tickleOpening("post_open_deploy");
276 }
277 synchronized (signaller) {
278 try {
279 signaller.wait(period);
280 } catch (InterruptedException e) {
281
282 }
283 }
284 now = System.currentTimeMillis();
285 }
286
287
288 if (t.isAlive()) {
289 if (!signaller.get()) {
290
291 LOG.debug("Interrupting thread " + t);
292 t.interrupt();
293 }
294 try {
295 t.join();
296 } catch (InterruptedException ie) {
297 LOG.warn("Interrupted joining " +
298 r.getRegionInfo().getRegionNameAsString(), ie);
299 Thread.currentThread().interrupt();
300 }
301 }
302
303
304
305
306 return ((!Thread.interrupted() && t.getException() == null) && tickleOpening);
307 }
308
309
310
311
312
313
314
315
316 static class PostOpenDeployTasksThread extends Thread {
317 private Exception exception = null;
318 private final Server server;
319 private final RegionServerServices services;
320 private final HRegion region;
321 private final AtomicBoolean signaller;
322
323 PostOpenDeployTasksThread(final HRegion region, final Server server,
324 final RegionServerServices services, final AtomicBoolean signaller) {
325 super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
326 this.setDaemon(true);
327 this.server = server;
328 this.services = services;
329 this.region = region;
330 this.signaller = signaller;
331 }
332
333 public void run() {
334 try {
335 this.services.postOpenDeployTasks(this.region,
336 this.server.getCatalogTracker());
337 } catch (KeeperException e) {
338 server.abort("Exception running postOpenDeployTasks; region=" +
339 this.region.getRegionInfo().getEncodedName(), e);
340 } catch (Exception e) {
341 LOG.warn("Exception running postOpenDeployTasks; region=" +
342 this.region.getRegionInfo().getEncodedName(), e);
343 this.exception = e;
344 }
345
346 this.signaller.set(true);
347 synchronized (this.signaller) {
348 this.signaller.notify();
349 }
350 }
351
352
353
354
355 Exception getException() {
356 return this.exception;
357 }
358 }
359
360
361
362
363
364
365
366 boolean transitionToOpened(final HRegion r) throws IOException {
367 boolean result = false;
368 HRegionInfo hri = r.getRegionInfo();
369 final String name = hri.getRegionNameAsString();
370
371 try {
372 if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), hri,
373 this.server.getServerName(), this.version) == -1) {
374 String warnMsg = "Completed the OPEN of region " + name +
375 " but when transitioning from " + " OPENING to OPENED ";
376 try {
377 String node = ZKAssign.getNodeName(this.server.getZooKeeper(), hri.getEncodedName());
378 if (ZKUtil.checkExists(this.server.getZooKeeper(), node) < 0) {
379
380 rsServices.abort(warnMsg + "the znode disappeared", null);
381 } else {
382 LOG.warn(warnMsg + "got a version mismatch, someone else clashed; " +
383 "so now unassigning -- closing region on server: " + this.server.getServerName());
384 }
385 } catch (KeeperException ke) {
386 rsServices.abort(warnMsg, ke);
387 }
388 } else {
389 LOG.debug("Transitioned " + r.getRegionInfo().getEncodedName() +
390 " to OPENED in zk on " + this.server.getServerName());
391 result = true;
392 }
393 } catch (KeeperException e) {
394 LOG.error("Failed transitioning node " + name +
395 " from OPENING to OPENED -- closing region", e);
396 }
397 return result;
398 }
399
400
401
402
403
404
405 private boolean tryTransitionFromOpeningToFailedOpen(final HRegionInfo hri) {
406 boolean result = false;
407 final String name = hri.getRegionNameAsString();
408 try {
409 LOG.info("Opening of region " + hri + " failed, transitioning" +
410 " from OPENING to FAILED_OPEN in ZK, expecting version " + this.version);
411 if (ZKAssign.transitionNode(
412 this.server.getZooKeeper(), hri,
413 this.server.getServerName(),
414 EventType.RS_ZK_REGION_OPENING,
415 EventType.RS_ZK_REGION_FAILED_OPEN,
416 this.version) == -1) {
417 LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
418 "It's likely that the master already timed out this open " +
419 "attempt, and thus another RS already has the region.");
420 } else {
421 result = true;
422 }
423 } catch (KeeperException e) {
424 LOG.error("Failed transitioning node " + name +
425 " from OPENING to FAILED_OPEN", e);
426 }
427 return result;
428 }
429
430
431
432
433
434
435
436
437
438
439
440
441 public static boolean tryTransitionFromOfflineToFailedOpen(RegionServerServices rsServices,
442 final HRegionInfo hri, final int versionOfOfflineNode) {
443 boolean result = false;
444 final String name = hri.getRegionNameAsString();
445 try {
446 LOG.info("Opening of region " + hri + " failed, transitioning" +
447 " from OFFLINE to FAILED_OPEN in ZK, expecting version " + versionOfOfflineNode);
448 if (ZKAssign.transitionNode(
449 rsServices.getZooKeeper(), hri,
450 rsServices.getServerName(),
451 EventType.M_ZK_REGION_OFFLINE,
452 EventType.RS_ZK_REGION_FAILED_OPEN,
453 versionOfOfflineNode) == -1) {
454 LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
455 "It's likely that the master already timed out this open " +
456 "attempt, and thus another RS already has the region.");
457 } else {
458 result = true;
459 }
460 } catch (KeeperException e) {
461 LOG.error("Failed transitioning node " + name + " from OFFLINE to FAILED_OPEN", e);
462 }
463 return result;
464 }
465
466
467
468
469
470 HRegion openRegion() {
471 HRegion region = null;
472 try {
473
474
475 region = HRegion.openHRegion(this.regionInfo, this.htd,
476 this.rsServices.getWAL(this.regionInfo),
477 this.server.getConfiguration(),
478 this.rsServices,
479 new CancelableProgressable() {
480 public boolean progress() {
481
482
483
484 return tickleOpening("open_region_progress");
485 }
486 });
487 } catch (Throwable t) {
488
489
490
491 LOG.error(
492 "Failed open of region=" + this.regionInfo.getRegionNameAsString()
493 + ", starting to roll back the global memstore size.", t);
494
495 if (this.rsServices != null) {
496 RegionServerAccounting rsAccounting =
497 this.rsServices.getRegionServerAccounting();
498 if (rsAccounting != null) {
499 rsAccounting.rollbackRegionReplayEditsSize(this.regionInfo.getRegionName());
500 }
501 }
502 }
503 return region;
504 }
505
506 void cleanupFailedOpen(final HRegion region) throws IOException {
507 if (region != null) {
508 this.rsServices.removeFromOnlineRegions(region, null);
509 region.close();
510 }
511 }
512
513 private boolean isRegionStillOpening() {
514 byte[] encodedName = regionInfo.getEncodedNameAsBytes();
515 Boolean action = rsServices.getRegionsInTransitionInRS().get(encodedName);
516 return Boolean.TRUE.equals(action);
517 }
518
519
520
521
522
523
524
525
526
527 boolean transitionZookeeperOfflineToOpening(final String encodedName,
528 int versionOfOfflineNode) {
529
530 try {
531
532 this.version = ZKAssign.transitionNode(server.getZooKeeper(), regionInfo,
533 server.getServerName(), EventType.M_ZK_REGION_OFFLINE,
534 EventType.RS_ZK_REGION_OPENING, versionOfOfflineNode);
535 } catch (KeeperException e) {
536 LOG.error("Error transition from OFFLINE to OPENING for region=" +
537 encodedName, e);
538 this.version = -1;
539 return false;
540 }
541 boolean b = isGoodVersion();
542 if (!b) {
543 LOG.warn("Failed transition from OFFLINE to OPENING for region=" +
544 encodedName);
545 }
546 return b;
547 }
548
549
550
551
552
553
554
555 boolean tickleOpening(final String context) {
556 if (!isRegionStillOpening()) {
557 LOG.warn("Open region aborted since it isn't opening any more");
558 return false;
559 }
560
561 if (!isGoodVersion()) return false;
562 String encodedName = this.regionInfo.getEncodedName();
563 try {
564 this.version =
565 ZKAssign.retransitionNodeOpening(server.getZooKeeper(),
566 this.regionInfo, this.server.getServerName(), this.version, tomActivated);
567 } catch (KeeperException e) {
568 server.abort("Exception refreshing OPENING; region=" + encodedName +
569 ", context=" + context, e);
570 this.version = -1;
571 return false;
572 }
573 boolean b = isGoodVersion();
574 if (!b) {
575 LOG.warn("Failed refreshing OPENING; region=" + encodedName +
576 ", context=" + context);
577 }
578 return b;
579 }
580
581 private boolean isGoodVersion() {
582 return this.version != -1;
583 }
584 }