1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.regionserver.handler;
20
21 import java.io.IOException;
22 import java.util.Map;
23 import java.util.concurrent.atomic.AtomicBoolean;
24
25 import org.apache.commons.logging.Log;
26 import org.apache.commons.logging.LogFactory;
27 import org.apache.hadoop.classification.InterfaceAudience;
28 import org.apache.hadoop.hbase.HRegionInfo;
29 import org.apache.hadoop.hbase.HTableDescriptor;
30 import org.apache.hadoop.hbase.Server;
31 import org.apache.hadoop.hbase.executor.EventHandler;
32 import org.apache.hadoop.hbase.executor.EventType;
33 import org.apache.hadoop.hbase.regionserver.HRegion;
34 import org.apache.hadoop.hbase.regionserver.RegionServerAccounting;
35 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
36 import org.apache.hadoop.hbase.util.CancelableProgressable;
37 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
38 import org.apache.zookeeper.KeeperException;
39
40
41
42
43
44
45 @InterfaceAudience.Private
46 public class OpenRegionHandler extends EventHandler {
47 private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);
48
49 protected final RegionServerServices rsServices;
50
51 private final HRegionInfo regionInfo;
52 private final HTableDescriptor htd;
53
54 private boolean tomActivated;
55 private int assignmentTimeout;
56
57
58
59
60 private volatile int version = -1;
61
62 private volatile int versionOfOfflineNode = -1;
63
/**
 * Creates a handler for event type {@link EventType#M_RS_OPEN_REGION} that
 * passes {@code -1} as the version of the OFFLINE znode.
 *
 * @param server the server this handler runs on
 * @param rsServices region server services used while opening
 * @param regionInfo the region to open
 * @param htd descriptor of the table the region belongs to
 */
public OpenRegionHandler(final Server server,
    final RegionServerServices rsServices, HRegionInfo regionInfo,
    HTableDescriptor htd) {
  // Same effect as naming the event type here: the overload below supplies it.
  this(server, rsServices, regionInfo, htd, -1);
}
/**
 * Creates a handler for event type {@link EventType#M_RS_OPEN_REGION}.
 *
 * @param server the server this handler runs on
 * @param rsServices region server services used while opening
 * @param regionInfo the region to open
 * @param htd descriptor of the table the region belongs to
 * @param versionOfOfflineNode version passed to ZooKeeper when transitioning
 *          the region's znode out of the OFFLINE state
 */
public OpenRegionHandler(
    final Server server,
    final RegionServerServices rsServices,
    HRegionInfo regionInfo,
    HTableDescriptor htd,
    int versionOfOfflineNode) {
  this(
      server,
      rsServices,
      regionInfo,
      htd,
      EventType.M_RS_OPEN_REGION,
      versionOfOfflineNode);
}
75
/**
 * Full constructor; lets subclasses choose the event type.
 *
 * <p>Besides storing its arguments, reads two settings from the server
 * configuration: {@code hbase.assignment.timeout.management} (default
 * {@code false}) and {@code hbase.master.assignment.timeoutmonitor.period}
 * (default {@code 10000}).
 *
 * @param server the server this handler runs on
 * @param rsServices region server services used while opening
 * @param regionInfo the region to open
 * @param htd descriptor of the table the region belongs to
 * @param eventType event type handed to the {@link EventHandler} superclass
 * @param versionOfOfflineNode version passed to ZooKeeper when transitioning
 *          the region's znode out of the OFFLINE state
 */
protected OpenRegionHandler(
    final Server server,
    final RegionServerServices rsServices,
    final HRegionInfo regionInfo,
    final HTableDescriptor htd,
    EventType eventType,
    final int versionOfOfflineNode) {
  super(server, eventType);

  // Plain state handed in by the caller.
  this.versionOfOfflineNode = versionOfOfflineNode;
  this.htd = htd;
  this.regionInfo = regionInfo;
  this.rsServices = rsServices;

  // Settings looked up from the server configuration.
  this.tomActivated = this.server.getConfiguration().getBoolean(
      "hbase.assignment.timeout.management", false);
  this.assignmentTimeout = this.server.getConfiguration().getInt(
      "hbase.master.assignment.timeoutmonitor.period", 10000);
}
90
91 public HRegionInfo getRegionInfo() {
92 return regionInfo;
93 }
94
95 @Override
96 public void process() throws IOException {
97 boolean openSuccessful = false;
98 boolean transitionedToOpening = false;
99 final String regionName = regionInfo.getRegionNameAsString();
100 HRegion region = null;
101
102 try {
103 if (this.server.isStopped() || this.rsServices.isStopping()) {
104 return;
105 }
106 final String encodedName = regionInfo.getEncodedName();
107
108
109
110
111
112
113
114 if (this.rsServices.getFromOnlineRegions(encodedName) != null) {
115 LOG.error("Region " + encodedName +
116 " was already online when we started processing the opening. " +
117 "Marking this new attempt as failed");
118 return;
119 }
120
121
122
123
124 if (!isRegionStillOpening()){
125 LOG.error("Region " + encodedName + " opening cancelled");
126 return;
127 }
128
129 if (!transitionZookeeperOfflineToOpening(encodedName, versionOfOfflineNode)) {
130 LOG.warn("Region was hijacked? Opening cancelled for encodedName=" + encodedName);
131
132 return;
133 }
134 transitionedToOpening = true;
135
136
137 region = openRegion();
138 if (region == null) {
139 return;
140 }
141
142
143 region.setRecovering(false);
144 Map<String, HRegion> recoveringRegions = this.rsServices.getRecoveringRegions();
145 if (recoveringRegions != null && !recoveringRegions.isEmpty()
146 && recoveringRegions.containsKey(region.getRegionInfo().getEncodedName())) {
147 region.setRecovering(true);
148 recoveringRegions.put(region.getRegionInfo().getEncodedName(), region);
149 }
150
151 boolean failed = true;
152 if (tickleOpening("post_region_open")) {
153 if (updateMeta(region)) {
154 failed = false;
155 }
156 }
157 if (failed || this.server.isStopped() ||
158 this.rsServices.isStopping()) {
159 return;
160 }
161
162
163 if (!isRegionStillOpening() || !transitionToOpened(region)) {
164
165
166
167
168
169 return;
170 }
171
172
173
174
175
176
177
178
179
180
181
182 this.rsServices.addToOnlineRegions(region);
183 openSuccessful = true;
184
185
186 LOG.debug("Opened " + regionName + " on server:" +
187 this.server.getServerName());
188
189
190 } finally {
191
192 if (!openSuccessful) {
193 doCleanUpOnFailedOpen(region, transitionedToOpening);
194 }
195 final Boolean current = this.rsServices.getRegionsInTransitionInRS().
196 remove(this.regionInfo.getEncodedNameAsBytes());
197
198
199
200
201
202
203
204
205 if (openSuccessful) {
206 if (current == null) {
207 LOG.error("Bad state: we've just opened a region that was NOT in transition. Region="
208 + regionName);
209 } else if (Boolean.FALSE.equals(current)) {
210
211 LOG.error("Race condition: we've finished to open a region, while a close was requested "
212 + " on region=" + regionName + ". It can be a critical error, as a region that"
213 + " should be closed is now opened.");
214 }
215 }
216 }
217 }
218
219 private void doCleanUpOnFailedOpen(HRegion region, boolean transitionedToOpening)
220 throws IOException {
221 if (transitionedToOpening) {
222 try {
223 if (region != null) {
224 cleanupFailedOpen(region);
225 }
226 } finally {
227
228
229 tryTransitionFromOpeningToFailedOpen(regionInfo);
230 }
231 } else {
232
233
234 tryTransitionFromOfflineToFailedOpen(this.rsServices, regionInfo, versionOfOfflineNode);
235 }
236 }
237
238
239
240
241
242
243
244
245 boolean updateMeta(final HRegion r) {
246 if (this.server.isStopped() || this.rsServices.isStopping()) {
247 return false;
248 }
249
250
251 final AtomicBoolean signaller = new AtomicBoolean(false);
252 PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
253 this.server, this.rsServices, signaller);
254 t.start();
255
256
257 long timeout = assignmentTimeout * 10;
258 long now = System.currentTimeMillis();
259 long endTime = now + timeout;
260
261
262 long period = Math.max(1, assignmentTimeout/ 3);
263 long lastUpdate = now;
264 boolean tickleOpening = true;
265 while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
266 !this.rsServices.isStopping() && (endTime > now)) {
267 long elapsed = now - lastUpdate;
268 if (elapsed > period) {
269
270 lastUpdate = now;
271 tickleOpening = tickleOpening("post_open_deploy");
272 }
273 synchronized (signaller) {
274 try {
275 signaller.wait(period);
276 } catch (InterruptedException e) {
277
278 }
279 }
280 now = System.currentTimeMillis();
281 }
282
283
284 if (t.isAlive()) {
285 if (!signaller.get()) {
286
287 LOG.debug("Interrupting thread " + t);
288 t.interrupt();
289 }
290 try {
291 t.join();
292 } catch (InterruptedException ie) {
293 LOG.warn("Interrupted joining " +
294 r.getRegionInfo().getRegionNameAsString(), ie);
295 Thread.currentThread().interrupt();
296 }
297 }
298
299
300
301
302 return ((!Thread.interrupted() && t.getException() == null) && tickleOpening);
303 }
304
305
306
307
308
309
310
311
312 static class PostOpenDeployTasksThread extends Thread {
313 private Exception exception = null;
314 private final Server server;
315 private final RegionServerServices services;
316 private final HRegion region;
317 private final AtomicBoolean signaller;
318
319 PostOpenDeployTasksThread(final HRegion region, final Server server,
320 final RegionServerServices services, final AtomicBoolean signaller) {
321 super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
322 this.setDaemon(true);
323 this.server = server;
324 this.services = services;
325 this.region = region;
326 this.signaller = signaller;
327 }
328
329 public void run() {
330 try {
331 this.services.postOpenDeployTasks(this.region,
332 this.server.getCatalogTracker());
333 } catch (Exception e) {
334 LOG.warn("Exception running postOpenDeployTasks; region=" +
335 this.region.getRegionInfo().getEncodedName(), e);
336 this.exception = e;
337 }
338
339 this.signaller.set(true);
340 synchronized (this.signaller) {
341 this.signaller.notify();
342 }
343 }
344
345
346
347
348 Exception getException() {
349 return this.exception;
350 }
351 }
352
353
354
355
356
357
358
359 private boolean transitionToOpened(final HRegion r) throws IOException {
360 boolean result = false;
361 HRegionInfo hri = r.getRegionInfo();
362 final String name = hri.getRegionNameAsString();
363
364 try {
365 if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), hri,
366 this.server.getServerName(), this.version) == -1) {
367 LOG.warn("Completed the OPEN of region " + name +
368 " but when transitioning from " +
369 " OPENING to OPENED got a version mismatch, someone else clashed " +
370 "so now unassigning -- closing region on server: " +
371 this.server.getServerName());
372 } else {
373 LOG.debug("region transitioned to opened in zookeeper: " +
374 r.getRegionInfo() + ", server: " + this.server.getServerName());
375 result = true;
376 }
377 } catch (KeeperException e) {
378 LOG.error("Failed transitioning node " + name +
379 " from OPENING to OPENED -- closing region", e);
380 }
381 return result;
382 }
383
384
385
386
387
388
389 private boolean tryTransitionFromOpeningToFailedOpen(final HRegionInfo hri) {
390 boolean result = false;
391 final String name = hri.getRegionNameAsString();
392 try {
393 LOG.info("Opening of region " + hri + " failed, transitioning" +
394 " from OPENING to FAILED_OPEN in ZK, expecting version " + this.version);
395 if (ZKAssign.transitionNode(
396 this.server.getZooKeeper(), hri,
397 this.server.getServerName(),
398 EventType.RS_ZK_REGION_OPENING,
399 EventType.RS_ZK_REGION_FAILED_OPEN,
400 this.version) == -1) {
401 LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
402 "It's likely that the master already timed out this open " +
403 "attempt, and thus another RS already has the region.");
404 } else {
405 result = true;
406 }
407 } catch (KeeperException e) {
408 LOG.error("Failed transitioning node " + name +
409 " from OPENING to FAILED_OPEN", e);
410 }
411 return result;
412 }
413
414
415
416
417
418
419
420
421
422
423
424
425 public static boolean tryTransitionFromOfflineToFailedOpen(RegionServerServices rsServices,
426 final HRegionInfo hri, final int versionOfOfflineNode) {
427 boolean result = false;
428 final String name = hri.getRegionNameAsString();
429 try {
430 LOG.info("Opening of region " + hri + " failed, transitioning" +
431 " from OFFLINE to FAILED_OPEN in ZK, expecting version " + versionOfOfflineNode);
432 if (ZKAssign.transitionNode(
433 rsServices.getZooKeeper(), hri,
434 rsServices.getServerName(),
435 EventType.M_ZK_REGION_OFFLINE,
436 EventType.RS_ZK_REGION_FAILED_OPEN,
437 versionOfOfflineNode) == -1) {
438 LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
439 "It's likely that the master already timed out this open " +
440 "attempt, and thus another RS already has the region.");
441 } else {
442 result = true;
443 }
444 } catch (KeeperException e) {
445 LOG.error("Failed transitioning node " + name + " from OFFLINE to FAILED_OPEN", e);
446 }
447 return result;
448 }
449
450
451
452
453
454 HRegion openRegion() {
455 HRegion region = null;
456 try {
457
458
459 region = HRegion.openHRegion(this.regionInfo, this.htd,
460 this.rsServices.getWAL(this.regionInfo),
461 this.server.getConfiguration(),
462 this.rsServices,
463 new CancelableProgressable() {
464 public boolean progress() {
465
466
467
468 return tickleOpening("open_region_progress");
469 }
470 });
471 } catch (Throwable t) {
472
473
474
475 LOG.error(
476 "Failed open of region=" + this.regionInfo.getRegionNameAsString()
477 + ", starting to roll back the global memstore size.", t);
478
479 if (this.rsServices != null) {
480 RegionServerAccounting rsAccounting =
481 this.rsServices.getRegionServerAccounting();
482 if (rsAccounting != null) {
483 rsAccounting.rollbackRegionReplayEditsSize(this.regionInfo.getRegionName());
484 }
485 }
486 }
487 return region;
488 }
489
490 void cleanupFailedOpen(final HRegion region) throws IOException {
491 if (region != null) region.close();
492 }
493
494 private boolean isRegionStillOpening() {
495 byte[] encodedName = regionInfo.getEncodedNameAsBytes();
496 Boolean action = rsServices.getRegionsInTransitionInRS().get(encodedName);
497 return Boolean.TRUE.equals(action);
498 }
499
500
501
502
503
504
505
506
507
508 boolean transitionZookeeperOfflineToOpening(final String encodedName,
509 int versionOfOfflineNode) {
510
511 try {
512
513 this.version = ZKAssign.transitionNode(server.getZooKeeper(), regionInfo,
514 server.getServerName(), EventType.M_ZK_REGION_OFFLINE,
515 EventType.RS_ZK_REGION_OPENING, versionOfOfflineNode);
516 } catch (KeeperException e) {
517 LOG.error("Error transition from OFFLINE to OPENING for region=" +
518 encodedName, e);
519 this.version = -1;
520 return false;
521 }
522 boolean b = isGoodVersion();
523 if (!b) {
524 LOG.warn("Failed transition from OFFLINE to OPENING for region=" +
525 encodedName);
526 }
527 return b;
528 }
529
530
531
532
533
534
535
536 boolean tickleOpening(final String context) {
537 if (!isRegionStillOpening()) {
538 LOG.warn("Open region aborted since it isn't opening any more");
539 return false;
540 }
541
542 if (!isGoodVersion()) return false;
543 String encodedName = this.regionInfo.getEncodedName();
544 try {
545 this.version =
546 ZKAssign.retransitionNodeOpening(server.getZooKeeper(),
547 this.regionInfo, this.server.getServerName(), this.version, tomActivated);
548 } catch (KeeperException e) {
549 server.abort("Exception refreshing OPENING; region=" + encodedName +
550 ", context=" + context, e);
551 this.version = -1;
552 return false;
553 }
554 boolean b = isGoodVersion();
555 if (!b) {
556 LOG.warn("Failed refreshing OPENING; region=" + encodedName +
557 ", context=" + context);
558 }
559 return b;
560 }
561
562 private boolean isGoodVersion() {
563 return this.version != -1;
564 }
565 }