1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.regionserver.handler;
20
21 import java.io.IOException;
22 import java.util.concurrent.atomic.AtomicBoolean;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.hadoop.classification.InterfaceAudience;
27 import org.apache.hadoop.hbase.HRegionInfo;
28 import org.apache.hadoop.hbase.HTableDescriptor;
29 import org.apache.hadoop.hbase.Server;
30 import org.apache.hadoop.hbase.executor.EventHandler;
31 import org.apache.hadoop.hbase.executor.EventType;
32 import org.apache.hadoop.hbase.regionserver.HRegion;
33 import org.apache.hadoop.hbase.regionserver.RegionServerAccounting;
34 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
35 import org.apache.hadoop.hbase.util.CancelableProgressable;
36 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
37 import org.apache.zookeeper.KeeperException;
38
39
40
41
42
43
44 @InterfaceAudience.Private
45 public class OpenRegionHandler extends EventHandler {
46 private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);
47
48 protected final RegionServerServices rsServices;
49
50 private final HRegionInfo regionInfo;
51 private final HTableDescriptor htd;
52
53
54
55
56 private volatile int version = -1;
57
58 private volatile int versionOfOfflineNode = -1;
59
60 public OpenRegionHandler(final Server server,
61 final RegionServerServices rsServices, HRegionInfo regionInfo,
62 HTableDescriptor htd) {
63 this(server, rsServices, regionInfo, htd, EventType.M_RS_OPEN_REGION, -1);
64 }
65 public OpenRegionHandler(final Server server,
66 final RegionServerServices rsServices, HRegionInfo regionInfo,
67 HTableDescriptor htd, int versionOfOfflineNode) {
68 this(server, rsServices, regionInfo, htd, EventType.M_RS_OPEN_REGION,
69 versionOfOfflineNode);
70 }
71
72 protected OpenRegionHandler(final Server server,
73 final RegionServerServices rsServices, final HRegionInfo regionInfo,
74 final HTableDescriptor htd, EventType eventType,
75 final int versionOfOfflineNode) {
76 super(server, eventType);
77 this.rsServices = rsServices;
78 this.regionInfo = regionInfo;
79 this.htd = htd;
80 this.versionOfOfflineNode = versionOfOfflineNode;
81 }
82
83 public HRegionInfo getRegionInfo() {
84 return regionInfo;
85 }
86
87 @Override
88 public void process() throws IOException {
89 boolean openSuccessful = false;
90 boolean transitionedToOpening = false;
91 final String regionName = regionInfo.getRegionNameAsString();
92 HRegion region = null;
93
94 try {
95 if (this.server.isStopped() || this.rsServices.isStopping()) {
96 return;
97 }
98 final String encodedName = regionInfo.getEncodedName();
99
100
101
102
103
104
105
106 if (this.rsServices.getFromOnlineRegions(encodedName) != null) {
107 LOG.error("Region " + encodedName +
108 " was already online when we started processing the opening. " +
109 "Marking this new attempt as failed");
110 return;
111 }
112
113
114
115
116 if (!isRegionStillOpening()){
117 LOG.error("Region " + encodedName + " opening cancelled");
118 return;
119 }
120
121 if (!transitionZookeeperOfflineToOpening(encodedName, versionOfOfflineNode)) {
122 LOG.warn("Region was hijacked? Opening cancelled for encodedName=" + encodedName);
123
124 return;
125 }
126 transitionedToOpening = true;
127
128
129 region = openRegion();
130 if (region == null) {
131 return;
132 }
133 boolean failed = true;
134 if (tickleOpening("post_region_open")) {
135 if (updateMeta(region)) {
136 failed = false;
137 }
138 }
139 if (failed || this.server.isStopped() ||
140 this.rsServices.isStopping()) {
141 return;
142 }
143
144
145 if (!isRegionStillOpening() || !transitionToOpened(region)) {
146
147
148
149
150
151 return;
152 }
153
154
155
156
157
158
159
160
161
162
163
164 this.rsServices.addToOnlineRegions(region);
165 openSuccessful = true;
166
167
168 LOG.debug("Opened " + regionName + " on server:" +
169 this.server.getServerName());
170
171
172 } finally {
173
174 if (!openSuccessful) {
175 doCleanUpOnFailedOpen(region, transitionedToOpening);
176 }
177 final Boolean current = this.rsServices.getRegionsInTransitionInRS().
178 remove(this.regionInfo.getEncodedNameAsBytes());
179
180
181
182
183
184
185
186
187 if (openSuccessful) {
188 if (current == null) {
189 LOG.error("Bad state: we've just opened a region that was NOT in transition. Region="
190 + regionName);
191 } else if (Boolean.FALSE.equals(current)) {
192
193 LOG.error("Race condition: we've finished to open a region, while a close was requested "
194 + " on region=" + regionName + ". It can be a critical error, as a region that"
195 + " should be closed is now opened.");
196 }
197 }
198 }
199 }
200
201 private void doCleanUpOnFailedOpen(HRegion region, boolean transitionedToOpening)
202 throws IOException {
203 if (transitionedToOpening) {
204 try {
205 if (region != null) {
206 cleanupFailedOpen(region);
207 }
208 } finally {
209
210
211 tryTransitionFromOpeningToFailedOpen(regionInfo);
212 }
213 } else {
214
215
216 tryTransitionFromOfflineToFailedOpen(this.rsServices, regionInfo, versionOfOfflineNode);
217 }
218 }
219
220
221
222
223
224
225
226
227 boolean updateMeta(final HRegion r) {
228 if (this.server.isStopped() || this.rsServices.isStopping()) {
229 return false;
230 }
231
232
233 final AtomicBoolean signaller = new AtomicBoolean(false);
234 PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
235 this.server, this.rsServices, signaller);
236 t.start();
237 boolean tomActivated = this.server.getConfiguration().
238 getBoolean("hbase.assignment.timeout.management", false);
239 int assignmentTimeout = this.server.getConfiguration().
240 getInt("hbase.master.assignment.timeoutmonitor.period", 10000);
241
242
243 long timeout = assignmentTimeout * 10;
244 long now = System.currentTimeMillis();
245 long endTime = now + timeout;
246
247
248 long period = Math.max(1, assignmentTimeout/ 3);
249 long lastUpdate = now;
250 boolean tickleOpening = true;
251 while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
252 !this.rsServices.isStopping() && (endTime > now)) {
253 if (tomActivated) {
254 long elapsed = now - lastUpdate;
255 if (elapsed > period) {
256
257 lastUpdate = now;
258 tickleOpening = tickleOpening("post_open_deploy");
259 }
260 }
261 synchronized (signaller) {
262 try {
263 signaller.wait(period);
264 } catch (InterruptedException e) {
265
266 }
267 }
268 now = System.currentTimeMillis();
269 }
270
271
272 if (t.isAlive()) {
273 if (!signaller.get()) {
274
275 LOG.debug("Interrupting thread " + t);
276 t.interrupt();
277 }
278 try {
279 t.join();
280 } catch (InterruptedException ie) {
281 LOG.warn("Interrupted joining " +
282 r.getRegionInfo().getRegionNameAsString(), ie);
283 Thread.currentThread().interrupt();
284 }
285 }
286
287
288
289
290 return ((!Thread.interrupted() && t.getException() == null) && tickleOpening);
291 }
292
293
294
295
296
297
298
299
300 static class PostOpenDeployTasksThread extends Thread {
301 private Exception exception = null;
302 private final Server server;
303 private final RegionServerServices services;
304 private final HRegion region;
305 private final AtomicBoolean signaller;
306
307 PostOpenDeployTasksThread(final HRegion region, final Server server,
308 final RegionServerServices services, final AtomicBoolean signaller) {
309 super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
310 this.setDaemon(true);
311 this.server = server;
312 this.services = services;
313 this.region = region;
314 this.signaller = signaller;
315 }
316
317 public void run() {
318 try {
319 this.services.postOpenDeployTasks(this.region,
320 this.server.getCatalogTracker());
321 } catch (Exception e) {
322 LOG.warn("Exception running postOpenDeployTasks; region=" +
323 this.region.getRegionInfo().getEncodedName(), e);
324 this.exception = e;
325 }
326
327 this.signaller.set(true);
328 synchronized (this.signaller) {
329 this.signaller.notify();
330 }
331 }
332
333
334
335
336 Exception getException() {
337 return this.exception;
338 }
339 }
340
341
342
343
344
345
346
347 private boolean transitionToOpened(final HRegion r) throws IOException {
348 boolean result = false;
349 HRegionInfo hri = r.getRegionInfo();
350 final String name = hri.getRegionNameAsString();
351
352 try {
353 if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), hri,
354 this.server.getServerName(), this.version) == -1) {
355 LOG.warn("Completed the OPEN of region " + name +
356 " but when transitioning from " +
357 " OPENING to OPENED got a version mismatch, someone else clashed " +
358 "so now unassigning -- closing region on server: " +
359 this.server.getServerName());
360 } else {
361 LOG.debug("region transitioned to opened in zookeeper: " +
362 r.getRegionInfo() + ", server: " + this.server.getServerName());
363 result = true;
364 }
365 } catch (KeeperException e) {
366 LOG.error("Failed transitioning node " + name +
367 " from OPENING to OPENED -- closing region", e);
368 }
369 return result;
370 }
371
372
373
374
375
376
377 private boolean tryTransitionFromOpeningToFailedOpen(final HRegionInfo hri) {
378 boolean result = false;
379 final String name = hri.getRegionNameAsString();
380 try {
381 LOG.info("Opening of region " + hri + " failed, transitioning" +
382 " from OPENING to FAILED_OPEN in ZK, expecting version " + this.version);
383 if (ZKAssign.transitionNode(
384 this.server.getZooKeeper(), hri,
385 this.server.getServerName(),
386 EventType.RS_ZK_REGION_OPENING,
387 EventType.RS_ZK_REGION_FAILED_OPEN,
388 this.version) == -1) {
389 LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
390 "It's likely that the master already timed out this open " +
391 "attempt, and thus another RS already has the region.");
392 } else {
393 result = true;
394 }
395 } catch (KeeperException e) {
396 LOG.error("Failed transitioning node " + name +
397 " from OPENING to FAILED_OPEN", e);
398 }
399 return result;
400 }
401
402
403
404
405
406
407
408
409
410
411
412
413 public static boolean tryTransitionFromOfflineToFailedOpen(RegionServerServices rsServices,
414 final HRegionInfo hri, final int versionOfOfflineNode) {
415 boolean result = false;
416 final String name = hri.getRegionNameAsString();
417 try {
418 LOG.info("Opening of region " + hri + " failed, transitioning" +
419 " from OFFLINE to FAILED_OPEN in ZK, expecting version " + versionOfOfflineNode);
420 if (ZKAssign.transitionNode(
421 rsServices.getZooKeeper(), hri,
422 rsServices.getServerName(),
423 EventType.M_ZK_REGION_OFFLINE,
424 EventType.RS_ZK_REGION_FAILED_OPEN,
425 versionOfOfflineNode) == -1) {
426 LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
427 "It's likely that the master already timed out this open " +
428 "attempt, and thus another RS already has the region.");
429 } else {
430 result = true;
431 }
432 } catch (KeeperException e) {
433 LOG.error("Failed transitioning node " + name + " from OFFLINE to FAILED_OPEN", e);
434 }
435 return result;
436 }
437
438
439
440
441
442 HRegion openRegion() {
443 HRegion region = null;
444 try {
445
446
447 region = HRegion.openHRegion(this.regionInfo, this.htd,
448 this.rsServices.getWAL(this.regionInfo),
449 this.server.getConfiguration(),
450 this.rsServices,
451 new CancelableProgressable() {
452 public boolean progress() {
453
454
455
456 return tickleOpening("open_region_progress");
457 }
458 });
459 } catch (Throwable t) {
460
461
462
463 LOG.error(
464 "Failed open of region=" + this.regionInfo.getRegionNameAsString()
465 + ", starting to roll back the global memstore size.", t);
466
467 if (this.rsServices != null) {
468 RegionServerAccounting rsAccounting =
469 this.rsServices.getRegionServerAccounting();
470 if (rsAccounting != null) {
471 rsAccounting.rollbackRegionReplayEditsSize(this.regionInfo.getRegionName());
472 }
473 }
474 }
475 return region;
476 }
477
478 void cleanupFailedOpen(final HRegion region) throws IOException {
479 if (region != null) region.close();
480 }
481
482 private boolean isRegionStillOpening() {
483 byte[] encodedName = regionInfo.getEncodedNameAsBytes();
484 Boolean action = rsServices.getRegionsInTransitionInRS().get(encodedName);
485 return Boolean.TRUE.equals(action);
486 }
487
488
489
490
491
492
493
494
495
496 boolean transitionZookeeperOfflineToOpening(final String encodedName,
497 int versionOfOfflineNode) {
498
499 try {
500
501 this.version = ZKAssign.transitionNode(server.getZooKeeper(), regionInfo,
502 server.getServerName(), EventType.M_ZK_REGION_OFFLINE,
503 EventType.RS_ZK_REGION_OPENING, versionOfOfflineNode);
504 } catch (KeeperException e) {
505 LOG.error("Error transition from OFFLINE to OPENING for region=" +
506 encodedName, e);
507 this.version = -1;
508 return false;
509 }
510 boolean b = isGoodVersion();
511 if (!b) {
512 LOG.warn("Failed transition from OFFLINE to OPENING for region=" +
513 encodedName);
514 }
515 return b;
516 }
517
518
519
520
521
522
523
524 boolean tickleOpening(final String context) {
525 if (!isRegionStillOpening()) {
526 LOG.warn("Open region aborted since it isn't opening any more");
527 return false;
528 }
529
530 if (!isGoodVersion()) return false;
531 String encodedName = this.regionInfo.getEncodedName();
532 try {
533 this.version =
534 ZKAssign.retransitionNodeOpening(server.getZooKeeper(),
535 this.regionInfo, this.server.getServerName(), this.version);
536 } catch (KeeperException e) {
537 server.abort("Exception refreshing OPENING; region=" + encodedName +
538 ", context=" + context, e);
539 this.version = -1;
540 return false;
541 }
542 boolean b = isGoodVersion();
543 if (!b) {
544 LOG.warn("Failed refreshing OPENING; region=" + encodedName +
545 ", context=" + context);
546 }
547 return b;
548 }
549
550 private boolean isGoodVersion() {
551 return this.version != -1;
552 }
553 }