1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.regionserver;
20
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.ListIterator;
25 import java.util.Map;
26 import java.util.concurrent.Callable;
27 import java.util.concurrent.ExecutionException;
28 import java.util.concurrent.Executors;
29 import java.util.concurrent.Future;
30 import java.util.concurrent.ThreadFactory;
31 import java.util.concurrent.ThreadPoolExecutor;
32 import java.util.concurrent.TimeUnit;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.apache.hadoop.classification.InterfaceAudience;
37 import org.apache.hadoop.hbase.HRegionInfo;
38 import org.apache.hadoop.hbase.RegionTransition;
39 import org.apache.hadoop.hbase.Server;
40 import org.apache.hadoop.hbase.ServerName;
41 import org.apache.hadoop.hbase.catalog.MetaEditor;
42 import org.apache.hadoop.hbase.executor.EventType;
43 import org.apache.hadoop.hbase.util.Bytes;
44 import org.apache.hadoop.hbase.util.CancelableProgressable;
45 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
46 import org.apache.hadoop.hbase.util.HasThread;
47 import org.apache.hadoop.hbase.util.PairOfSameType;
48 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
49 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
50 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
51 import org.apache.zookeeper.KeeperException;
52 import org.apache.zookeeper.KeeperException.NodeExistsException;
53
54 import com.google.common.util.concurrent.ThreadFactoryBuilder;
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79 @InterfaceAudience.Private
80 public class SplitTransaction {
81 private static final Log LOG = LogFactory.getLog(SplitTransaction.class);
82
83
84
85
86 private final HRegion parent;
87 private HRegionInfo hri_a;
88 private HRegionInfo hri_b;
89 private long fileSplitTimeout = 30000;
90 private int znodeVersion = -1;
91
92
93
94
95 private final byte [] splitrow;
96
97
98
99
100
101
/**
 * Steps recorded in the journal as the split progresses. rollback() walks the
 * journal backwards and undoes each recorded step. Declaration order is kept
 * as-is.
 */
enum JournalEntry {
  /** The SPLITTING znode was set up; rollback deletes it. */
  SET_SPLITTING_IN_ZK,

  /** The splits directory was created; rollback cleans it up. */
  CREATE_SPLIT_DIR,

  /** The parent region was closed; rollback re-initializes it. */
  CLOSED_PARENT_REGION,

  /** The parent was taken off the online regions; rollback puts it back. */
  OFFLINED_PARENT,

  /** Creation of the first daughter started; rollback cleans up its region directory. */
  STARTED_REGION_A_CREATION,

  /** Creation of the second daughter started; rollback cleans up its region directory. */
  STARTED_REGION_B_CREATION,

  /** Point of no return: once journaled, rollback() returns false instead of undoing. */
  PONR
}
134
135
136
137
138 private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
139
140
141
142
143
144
/**
 * Creates a split transaction. Nothing is validated here; call
 * {@link #prepare()} before executing.
 *
 * @param r region to split
 * @param splitrow row at which to split the region
 */
public SplitTransaction(final HRegion r, final byte [] splitrow) {
  this.splitrow = splitrow;
  this.parent = r;
}
149
150
151
152
153
154
155 public boolean prepare() {
156 if (!this.parent.isSplittable()) return false;
157
158 if (this.splitrow == null) return false;
159 HRegionInfo hri = this.parent.getRegionInfo();
160 parent.prepareToSplit();
161
162 byte [] startKey = hri.getStartKey();
163 byte [] endKey = hri.getEndKey();
164 if (Bytes.equals(startKey, splitrow) ||
165 !this.parent.getRegionInfo().containsRow(splitrow)) {
166 LOG.info("Split row is not inside region key range or is equal to " +
167 "startkey: " + Bytes.toStringBinary(this.splitrow));
168 return false;
169 }
170 long rid = getDaughterRegionIdTimestamp(hri);
171 this.hri_a = new HRegionInfo(hri.getTableName(), startKey, this.splitrow, false, rid);
172 this.hri_b = new HRegionInfo(hri.getTableName(), this.splitrow, endKey, false, rid);
173 return true;
174 }
175
176
177
178
179
180
181 private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
182 long rid = EnvironmentEdgeManager.currentTimeMillis();
183
184
185 if (rid < hri.getRegionId()) {
186 LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
187 " but current time here is " + rid);
188 rid = hri.getRegionId() + 1;
189 }
190 return rid;
191 }
192
193 private static IOException closedByOtherException = new IOException(
194 "Failed to close region: already closed by another thread");
195
196
197
198
199
200
201
202
203
204
205
206 final RegionServerServices services) throws IOException {
207 LOG.info("Starting split of region " + this.parent);
208 if ((server != null && server.isStopped()) ||
209 (services != null && services.isStopping())) {
210 throw new IOException("Server is stopped or stopping");
211 }
212 assert !this.parent.lock.writeLock().isHeldByCurrentThread():
213 "Unsafe to hold write lock while performing RPCs";
214
215
216 if (this.parent.getCoprocessorHost() != null) {
217 this.parent.getCoprocessorHost().preSplit();
218 }
219
220
221 if (this.parent.getCoprocessorHost() != null) {
222 this.parent.getCoprocessorHost().preSplit(this.splitrow);
223 }
224
225
226 boolean testing = server == null? true:
227 server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
228 this.fileSplitTimeout = testing ? this.fileSplitTimeout :
229 server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
230 this.fileSplitTimeout);
231
232
233
234 if (server != null && server.getZooKeeper() != null) {
235 try {
236 createNodeSplitting(server.getZooKeeper(),
237 this.parent.getRegionInfo(), server.getServerName());
238 } catch (KeeperException e) {
239 throw new IOException("Failed creating SPLITTING znode on " +
240 this.parent.getRegionNameAsString(), e);
241 }
242 }
243 this.journal.add(JournalEntry.SET_SPLITTING_IN_ZK);
244 if (server != null && server.getZooKeeper() != null) {
245 try {
246
247
248
249
250
251
252 this.znodeVersion = transitionNodeSplitting(server.getZooKeeper(),
253 this.parent.getRegionInfo(), server.getServerName(), -1);
254 } catch (KeeperException e) {
255 throw new IOException("Failed setting SPLITTING znode on "
256 + this.parent.getRegionNameAsString(), e);
257 }
258 }
259
260 this.parent.getRegionFileSystem().createSplitsDir();
261 this.journal.add(JournalEntry.CREATE_SPLIT_DIR);
262
263 Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
264 Exception exceptionToThrow = null;
265 try{
266 hstoreFilesToSplit = this.parent.close(false);
267 } catch (Exception e) {
268 exceptionToThrow = e;
269 }
270 if (exceptionToThrow == null && hstoreFilesToSplit == null) {
271
272
273
274
275
276 exceptionToThrow = closedByOtherException;
277 }
278 if (exceptionToThrow != closedByOtherException) {
279 this.journal.add(JournalEntry.CLOSED_PARENT_REGION);
280 }
281 if (exceptionToThrow != null) {
282 if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
283 throw new IOException(exceptionToThrow);
284 }
285 if (!testing) {
286 services.removeFromOnlineRegions(this.parent, null);
287 }
288 this.journal.add(JournalEntry.OFFLINED_PARENT);
289
290
291
292
293
294
295
296 splitStoreFiles(hstoreFilesToSplit);
297
298
299
300
301
302 this.journal.add(JournalEntry.STARTED_REGION_A_CREATION);
303 HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
304
305
306 this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
307 HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324 this.journal.add(JournalEntry.PONR);
325
326
327
328
329
330
331 if (!testing) {
332 MetaEditor.splitRegion(server.getCatalogTracker(), parent.getRegionInfo(),
333 a.getRegionInfo(), b.getRegionInfo(), server.getServerName());
334 }
335 return new PairOfSameType<HRegion>(a, b);
336 }
337
338
339
340
341
342
343
344
345
346
347
348
349 final RegionServerServices services, HRegion a, HRegion b)
350 throws IOException {
351 boolean stopped = server != null && server.isStopped();
352 boolean stopping = services != null && services.isStopping();
353
354 if (stopped || stopping) {
355 LOG.info("Not opening daughters " +
356 b.getRegionInfo().getRegionNameAsString() +
357 " and " +
358 a.getRegionInfo().getRegionNameAsString() +
359 " because stopping=" + stopping + ", stopped=" + stopped);
360 } else {
361
362 DaughterOpener aOpener = new DaughterOpener(server, a);
363 DaughterOpener bOpener = new DaughterOpener(server, b);
364 aOpener.start();
365 bOpener.start();
366 try {
367 aOpener.join();
368 bOpener.join();
369 } catch (InterruptedException e) {
370 Thread.currentThread().interrupt();
371 throw new IOException("Interrupted " + e.getMessage());
372 }
373 if (aOpener.getException() != null) {
374 throw new IOException("Failed " +
375 aOpener.getName(), aOpener.getException());
376 }
377 if (bOpener.getException() != null) {
378 throw new IOException("Failed " +
379 bOpener.getName(), bOpener.getException());
380 }
381 if (services != null) {
382 try {
383
384 services.postOpenDeployTasks(b, server.getCatalogTracker());
385
386 services.addToOnlineRegions(b);
387 services.postOpenDeployTasks(a, server.getCatalogTracker());
388 services.addToOnlineRegions(a);
389 } catch (KeeperException ke) {
390 throw new IOException(ke);
391 }
392 }
393 }
394 }
395
396
397
398
399
400
401
402
403
404
405
406
407 final RegionServerServices services, HRegion a, HRegion b)
408 throws IOException {
409
410 if (server != null && server.getZooKeeper() != null) {
411 try {
412 this.znodeVersion = transitionNodeSplit(server.getZooKeeper(),
413 parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
414 server.getServerName(), this.znodeVersion);
415
416 int spins = 0;
417
418
419
420 do {
421 if (spins % 10 == 0) {
422 LOG.debug("Still waiting on the master to process the split for " +
423 this.parent.getRegionInfo().getEncodedName());
424 }
425 Thread.sleep(100);
426
427 this.znodeVersion = tickleNodeSplit(server.getZooKeeper(),
428 parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
429 server.getServerName(), this.znodeVersion);
430 spins++;
431 } while (this.znodeVersion != -1 && !server.isStopped()
432 && !services.isStopping());
433 } catch (Exception e) {
434 if (e instanceof InterruptedException) {
435 Thread.currentThread().interrupt();
436 }
437 throw new IOException("Failed telling master about split", e);
438 }
439 }
440
441
442 if (this.parent.getCoprocessorHost() != null) {
443 this.parent.getCoprocessorHost().postSplit(a,b);
444 }
445
446
447
448
449 }
450
451
452
453
454
455
456
457
458
459
460
461
462 public PairOfSameType<HRegion> execute(final Server server,
463 final RegionServerServices services)
464 throws IOException {
465 PairOfSameType<HRegion> regions = createDaughters(server, services);
466 openDaughters(server, services, regions.getFirst(), regions.getSecond());
467 transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
468 return regions;
469 }
470
471
472
473
474
475 class DaughterOpener extends HasThread {
476 private final Server server;
477 private final HRegion r;
478 private Throwable t = null;
479
480 DaughterOpener(final Server s, final HRegion r) {
481 super((s == null? "null-services": s.getServerName()) +
482 "-daughterOpener=" + r.getRegionInfo().getEncodedName());
483 setDaemon(true);
484 this.server = s;
485 this.r = r;
486 }
487
488
489
490
491
492 Throwable getException() {
493 return this.t;
494 }
495
496 @Override
497 public void run() {
498 try {
499 openDaughterRegion(this.server, r);
500 } catch (Throwable t) {
501 this.t = t;
502 }
503 }
504 }
505
506
507
508
509
510
511
512
513 void openDaughterRegion(final Server server, final HRegion daughter)
514 throws IOException, KeeperException {
515 HRegionInfo hri = daughter.getRegionInfo();
516 LoggingProgressable reporter = server == null ? null
517 : new LoggingProgressable(hri, server.getConfiguration().getLong(
518 "hbase.regionserver.split.daughter.open.log.interval", 10000));
519 daughter.openHRegion(reporter);
520 }
521
522 static class LoggingProgressable implements CancelableProgressable {
523 private final HRegionInfo hri;
524 private long lastLog = -1;
525 private final long interval;
526
527 LoggingProgressable(final HRegionInfo hri, final long interval) {
528 this.hri = hri;
529 this.interval = interval;
530 }
531
532 @Override
533 public boolean progress() {
534 long now = System.currentTimeMillis();
535 if (now - lastLog > this.interval) {
536 LOG.info("Opening " + this.hri.getRegionNameAsString());
537 this.lastLog = now;
538 }
539 return true;
540 }
541 }
542
543 private void splitStoreFiles(final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
544 throws IOException {
545 if (hstoreFilesToSplit == null) {
546
547 throw new IOException("Close returned empty list of StoreFiles");
548 }
549
550
551
552 int nbFiles = hstoreFilesToSplit.size();
553 if (nbFiles == 0) {
554
555 return;
556 }
557 ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
558 builder.setNameFormat("StoreFileSplitter-%1$d");
559 ThreadFactory factory = builder.build();
560 ThreadPoolExecutor threadPool =
561 (ThreadPoolExecutor) Executors.newFixedThreadPool(nbFiles, factory);
562 List<Future<Void>> futures = new ArrayList<Future<Void>>(nbFiles);
563
564
565 for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
566 for (StoreFile sf: entry.getValue()) {
567 StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
568 futures.add(threadPool.submit(sfs));
569 }
570 }
571
572 threadPool.shutdown();
573
574
575 try {
576 boolean stillRunning = !threadPool.awaitTermination(
577 this.fileSplitTimeout, TimeUnit.MILLISECONDS);
578 if (stillRunning) {
579 threadPool.shutdownNow();
580
581 while (!threadPool.isTerminated()) {
582 Thread.sleep(50);
583 }
584 throw new IOException("Took too long to split the" +
585 " files and create the references, aborting split");
586 }
587 } catch (InterruptedException e) {
588 Thread.currentThread().interrupt();
589 throw new IOException("Interrupted while waiting for file splitters", e);
590 }
591
592
593 for (Future<Void> future: futures) {
594 try {
595 future.get();
596 } catch (InterruptedException e) {
597 Thread.currentThread().interrupt();
598 throw new IOException(
599 "Interrupted while trying to get the results of file splitters", e);
600 } catch (ExecutionException e) {
601 throw new IOException(e);
602 }
603 }
604 }
605
606 private void splitStoreFile(final byte[] family, final StoreFile sf) throws IOException {
607 HRegionFileSystem fs = this.parent.getRegionFileSystem();
608 String familyName = Bytes.toString(family);
609 fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow, false);
610 fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow, true);
611 }
612
613
614
615
616
617 class StoreFileSplitter implements Callable<Void> {
618 private final byte[] family;
619 private final StoreFile sf;
620
621
622
623
624
625
626 public StoreFileSplitter(final byte[] family, final StoreFile sf) {
627 this.sf = sf;
628 this.family = family;
629 }
630
631 public Void call() throws IOException {
632 splitStoreFile(family, sf);
633 return null;
634 }
635 }
636
637
638
639
640
641
642
643
644 public boolean rollback(final Server server, final RegionServerServices services)
645 throws IOException {
646
647 if (this.parent.getCoprocessorHost() != null) {
648 this.parent.getCoprocessorHost().preRollBackSplit();
649 }
650
651 boolean result = true;
652 ListIterator<JournalEntry> iterator =
653 this.journal.listIterator(this.journal.size());
654
655 while (iterator.hasPrevious()) {
656 JournalEntry je = iterator.previous();
657 switch(je) {
658
659 case SET_SPLITTING_IN_ZK:
660 if (server != null && server.getZooKeeper() != null) {
661 cleanZK(server, this.parent.getRegionInfo());
662 }
663 break;
664
665 case CREATE_SPLIT_DIR:
666 this.parent.writestate.writesEnabled = true;
667 this.parent.getRegionFileSystem().cleanupSplitsDir();
668 break;
669
670 case CLOSED_PARENT_REGION:
671 try {
672
673
674
675
676
677 this.parent.initialize();
678 } catch (IOException e) {
679 LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
680 this.parent.getRegionNameAsString(), e);
681 throw new RuntimeException(e);
682 }
683 break;
684
685 case STARTED_REGION_A_CREATION:
686 this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
687 break;
688
689 case STARTED_REGION_B_CREATION:
690 this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
691 break;
692
693 case OFFLINED_PARENT:
694 if (services != null) services.addToOnlineRegions(this.parent);
695 break;
696
697 case PONR:
698
699
700
701
702 return false;
703
704 default:
705 throw new RuntimeException("Unhandled journal entry: " + je);
706 }
707 }
708
709 if (this.parent.getCoprocessorHost() != null) {
710 this.parent.getCoprocessorHost().postRollBackSplit();
711 }
712 return result;
713 }
714
715 HRegionInfo getFirstDaughter() {
716 return hri_a;
717 }
718
719 HRegionInfo getSecondDaughter() {
720 return hri_b;
721 }
722
723 private static void cleanZK(final Server server, final HRegionInfo hri) {
724 try {
725
726 ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
727 EventType.RS_ZK_REGION_SPLITTING);
728 } catch (KeeperException e) {
729 server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
730 }
731 }
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747 int createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
748 final ServerName serverName) throws KeeperException, IOException {
749 LOG.debug(zkw.prefix("Creating ephemeral node for " +
750 region.getEncodedName() + " in SPLITTING state"));
751 RegionTransition rt = RegionTransition.createRegionTransition(EventType.RS_ZK_REGION_SPLITTING,
752 region.getRegionName(), serverName);
753 String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
754 if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
755 throw new IOException("Failed create of ephemeral " + node);
756 }
757
758
759 return transitionNodeSplitting(zkw, region, serverName, -1);
760 }
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795 private static int transitionNodeSplit(ZooKeeperWatcher zkw,
796 HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
797 final int znodeVersion)
798 throws KeeperException, IOException {
799 byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
800 return ZKAssign.transitionNode(zkw, parent, serverName,
801 EventType.RS_ZK_REGION_SPLITTING, EventType.RS_ZK_REGION_SPLIT,
802 znodeVersion, payload);
803 }
804
805
806
807
808
809
810
811
812
813
814
815 int transitionNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo parent,
816 final ServerName serverName, final int version) throws KeeperException, IOException {
817 return ZKAssign.transitionNode(zkw, parent, serverName,
818 EventType.RS_ZK_REGION_SPLITTING, EventType.RS_ZK_REGION_SPLITTING, version);
819 }
820
821 private static int tickleNodeSplit(ZooKeeperWatcher zkw,
822 HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
823 final int znodeVersion)
824 throws KeeperException, IOException {
825 byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
826 return ZKAssign.transitionNode(zkw, parent, serverName,
827 EventType.RS_ZK_REGION_SPLIT, EventType.RS_ZK_REGION_SPLIT,
828 znodeVersion, payload);
829 }
830 }