1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.regionserver;
20
21 import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
22 import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
23 import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
24
25 import java.io.IOException;
26 import java.util.ArrayList;
27 import java.util.List;
28 import java.util.ListIterator;
29 import java.util.Map;
30 import java.util.concurrent.Callable;
31 import java.util.concurrent.ExecutionException;
32 import java.util.concurrent.Executors;
33 import java.util.concurrent.Future;
34 import java.util.concurrent.ThreadFactory;
35 import java.util.concurrent.ThreadPoolExecutor;
36 import java.util.concurrent.TimeUnit;
37
38 import org.apache.commons.logging.Log;
39 import org.apache.commons.logging.LogFactory;
40 import org.apache.hadoop.classification.InterfaceAudience;
41 import org.apache.hadoop.hbase.HRegionInfo;
42 import org.apache.hadoop.hbase.RegionTransition;
43 import org.apache.hadoop.hbase.Server;
44 import org.apache.hadoop.hbase.ServerName;
45 import org.apache.hadoop.hbase.catalog.MetaEditor;
46 import org.apache.hadoop.hbase.executor.EventType;
47 import org.apache.hadoop.hbase.util.Bytes;
48 import org.apache.hadoop.hbase.util.CancelableProgressable;
49 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
50 import org.apache.hadoop.hbase.util.HasThread;
51 import org.apache.hadoop.hbase.util.PairOfSameType;
52 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
53 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
54 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
55 import org.apache.zookeeper.KeeperException;
56 import org.apache.zookeeper.KeeperException.NodeExistsException;
57 import org.apache.zookeeper.data.Stat;
58
59 import com.google.common.util.concurrent.ThreadFactoryBuilder;
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84 @InterfaceAudience.Private
85 public class SplitTransaction {
/** Logger shared by every instance of this class. */
private static final Log LOG = LogFactory.getLog(SplitTransaction.class);

/** Region being split; handed to the constructor. */
private final HRegion parent;
/** First daughter: parent's start key up to the split row. Built by prepare(). */
private HRegionInfo hri_a;
/** Second daughter: the split row up to the parent's end key. Built by prepare(). */
private HRegionInfo hri_b;
/**
 * Milliseconds to wait for the store file splitters to finish. Defaults to
 * 30000 and may be replaced from "hbase.regionserver.fileSplitTimeout".
 */
private long fileSplitTimeout = 30000;
/** Last version seen for the parent's split znode; -1 until one has been read. */
private int znodeVersion = -1;

/** Row at which the parent region is split; handed to the constructor. */
private final byte [] splitrow;
101
102
103
104
105
106
/**
 * Steps of the split that are recorded in the journal as they are reached,
 * so that rollback() knows what has to be undone.
 */
enum JournalEntry {
  /**
   * Recorded once the split znode step has been passed; rollback cleans the
   * znode up when a ZooKeeper connection is available.
   */
  SET_SPLITTING_IN_ZK,
  /**
   * The splits directory was created; rollback removes it again.
   */
  CREATE_SPLIT_DIR,
  /**
   * The parent region was closed by this transaction (or the close failed
   * part way); rollback re-initializes the parent.
   */
  CLOSED_PARENT_REGION,
  /**
   * The parent was taken out of the online regions; rollback puts it back.
   */
  OFFLINED_PARENT,
  /**
   * Creation of the first daughter was started; rollback cleans up its
   * directory.
   */
  STARTED_REGION_A_CREATION,
  /**
   * Creation of the second daughter was started; rollback cleans up its
   * directory.
   */
  STARTED_REGION_B_CREATION,
  /**
   * Point of no return. It is journaled just before the catalog is edited;
   * when rollback meets this entry it stops and returns false.
   */
  PONR
}
139
140
141
142
/**
 * Steps reached so far, in the order they were reached. rollback() walks
 * this list from the newest entry back to the oldest.
 */
private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
144
145
146
147
148
149
/**
 * Creates a transaction for splitting the given region at the given row.
 * Nothing is validated here; call {@link #prepare()} before executing.
 *
 * @param r region to split
 * @param splitrow row at which to split the region
 */
public SplitTransaction(final HRegion r, final byte [] splitrow) {
  this.splitrow = splitrow;
  this.parent = r;
}
154
155
156
157
158
159
160 public boolean prepare() {
161 if (!this.parent.isSplittable()) return false;
162
163 if (this.splitrow == null) return false;
164 HRegionInfo hri = this.parent.getRegionInfo();
165 parent.prepareToSplit();
166
167 byte [] startKey = hri.getStartKey();
168 byte [] endKey = hri.getEndKey();
169 if (Bytes.equals(startKey, splitrow) ||
170 !this.parent.getRegionInfo().containsRow(splitrow)) {
171 LOG.info("Split row is not inside region key range or is equal to " +
172 "startkey: " + Bytes.toStringBinary(this.splitrow));
173 return false;
174 }
175 long rid = getDaughterRegionIdTimestamp(hri);
176 this.hri_a = new HRegionInfo(hri.getTable(), startKey, this.splitrow, false, rid);
177 this.hri_b = new HRegionInfo(hri.getTable(), this.splitrow, endKey, false, rid);
178 return true;
179 }
180
181
182
183
184
185
186 private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
187 long rid = EnvironmentEdgeManager.currentTimeMillis();
188
189
190 if (rid < hri.getRegionId()) {
191 LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
192 " but current time here is " + rid);
193 rid = hri.getRegionId() + 1;
194 }
195 return rid;
196 }
197
198 private static IOException closedByOtherException = new IOException(
199 "Failed to close region: already closed by another thread");
200
201
202
203
204
205
206
207
208
209
210
211 final RegionServerServices services) throws IOException {
212 LOG.info("Starting split of region " + this.parent);
213 if ((server != null && server.isStopped()) ||
214 (services != null && services.isStopping())) {
215 throw new IOException("Server is stopped or stopping");
216 }
217 assert !this.parent.lock.writeLock().isHeldByCurrentThread():
218 "Unsafe to hold write lock while performing RPCs";
219
220
221 if (this.parent.getCoprocessorHost() != null) {
222 this.parent.getCoprocessorHost().preSplit();
223 }
224
225
226 if (this.parent.getCoprocessorHost() != null) {
227 this.parent.getCoprocessorHost().preSplit(this.splitrow);
228 }
229
230
231 boolean testing = server == null? true:
232 server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
233 this.fileSplitTimeout = testing ? this.fileSplitTimeout :
234 server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
235 this.fileSplitTimeout);
236
237
238
239 if (server != null && server.getZooKeeper() != null) {
240 try {
241 createNodeSplitting(server.getZooKeeper(),
242 parent.getRegionInfo(), server.getServerName(), hri_a, hri_b);
243 } catch (KeeperException e) {
244 throw new IOException("Failed creating PENDING_SPLIT znode on " +
245 this.parent.getRegionNameAsString(), e);
246 }
247 }
248 this.journal.add(JournalEntry.SET_SPLITTING_IN_ZK);
249 if (server != null && server.getZooKeeper() != null) {
250
251
252
253 znodeVersion = getZKNode(server, services);
254 }
255
256 this.parent.getRegionFileSystem().createSplitsDir();
257 this.journal.add(JournalEntry.CREATE_SPLIT_DIR);
258
259 Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
260 Exception exceptionToThrow = null;
261 try{
262 hstoreFilesToSplit = this.parent.close(false);
263 } catch (Exception e) {
264 exceptionToThrow = e;
265 }
266 if (exceptionToThrow == null && hstoreFilesToSplit == null) {
267
268
269
270
271
272 exceptionToThrow = closedByOtherException;
273 }
274 if (exceptionToThrow != closedByOtherException) {
275 this.journal.add(JournalEntry.CLOSED_PARENT_REGION);
276 }
277 if (exceptionToThrow != null) {
278 if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
279 throw new IOException(exceptionToThrow);
280 }
281 if (!testing) {
282 services.removeFromOnlineRegions(this.parent, null);
283 }
284 this.journal.add(JournalEntry.OFFLINED_PARENT);
285
286
287
288
289
290
291
292 splitStoreFiles(hstoreFilesToSplit);
293
294
295
296
297
298 this.journal.add(JournalEntry.STARTED_REGION_A_CREATION);
299 HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
300
301
302 this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
303 HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320 this.journal.add(JournalEntry.PONR);
321
322
323
324
325
326
327 if (!testing) {
328 MetaEditor.splitRegion(server.getCatalogTracker(), parent.getRegionInfo(),
329 a.getRegionInfo(), b.getRegionInfo(), server.getServerName());
330 }
331 return new PairOfSameType<HRegion>(a, b);
332 }
333
334
335
336
337
338
339
340
341
342
343
344
345 final RegionServerServices services, HRegion a, HRegion b)
346 throws IOException {
347 boolean stopped = server != null && server.isStopped();
348 boolean stopping = services != null && services.isStopping();
349
350 if (stopped || stopping) {
351 LOG.info("Not opening daughters " +
352 b.getRegionInfo().getRegionNameAsString() +
353 " and " +
354 a.getRegionInfo().getRegionNameAsString() +
355 " because stopping=" + stopping + ", stopped=" + stopped);
356 } else {
357
358 DaughterOpener aOpener = new DaughterOpener(server, a);
359 DaughterOpener bOpener = new DaughterOpener(server, b);
360 aOpener.start();
361 bOpener.start();
362 try {
363 aOpener.join();
364 bOpener.join();
365 } catch (InterruptedException e) {
366 Thread.currentThread().interrupt();
367 throw new IOException("Interrupted " + e.getMessage());
368 }
369 if (aOpener.getException() != null) {
370 throw new IOException("Failed " +
371 aOpener.getName(), aOpener.getException());
372 }
373 if (bOpener.getException() != null) {
374 throw new IOException("Failed " +
375 bOpener.getName(), bOpener.getException());
376 }
377 if (services != null) {
378 try {
379
380 services.postOpenDeployTasks(b, server.getCatalogTracker());
381
382 services.addToOnlineRegions(b);
383 services.postOpenDeployTasks(a, server.getCatalogTracker());
384 services.addToOnlineRegions(a);
385 } catch (KeeperException ke) {
386 throw new IOException(ke);
387 }
388 }
389 }
390 }
391
392
393
394
395
396
397
398
399
400
401
402
403 final RegionServerServices services, HRegion a, HRegion b)
404 throws IOException {
405
406 if (server != null && server.getZooKeeper() != null) {
407 try {
408 this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
409 parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
410 server.getServerName(), this.znodeVersion,
411 RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
412
413 int spins = 0;
414
415
416
417 do {
418 if (spins % 10 == 0) {
419 LOG.debug("Still waiting on the master to process the split for " +
420 this.parent.getRegionInfo().getEncodedName());
421 }
422 Thread.sleep(100);
423
424 this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
425 parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
426 server.getServerName(), this.znodeVersion,
427 RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT);
428 spins++;
429 } while (this.znodeVersion != -1 && !server.isStopped()
430 && !services.isStopping());
431 } catch (Exception e) {
432 if (e instanceof InterruptedException) {
433 Thread.currentThread().interrupt();
434 }
435 throw new IOException("Failed telling master about split", e);
436 }
437 }
438
439
440 if (this.parent.getCoprocessorHost() != null) {
441 this.parent.getCoprocessorHost().postSplit(a,b);
442 }
443
444
445
446
447 }
448
449
450
451
452
453
454
455
456
457 private int getZKNode(final Server server,
458 final RegionServerServices services) throws IOException {
459
460 try {
461 int spins = 0;
462 Stat stat = new Stat();
463 ZooKeeperWatcher zkw = server.getZooKeeper();
464 ServerName expectedServer = server.getServerName();
465 String node = parent.getRegionInfo().getEncodedName();
466 while (!(server.isStopped() || services.isStopping())) {
467 if (spins % 5 == 0) {
468 LOG.debug("Still waiting for master to process "
469 + "the pending_split for " + node);
470 transitionSplittingNode(zkw, parent.getRegionInfo(),
471 hri_a, hri_b, expectedServer, -1, RS_ZK_REQUEST_REGION_SPLIT,
472 RS_ZK_REQUEST_REGION_SPLIT);
473 }
474 Thread.sleep(100);
475 spins++;
476 byte [] data = ZKAssign.getDataNoWatch(zkw, node, stat);
477 if (data == null) {
478 throw new IOException("Data is null, splitting node "
479 + node + " no longer exists");
480 }
481 RegionTransition rt = RegionTransition.parseFrom(data);
482 EventType et = rt.getEventType();
483 if (et == RS_ZK_REGION_SPLITTING) {
484 ServerName serverName = rt.getServerName();
485 if (!serverName.equals(expectedServer)) {
486 throw new IOException("Splitting node " + node + " is for "
487 + serverName + ", not us " + expectedServer);
488 }
489 byte [] payloadOfSplitting = rt.getPayload();
490 List<HRegionInfo> splittingRegions = HRegionInfo.parseDelimitedFrom(
491 payloadOfSplitting, 0, payloadOfSplitting.length);
492 assert splittingRegions.size() == 2;
493 HRegionInfo a = splittingRegions.get(0);
494 HRegionInfo b = splittingRegions.get(1);
495 if (!(hri_a.equals(a) && hri_b.equals(b))) {
496 throw new IOException("Splitting node " + node + " is for " + a + ", "
497 + b + ", not expected daughters: " + hri_a + ", " + hri_b);
498 }
499
500 return stat.getVersion();
501 }
502 if (et != RS_ZK_REQUEST_REGION_SPLIT) {
503 throw new IOException("Splitting node " + node
504 + " moved out of splitting to " + et);
505 }
506 }
507
508 throw new IOException("Server is "
509 + (services.isStopping() ? "stopping" : "stopped"));
510 } catch (Exception e) {
511 if (e instanceof InterruptedException) {
512 Thread.currentThread().interrupt();
513 }
514 throw new IOException("Failed getting SPLITTING znode on "
515 + parent.getRegionNameAsString(), e);
516 }
517 }
518
519
520
521
522
523
524
525
526
527
528
529
530 public PairOfSameType<HRegion> execute(final Server server,
531 final RegionServerServices services)
532 throws IOException {
533 PairOfSameType<HRegion> regions = createDaughters(server, services);
534 openDaughters(server, services, regions.getFirst(), regions.getSecond());
535 transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
536 return regions;
537 }
538
539
540
541
542
543 class DaughterOpener extends HasThread {
544 private final Server server;
545 private final HRegion r;
546 private Throwable t = null;
547
548 DaughterOpener(final Server s, final HRegion r) {
549 super((s == null? "null-services": s.getServerName()) +
550 "-daughterOpener=" + r.getRegionInfo().getEncodedName());
551 setDaemon(true);
552 this.server = s;
553 this.r = r;
554 }
555
556
557
558
559
560 Throwable getException() {
561 return this.t;
562 }
563
564 @Override
565 public void run() {
566 try {
567 openDaughterRegion(this.server, r);
568 } catch (Throwable t) {
569 this.t = t;
570 }
571 }
572 }
573
574
575
576
577
578
579
580
581 void openDaughterRegion(final Server server, final HRegion daughter)
582 throws IOException, KeeperException {
583 HRegionInfo hri = daughter.getRegionInfo();
584 LoggingProgressable reporter = server == null ? null
585 : new LoggingProgressable(hri, server.getConfiguration().getLong(
586 "hbase.regionserver.split.daughter.open.log.interval", 10000));
587 daughter.openHRegion(reporter);
588 }
589
590 static class LoggingProgressable implements CancelableProgressable {
591 private final HRegionInfo hri;
592 private long lastLog = -1;
593 private final long interval;
594
595 LoggingProgressable(final HRegionInfo hri, final long interval) {
596 this.hri = hri;
597 this.interval = interval;
598 }
599
600 @Override
601 public boolean progress() {
602 long now = System.currentTimeMillis();
603 if (now - lastLog > this.interval) {
604 LOG.info("Opening " + this.hri.getRegionNameAsString());
605 this.lastLog = now;
606 }
607 return true;
608 }
609 }
610
611 private void splitStoreFiles(final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
612 throws IOException {
613 if (hstoreFilesToSplit == null) {
614
615 throw new IOException("Close returned empty list of StoreFiles");
616 }
617
618
619
620 int nbFiles = hstoreFilesToSplit.size();
621 if (nbFiles == 0) {
622
623 return;
624 }
625 ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
626 builder.setNameFormat("StoreFileSplitter-%1$d");
627 ThreadFactory factory = builder.build();
628 ThreadPoolExecutor threadPool =
629 (ThreadPoolExecutor) Executors.newFixedThreadPool(nbFiles, factory);
630 List<Future<Void>> futures = new ArrayList<Future<Void>>(nbFiles);
631
632
633 for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
634 for (StoreFile sf: entry.getValue()) {
635 StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
636 futures.add(threadPool.submit(sfs));
637 }
638 }
639
640 threadPool.shutdown();
641
642
643 try {
644 boolean stillRunning = !threadPool.awaitTermination(
645 this.fileSplitTimeout, TimeUnit.MILLISECONDS);
646 if (stillRunning) {
647 threadPool.shutdownNow();
648
649 while (!threadPool.isTerminated()) {
650 Thread.sleep(50);
651 }
652 throw new IOException("Took too long to split the" +
653 " files and create the references, aborting split");
654 }
655 } catch (InterruptedException e) {
656 Thread.currentThread().interrupt();
657 throw new IOException("Interrupted while waiting for file splitters", e);
658 }
659
660
661 for (Future<Void> future: futures) {
662 try {
663 future.get();
664 } catch (InterruptedException e) {
665 Thread.currentThread().interrupt();
666 throw new IOException(
667 "Interrupted while trying to get the results of file splitters", e);
668 } catch (ExecutionException e) {
669 throw new IOException(e);
670 }
671 }
672 }
673
674 private void splitStoreFile(final byte[] family, final StoreFile sf) throws IOException {
675 HRegionFileSystem fs = this.parent.getRegionFileSystem();
676 String familyName = Bytes.toString(family);
677 fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow, false);
678 fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow, true);
679 }
680
681
682
683
684
685 class StoreFileSplitter implements Callable<Void> {
686 private final byte[] family;
687 private final StoreFile sf;
688
689
690
691
692
693
694 public StoreFileSplitter(final byte[] family, final StoreFile sf) {
695 this.sf = sf;
696 this.family = family;
697 }
698
699 public Void call() throws IOException {
700 splitStoreFile(family, sf);
701 return null;
702 }
703 }
704
705
706
707
708
709
710
711
712 @SuppressWarnings("deprecation")
713 public boolean rollback(final Server server, final RegionServerServices services)
714 throws IOException {
715
716 if (this.parent.getCoprocessorHost() != null) {
717 this.parent.getCoprocessorHost().preRollBackSplit();
718 }
719
720 boolean result = true;
721 ListIterator<JournalEntry> iterator =
722 this.journal.listIterator(this.journal.size());
723
724 while (iterator.hasPrevious()) {
725 JournalEntry je = iterator.previous();
726 switch(je) {
727
728 case SET_SPLITTING_IN_ZK:
729 if (server != null && server.getZooKeeper() != null) {
730 cleanZK(server, this.parent.getRegionInfo());
731 }
732 break;
733
734 case CREATE_SPLIT_DIR:
735 this.parent.writestate.writesEnabled = true;
736 this.parent.getRegionFileSystem().cleanupSplitsDir();
737 break;
738
739 case CLOSED_PARENT_REGION:
740 try {
741
742
743
744
745
746 this.parent.initialize();
747 } catch (IOException e) {
748 LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
749 this.parent.getRegionNameAsString(), e);
750 throw new RuntimeException(e);
751 }
752 break;
753
754 case STARTED_REGION_A_CREATION:
755 this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
756 break;
757
758 case STARTED_REGION_B_CREATION:
759 this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
760 break;
761
762 case OFFLINED_PARENT:
763 if (services != null) services.addToOnlineRegions(this.parent);
764 break;
765
766 case PONR:
767
768
769
770
771 return false;
772
773 default:
774 throw new RuntimeException("Unhandled journal entry: " + je);
775 }
776 }
777
778 if (this.parent.getCoprocessorHost() != null) {
779 this.parent.getCoprocessorHost().postRollBackSplit();
780 }
781 return result;
782 }
783
784 HRegionInfo getFirstDaughter() {
785 return hri_a;
786 }
787
788 HRegionInfo getSecondDaughter() {
789 return hri_b;
790 }
791
792 private static void cleanZK(final Server server, final HRegionInfo hri) {
793 try {
794
795 if (!ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
796 RS_ZK_REQUEST_REGION_SPLIT, server.getServerName())) {
797 ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
798 RS_ZK_REGION_SPLITTING, server.getServerName());
799 }
800 } catch (KeeperException.NoNodeException e) {
801 LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
802 } catch (KeeperException e) {
803 server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
804 }
805 }
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820 public static void createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
821 final ServerName serverName, final HRegionInfo a,
822 final HRegionInfo b) throws KeeperException, IOException {
823 LOG.debug(zkw.prefix("Creating ephemeral node for " +
824 region.getEncodedName() + " in PENDING_SPLIT state"));
825 byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
826 RegionTransition rt = RegionTransition.createRegionTransition(
827 RS_ZK_REQUEST_REGION_SPLIT, region.getRegionName(), serverName, payload);
828 String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
829 if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
830 throw new IOException("Failed create of ephemeral " + node);
831 }
832 }
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868 public static int transitionSplittingNode(ZooKeeperWatcher zkw,
869 HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
870 final int znodeVersion, final EventType beginState,
871 final EventType endState) throws KeeperException, IOException {
872 byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
873 return ZKAssign.transitionNode(zkw, parent, serverName,
874 beginState, endState, znodeVersion, payload);
875 }
876 }