1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.zookeeper;
20
21 import java.io.IOException;
22 import java.lang.management.ManagementFactory;
23 import java.util.ArrayList;
24 import java.util.LinkedList;
25 import java.util.List;
26 import java.util.Random;
27
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30 import org.apache.hadoop.hbase.classification.InterfaceAudience;
31 import org.apache.hadoop.hbase.util.Bytes;
32 import org.apache.hadoop.hbase.util.RetryCounter;
33 import org.apache.hadoop.hbase.util.RetryCounterFactory;
34 import org.apache.zookeeper.AsyncCallback;
35 import org.apache.zookeeper.CreateMode;
36 import org.apache.zookeeper.KeeperException;
37 import org.apache.zookeeper.Op;
38 import org.apache.zookeeper.OpResult;
39 import org.apache.zookeeper.Watcher;
40 import org.apache.zookeeper.ZooDefs;
41 import org.apache.zookeeper.ZooKeeper;
42 import org.apache.zookeeper.ZooKeeper.States;
43 import org.apache.zookeeper.data.ACL;
44 import org.apache.zookeeper.data.Stat;
45 import org.apache.zookeeper.proto.CreateRequest;
46 import org.apache.zookeeper.proto.SetDataRequest;
47 import org.cloudera.htrace.Trace;
48 import org.cloudera.htrace.TraceScope;
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 @InterfaceAudience.Private
74 public class RecoverableZooKeeper {
75 private static final Log LOG = LogFactory.getLog(RecoverableZooKeeper.class);
76
77 private ZooKeeper zk;
78 private final RetryCounterFactory retryCounterFactory;
79
80 private final String identifier;
81 private final byte[] id;
82 private Watcher watcher;
83 private int sessionTimeout;
84 private String quorumServers;
85 private final Random salter;
86
87
88
89
90
91
92
93
94
95 private static final byte MAGIC =(byte) 0XFF;
96 private static final int MAGIC_SIZE = Bytes.SIZEOF_BYTE;
97 private static final int ID_LENGTH_OFFSET = MAGIC_SIZE;
98 private static final int ID_LENGTH_SIZE = Bytes.SIZEOF_INT;
99
/**
 * Creates a client without an explicit identifier.
 *
 * <p>Delegates to the six-argument constructor, passing {@code null} as the
 * identifier.
 *
 * @param quorumServers ZooKeeper connect string
 * @param sessionTimeout session timeout handed to the ZooKeeper client
 * @param watcher default watcher handed to the ZooKeeper client
 * @param maxRetries number of retries allowed per operation
 * @param retryIntervalMillis interval passed to the retry counter factory
 * @throws IOException declared for callers; see the delegated constructor
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value = "DE_MIGHT_IGNORE",
    justification = "None. Its always been this way.")
public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher,
    int maxRetries, int retryIntervalMillis) throws IOException {
  this(quorumServers, sessionTimeout, watcher, maxRetries, retryIntervalMillis, null);
}
108
/**
 * Creates a client and makes a best-effort first connection attempt.
 *
 * @param quorumServers ZooKeeper connect string
 * @param sessionTimeout session timeout handed to the ZooKeeper client
 * @param watcher default watcher handed to the ZooKeeper client
 * @param maxRetries number of retries allowed per operation; the retry counter
 *     is created with {@code maxRetries + 1} attempts
 * @param retryIntervalMillis interval passed to the retry counter factory
 * @param identifier identifier of this client; when {@code null} or empty the
 *     name reported by the JVM's runtime MXBean is used instead
 * @throws IOException declared for callers; a failed initial connection
 *     attempt is logged and ignored rather than propagated
 */
public RecoverableZooKeeper(String quorumServers, int sessionTimeout,
    Watcher watcher, int maxRetries, int retryIntervalMillis, String identifier)
    throws IOException {
  this.retryCounterFactory =
      new RetryCounterFactory(maxRetries + 1, retryIntervalMillis);

  if (identifier == null || identifier.length() == 0) {
    // No identifier supplied: fall back to the running JVM's name.
    identifier = ManagementFactory.getRuntimeMXBean().getName();
  }
  LOG.info("Process identifier=" + identifier +
      " connecting to ZooKeeper ensemble=" + quorumServers);
  this.identifier = identifier;
  this.id = Bytes.toBytes(identifier);

  this.watcher = watcher;
  this.sessionTimeout = sessionTimeout;
  this.quorumServers = quorumServers;
  // Must be assigned on every path: the field is final and appendMetaData()
  // dereferences it for each write. It must not depend on whether the first
  // connection attempt below succeeds.
  this.salter = new Random();

  try {
    checkZk();
  } catch (Exception x) {
    // Deliberately best-effort: every operation calls checkZk() again, so the
    // handle is created on first use if this attempt failed.
    LOG.debug("Initial ZooKeeper connection attempt failed; will retry on first use", x);
  }
}
131
132
133
134
135
136
137
138 protected synchronized ZooKeeper checkZk() throws KeeperException {
139 if (this.zk == null) {
140 try {
141 this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
142 } catch (IOException ex) {
143 LOG.warn("Unable to create ZooKeeper Connection", ex);
144 throw new KeeperException.OperationTimeoutException();
145 }
146 }
147 return zk;
148 }
149
150 public synchronized void reconnectAfterExpiration()
151 throws IOException, KeeperException, InterruptedException {
152 if (zk != null) {
153 LOG.info("Closing dead ZooKeeper connection, session" +
154 " was: 0x"+Long.toHexString(zk.getSessionId()));
155 zk.close();
156
157 zk = null;
158 }
159 checkZk();
160 LOG.info("Recreated a ZooKeeper, session" +
161 " is: 0x"+Long.toHexString(zk.getSessionId()));
162 }
163
164
165
166
167
168
169 public void delete(String path, int version)
170 throws InterruptedException, KeeperException {
171 TraceScope traceScope = null;
172 try {
173 traceScope = Trace.startSpan("RecoverableZookeeper.delete");
174 RetryCounter retryCounter = retryCounterFactory.create();
175 boolean isRetry = false;
176 while (true) {
177 try {
178 checkZk().delete(path, version);
179 return;
180 } catch (KeeperException e) {
181 switch (e.code()) {
182 case NONODE:
183 if (isRetry) {
184 LOG.debug("Node " + path + " already deleted. Assuming a " +
185 "previous attempt succeeded.");
186 return;
187 }
188 LOG.debug("Node " + path + " already deleted, retry=" + isRetry);
189 throw e;
190
191 case CONNECTIONLOSS:
192 case SESSIONEXPIRED:
193 case OPERATIONTIMEOUT:
194 retryOrThrow(retryCounter, e, "delete");
195 break;
196
197 default:
198 throw e;
199 }
200 }
201 retryCounter.sleepUntilNextRetry();
202 isRetry = true;
203 }
204 } finally {
205 if (traceScope != null) traceScope.close();
206 }
207 }
208
209
210
211
212
213 public Stat exists(String path, Watcher watcher)
214 throws KeeperException, InterruptedException {
215 TraceScope traceScope = null;
216 try {
217 traceScope = Trace.startSpan("RecoverableZookeeper.exists");
218 RetryCounter retryCounter = retryCounterFactory.create();
219 while (true) {
220 try {
221 return checkZk().exists(path, watcher);
222 } catch (KeeperException e) {
223 switch (e.code()) {
224 case CONNECTIONLOSS:
225 case SESSIONEXPIRED:
226 case OPERATIONTIMEOUT:
227 retryOrThrow(retryCounter, e, "exists");
228 break;
229
230 default:
231 throw e;
232 }
233 }
234 retryCounter.sleepUntilNextRetry();
235 }
236 } finally {
237 if (traceScope != null) traceScope.close();
238 }
239 }
240
241
242
243
244
245 public Stat exists(String path, boolean watch)
246 throws KeeperException, InterruptedException {
247 TraceScope traceScope = null;
248 try {
249 traceScope = Trace.startSpan("RecoverableZookeeper.exists");
250 RetryCounter retryCounter = retryCounterFactory.create();
251 while (true) {
252 try {
253 return checkZk().exists(path, watch);
254 } catch (KeeperException e) {
255 switch (e.code()) {
256 case CONNECTIONLOSS:
257 case SESSIONEXPIRED:
258 case OPERATIONTIMEOUT:
259 retryOrThrow(retryCounter, e, "exists");
260 break;
261
262 default:
263 throw e;
264 }
265 }
266 retryCounter.sleepUntilNextRetry();
267 }
268 } finally {
269 if (traceScope != null) traceScope.close();
270 }
271 }
272
273 private void retryOrThrow(RetryCounter retryCounter, KeeperException e,
274 String opName) throws KeeperException {
275 LOG.debug("Possibly transient ZooKeeper, quorum=" + quorumServers + ", exception=" + e);
276 if (!retryCounter.shouldRetry()) {
277 LOG.error("ZooKeeper " + opName + " failed after "
278 + retryCounter.getMaxAttempts() + " attempts");
279 throw e;
280 }
281 }
282
283
284
285
286
287 public List<String> getChildren(String path, Watcher watcher)
288 throws KeeperException, InterruptedException {
289 TraceScope traceScope = null;
290 try {
291 traceScope = Trace.startSpan("RecoverableZookeeper.getChildren");
292 RetryCounter retryCounter = retryCounterFactory.create();
293 while (true) {
294 try {
295 return checkZk().getChildren(path, watcher);
296 } catch (KeeperException e) {
297 switch (e.code()) {
298 case CONNECTIONLOSS:
299 case SESSIONEXPIRED:
300 case OPERATIONTIMEOUT:
301 retryOrThrow(retryCounter, e, "getChildren");
302 break;
303
304 default:
305 throw e;
306 }
307 }
308 retryCounter.sleepUntilNextRetry();
309 }
310 } finally {
311 if (traceScope != null) traceScope.close();
312 }
313 }
314
315
316
317
318
319 public List<String> getChildren(String path, boolean watch)
320 throws KeeperException, InterruptedException {
321 TraceScope traceScope = null;
322 try {
323 traceScope = Trace.startSpan("RecoverableZookeeper.getChildren");
324 RetryCounter retryCounter = retryCounterFactory.create();
325 while (true) {
326 try {
327 return checkZk().getChildren(path, watch);
328 } catch (KeeperException e) {
329 switch (e.code()) {
330 case CONNECTIONLOSS:
331 case SESSIONEXPIRED:
332 case OPERATIONTIMEOUT:
333 retryOrThrow(retryCounter, e, "getChildren");
334 break;
335
336 default:
337 throw e;
338 }
339 }
340 retryCounter.sleepUntilNextRetry();
341 }
342 } finally {
343 if (traceScope != null) traceScope.close();
344 }
345 }
346
347
348
349
350
351 public byte[] getData(String path, Watcher watcher, Stat stat)
352 throws KeeperException, InterruptedException {
353 TraceScope traceScope = null;
354 try {
355 traceScope = Trace.startSpan("RecoverableZookeeper.getData");
356 RetryCounter retryCounter = retryCounterFactory.create();
357 while (true) {
358 try {
359 byte[] revData = checkZk().getData(path, watcher, stat);
360 return this.removeMetaData(revData);
361 } catch (KeeperException e) {
362 switch (e.code()) {
363 case CONNECTIONLOSS:
364 case SESSIONEXPIRED:
365 case OPERATIONTIMEOUT:
366 retryOrThrow(retryCounter, e, "getData");
367 break;
368
369 default:
370 throw e;
371 }
372 }
373 retryCounter.sleepUntilNextRetry();
374 }
375 } finally {
376 if (traceScope != null) traceScope.close();
377 }
378 }
379
380
381
382
383
384 public byte[] getData(String path, boolean watch, Stat stat)
385 throws KeeperException, InterruptedException {
386 TraceScope traceScope = null;
387 try {
388 traceScope = Trace.startSpan("RecoverableZookeeper.getData");
389 RetryCounter retryCounter = retryCounterFactory.create();
390 while (true) {
391 try {
392 byte[] revData = checkZk().getData(path, watch, stat);
393 return this.removeMetaData(revData);
394 } catch (KeeperException e) {
395 switch (e.code()) {
396 case CONNECTIONLOSS:
397 case SESSIONEXPIRED:
398 case OPERATIONTIMEOUT:
399 retryOrThrow(retryCounter, e, "getData");
400 break;
401
402 default:
403 throw e;
404 }
405 }
406 retryCounter.sleepUntilNextRetry();
407 }
408 } finally {
409 if (traceScope != null) traceScope.close();
410 }
411 }
412
413
414
415
416
417
418
419 public Stat setData(String path, byte[] data, int version)
420 throws KeeperException, InterruptedException {
421 TraceScope traceScope = null;
422 try {
423 traceScope = Trace.startSpan("RecoverableZookeeper.setData");
424 RetryCounter retryCounter = retryCounterFactory.create();
425 byte[] newData = appendMetaData(data);
426 boolean isRetry = false;
427 while (true) {
428 try {
429 return checkZk().setData(path, newData, version);
430 } catch (KeeperException e) {
431 switch (e.code()) {
432 case CONNECTIONLOSS:
433 case SESSIONEXPIRED:
434 case OPERATIONTIMEOUT:
435 retryOrThrow(retryCounter, e, "setData");
436 break;
437 case BADVERSION:
438 if (isRetry) {
439
440 try{
441 Stat stat = new Stat();
442 byte[] revData = checkZk().getData(path, false, stat);
443 if(Bytes.compareTo(revData, newData) == 0) {
444
445 return stat;
446 }
447 } catch(KeeperException keeperException){
448
449 throw keeperException;
450 }
451 }
452
453 default:
454 throw e;
455 }
456 }
457 retryCounter.sleepUntilNextRetry();
458 isRetry = true;
459 }
460 } finally {
461 if (traceScope != null) traceScope.close();
462 }
463 }
464
465
466
467
468
469 public List<ACL> getAcl(String path, Stat stat)
470 throws KeeperException, InterruptedException {
471 TraceScope traceScope = null;
472 try {
473 traceScope = Trace.startSpan("RecoverableZookeeper.getAcl");
474 RetryCounter retryCounter = retryCounterFactory.create();
475 while (true) {
476 try {
477 return checkZk().getACL(path, stat);
478 } catch (KeeperException e) {
479 switch (e.code()) {
480 case CONNECTIONLOSS:
481 case SESSIONEXPIRED:
482 case OPERATIONTIMEOUT:
483 retryOrThrow(retryCounter, e, "getAcl");
484 break;
485
486 default:
487 throw e;
488 }
489 }
490 retryCounter.sleepUntilNextRetry();
491 }
492 } finally {
493 if (traceScope != null) traceScope.close();
494 }
495 }
496
497
498
499
500
501 public Stat setAcl(String path, List<ACL> acls, int version)
502 throws KeeperException, InterruptedException {
503 TraceScope traceScope = null;
504 try {
505 traceScope = Trace.startSpan("RecoverableZookeeper.setAcl");
506 RetryCounter retryCounter = retryCounterFactory.create();
507 while (true) {
508 try {
509 return checkZk().setACL(path, acls, version);
510 } catch (KeeperException e) {
511 switch (e.code()) {
512 case CONNECTIONLOSS:
513 case SESSIONEXPIRED:
514 case OPERATIONTIMEOUT:
515 retryOrThrow(retryCounter, e, "setAcl");
516 break;
517
518 default:
519 throw e;
520 }
521 }
522 retryCounter.sleepUntilNextRetry();
523 }
524 } finally {
525 if (traceScope != null) traceScope.close();
526 }
527 }
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544 public String create(String path, byte[] data, List<ACL> acl,
545 CreateMode createMode)
546 throws KeeperException, InterruptedException {
547 TraceScope traceScope = null;
548 try {
549 traceScope = Trace.startSpan("RecoverableZookeeper.create");
550 byte[] newData = appendMetaData(data);
551 switch (createMode) {
552 case EPHEMERAL:
553 case PERSISTENT:
554 return createNonSequential(path, newData, acl, createMode);
555
556 case EPHEMERAL_SEQUENTIAL:
557 case PERSISTENT_SEQUENTIAL:
558 return createSequential(path, newData, acl, createMode);
559
560 default:
561 throw new IllegalArgumentException("Unrecognized CreateMode: " +
562 createMode);
563 }
564 } finally {
565 if (traceScope != null) traceScope.close();
566 }
567 }
568
569 private String createNonSequential(String path, byte[] data, List<ACL> acl,
570 CreateMode createMode) throws KeeperException, InterruptedException {
571 RetryCounter retryCounter = retryCounterFactory.create();
572 boolean isRetry = false;
573 while (true) {
574 try {
575 return checkZk().create(path, data, acl, createMode);
576 } catch (KeeperException e) {
577 switch (e.code()) {
578 case NODEEXISTS:
579 if (isRetry) {
580
581
582
583 byte[] currentData = checkZk().getData(path, false, null);
584 if (currentData != null &&
585 Bytes.compareTo(currentData, data) == 0) {
586
587 return path;
588 }
589 LOG.error("Node " + path + " already exists with " +
590 Bytes.toStringBinary(currentData) + ", could not write " +
591 Bytes.toStringBinary(data));
592 throw e;
593 }
594 LOG.info("Node " + path + " already exists and this is not a " +
595 "retry");
596 throw e;
597
598 case CONNECTIONLOSS:
599 case SESSIONEXPIRED:
600 case OPERATIONTIMEOUT:
601 retryOrThrow(retryCounter, e, "create");
602 break;
603
604 default:
605 throw e;
606 }
607 }
608 retryCounter.sleepUntilNextRetry();
609 isRetry = true;
610 }
611 }
612
613 private String createSequential(String path, byte[] data,
614 List<ACL> acl, CreateMode createMode)
615 throws KeeperException, InterruptedException {
616 RetryCounter retryCounter = retryCounterFactory.create();
617 boolean first = true;
618 String newPath = path+this.identifier;
619 while (true) {
620 try {
621 if (!first) {
622
623 String previousResult = findPreviousSequentialNode(newPath);
624 if (previousResult != null) {
625 return previousResult;
626 }
627 }
628 first = false;
629 return checkZk().create(newPath, data, acl, createMode);
630 } catch (KeeperException e) {
631 switch (e.code()) {
632 case CONNECTIONLOSS:
633 case SESSIONEXPIRED:
634 case OPERATIONTIMEOUT:
635 retryOrThrow(retryCounter, e, "create");
636 break;
637
638 default:
639 throw e;
640 }
641 }
642 retryCounter.sleepUntilNextRetry();
643 }
644 }
645
646
647
648
649 private Iterable<Op> prepareZKMulti(Iterable<Op> ops)
650 throws UnsupportedOperationException {
651 if(ops == null) return null;
652
653 List<Op> preparedOps = new LinkedList<Op>();
654 for (Op op : ops) {
655 if (op.getType() == ZooDefs.OpCode.create) {
656 CreateRequest create = (CreateRequest)op.toRequestRecord();
657 preparedOps.add(Op.create(create.getPath(), appendMetaData(create.getData()),
658 create.getAcl(), create.getFlags()));
659 } else if (op.getType() == ZooDefs.OpCode.delete) {
660
661 preparedOps.add(op);
662 } else if (op.getType() == ZooDefs.OpCode.setData) {
663 SetDataRequest setData = (SetDataRequest)op.toRequestRecord();
664 preparedOps.add(Op.setData(setData.getPath(), appendMetaData(setData.getData()),
665 setData.getVersion()));
666 } else {
667 throw new UnsupportedOperationException("Unexpected ZKOp type: " + op.getClass().getName());
668 }
669 }
670 return preparedOps;
671 }
672
673
674
675
676 public List<OpResult> multi(Iterable<Op> ops)
677 throws KeeperException, InterruptedException {
678 TraceScope traceScope = null;
679 try {
680 traceScope = Trace.startSpan("RecoverableZookeeper.multi");
681 RetryCounter retryCounter = retryCounterFactory.create();
682 Iterable<Op> multiOps = prepareZKMulti(ops);
683 while (true) {
684 try {
685 return checkZk().multi(multiOps);
686 } catch (KeeperException e) {
687 switch (e.code()) {
688 case CONNECTIONLOSS:
689 case SESSIONEXPIRED:
690 case OPERATIONTIMEOUT:
691 retryOrThrow(retryCounter, e, "multi");
692 break;
693
694 default:
695 throw e;
696 }
697 }
698 retryCounter.sleepUntilNextRetry();
699 }
700 } finally {
701 if (traceScope != null) traceScope.close();
702 }
703 }
704
705 private String findPreviousSequentialNode(String path)
706 throws KeeperException, InterruptedException {
707 int lastSlashIdx = path.lastIndexOf('/');
708 assert(lastSlashIdx != -1);
709 String parent = path.substring(0, lastSlashIdx);
710 String nodePrefix = path.substring(lastSlashIdx+1);
711
712 List<String> nodes = checkZk().getChildren(parent, false);
713 List<String> matching = filterByPrefix(nodes, nodePrefix);
714 for (String node : matching) {
715 String nodePath = parent + "/" + node;
716 Stat stat = checkZk().exists(nodePath, false);
717 if (stat != null) {
718 return nodePath;
719 }
720 }
721 return null;
722 }
723
724 public byte[] removeMetaData(byte[] data) {
725 if(data == null || data.length == 0) {
726 return data;
727 }
728
729 byte magic = data[0];
730 if(magic != MAGIC) {
731 return data;
732 }
733
734 int idLength = Bytes.toInt(data, ID_LENGTH_OFFSET);
735 int dataLength = data.length-MAGIC_SIZE-ID_LENGTH_SIZE-idLength;
736 int dataOffset = MAGIC_SIZE+ID_LENGTH_SIZE+idLength;
737
738 byte[] newData = new byte[dataLength];
739 System.arraycopy(data, dataOffset, newData, 0, dataLength);
740 return newData;
741 }
742
743 private byte[] appendMetaData(byte[] data) {
744 if(data == null || data.length == 0){
745 return data;
746 }
747 byte[] salt = Bytes.toBytes(salter.nextLong());
748 int idLength = id.length + salt.length;
749 byte[] newData = new byte[MAGIC_SIZE+ID_LENGTH_SIZE+idLength+data.length];
750 int pos = 0;
751 pos = Bytes.putByte(newData, pos, MAGIC);
752 pos = Bytes.putInt(newData, pos, idLength);
753 pos = Bytes.putBytes(newData, pos, id, 0, id.length);
754 pos = Bytes.putBytes(newData, pos, salt, 0, salt.length);
755 pos = Bytes.putBytes(newData, pos, data, 0, data.length);
756 return newData;
757 }
758
759 public synchronized long getSessionId() {
760 return zk == null ? null : zk.getSessionId();
761 }
762
763 public synchronized void close() throws InterruptedException {
764 if (zk != null) zk.close();
765 }
766
767 public synchronized States getState() {
768 return zk == null ? null : zk.getState();
769 }
770
771 public synchronized ZooKeeper getZooKeeper() {
772 return zk;
773 }
774
775 public synchronized byte[] getSessionPasswd() {
776 return zk == null ? null : zk.getSessionPasswd();
777 }
778
779 public void sync(String path, AsyncCallback.VoidCallback cb, Object ctx) throws KeeperException {
780 checkZk().sync(path, null, null);
781 }
782
783
784
785
786
787
788
789
790
791
792 private static List<String> filterByPrefix(List<String> nodes,
793 String... prefixes) {
794 List<String> lockChildren = new ArrayList<String>();
795 for (String child : nodes){
796 for (String prefix : prefixes){
797 if (child.startsWith(prefix)){
798 lockChildren.add(child);
799 break;
800 }
801 }
802 }
803 return lockChildren;
804 }
805
806 public String getIdentifier() {
807 return identifier;
808 }
809 }