1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.zookeeper;
20
21 import java.io.IOException;
22 import java.lang.management.ManagementFactory;
23 import java.security.SecureRandom;
24 import java.util.ArrayList;
25 import java.util.LinkedList;
26 import java.util.List;
27 import java.util.Random;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.hadoop.hbase.classification.InterfaceAudience;
32 import org.apache.hadoop.hbase.util.Bytes;
33 import org.apache.hadoop.hbase.util.RetryCounter;
34 import org.apache.hadoop.hbase.util.RetryCounterFactory;
35 import org.apache.zookeeper.AsyncCallback;
36 import org.apache.zookeeper.CreateMode;
37 import org.apache.zookeeper.KeeperException;
38 import org.apache.zookeeper.Op;
39 import org.apache.zookeeper.OpResult;
40 import org.apache.zookeeper.Watcher;
41 import org.apache.zookeeper.ZooDefs;
42 import org.apache.zookeeper.ZooKeeper;
43 import org.apache.zookeeper.ZooKeeper.States;
44 import org.apache.zookeeper.data.ACL;
45 import org.apache.zookeeper.data.Stat;
46 import org.apache.zookeeper.proto.CreateRequest;
47 import org.apache.zookeeper.proto.SetDataRequest;
48 import org.cloudera.htrace.Trace;
49 import org.cloudera.htrace.TraceScope;
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 @InterfaceAudience.Private
75 public class RecoverableZooKeeper {
76 private static final Log LOG = LogFactory.getLog(RecoverableZooKeeper.class);
77
78 private ZooKeeper zk;
79 private final RetryCounterFactory retryCounterFactory;
80
81 private final String identifier;
82 private final byte[] id;
83 private Watcher watcher;
84 private int sessionTimeout;
85 private String quorumServers;
86 private final Random salter;
87
88
89
90
91
92
93
94
95
96 private static final byte MAGIC =(byte) 0XFF;
97 private static final int MAGIC_SIZE = Bytes.SIZEOF_BYTE;
98 private static final int ID_LENGTH_OFFSET = MAGIC_SIZE;
99 private static final int ID_LENGTH_SIZE = Bytes.SIZEOF_INT;
100
/**
 * Creates a wrapper without an explicit identifier.
 *
 * <p>Delegates to the six-argument constructor with a {@code null} identifier.
 *
 * @param quorumServers connect string passed to the ZooKeeper client
 * @param sessionTimeout session timeout passed to the ZooKeeper client
 * @param watcher watcher passed to the ZooKeeper client
 * @param maxRetries number of retries allowed per operation
 * @param retryIntervalMillis interval handed to the retry counter factory
 * @throws IOException declared for callers; see the delegated constructor
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value = "DE_MIGHT_IGNORE",
    justification = "None. Its always been this way.")
public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher,
    int maxRetries, int retryIntervalMillis) throws IOException {
  this(quorumServers, sessionTimeout, watcher, maxRetries, retryIntervalMillis, null);
}
109
/**
 * Creates a wrapper and makes a best-effort attempt to open the first connection.
 *
 * <p>If the initial connection attempt fails, the failure is only logged; every
 * operation calls checkZk(), which tries again to create the handle.
 *
 * @param quorumServers connect string passed to the ZooKeeper client
 * @param sessionTimeout session timeout passed to the ZooKeeper client
 * @param watcher watcher passed to the ZooKeeper client
 * @param maxRetries number of retries allowed per operation
 * @param retryIntervalMillis interval handed to the retry counter factory
 * @param identifier identifier for this client; when {@code null} or empty the
 *     runtime MXBean name of this JVM is used instead
 * @throws IOException declared for callers; not thrown by the visible body
 */
public RecoverableZooKeeper(String quorumServers, int sessionTimeout,
    Watcher watcher, int maxRetries, int retryIntervalMillis, String identifier)
    throws IOException {
  // One more attempt than maxRetries: presumably the initial try plus the retries.
  this.retryCounterFactory =
      new RetryCounterFactory(maxRetries + 1, retryIntervalMillis);

  if (identifier == null || identifier.length() == 0) {
    // No identifier supplied: fall back to the runtime MXBean name.
    identifier = ManagementFactory.getRuntimeMXBean().getName();
  }
  LOG.info("Process identifier=" + identifier +
      " connecting to ZooKeeper ensemble=" + quorumServers);
  this.identifier = identifier;
  this.id = Bytes.toBytes(identifier);

  this.watcher = watcher;
  this.sessionTimeout = sessionTimeout;
  this.quorumServers = quorumServers;

  // The salter must be assigned on every path: it is final and appendMetaData()
  // dereferences it for each non-empty payload.
  this.salter = new SecureRandom();

  try {
    checkZk();
  } catch (Exception x) {
    // Deliberately best effort: a failed first connection must not fail construction.
    LOG.debug("Initial ZooKeeper connection attempt failed; will retry on first use", x);
  }
}
132
133
134
135
136
137
138
139 protected synchronized ZooKeeper checkZk() throws KeeperException {
140 if (this.zk == null) {
141 try {
142 this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
143 } catch (IOException ex) {
144 LOG.warn("Unable to create ZooKeeper Connection", ex);
145 throw new KeeperException.OperationTimeoutException();
146 }
147 }
148 return zk;
149 }
150
151 public synchronized void reconnectAfterExpiration()
152 throws IOException, KeeperException, InterruptedException {
153 if (zk != null) {
154 LOG.info("Closing dead ZooKeeper connection, session" +
155 " was: 0x"+Long.toHexString(zk.getSessionId()));
156 zk.close();
157
158 zk = null;
159 }
160 checkZk();
161 LOG.info("Recreated a ZooKeeper, session" +
162 " is: 0x"+Long.toHexString(zk.getSessionId()));
163 }
164
165
166
167
168
169
170 public void delete(String path, int version)
171 throws InterruptedException, KeeperException {
172 TraceScope traceScope = null;
173 try {
174 traceScope = Trace.startSpan("RecoverableZookeeper.delete");
175 RetryCounter retryCounter = retryCounterFactory.create();
176 boolean isRetry = false;
177 while (true) {
178 try {
179 checkZk().delete(path, version);
180 return;
181 } catch (KeeperException e) {
182 switch (e.code()) {
183 case NONODE:
184 if (isRetry) {
185 LOG.debug("Node " + path + " already deleted. Assuming a " +
186 "previous attempt succeeded.");
187 return;
188 }
189 LOG.debug("Node " + path + " already deleted, retry=" + isRetry);
190 throw e;
191
192 case CONNECTIONLOSS:
193 case SESSIONEXPIRED:
194 case OPERATIONTIMEOUT:
195 retryOrThrow(retryCounter, e, "delete");
196 break;
197
198 default:
199 throw e;
200 }
201 }
202 retryCounter.sleepUntilNextRetry();
203 isRetry = true;
204 }
205 } finally {
206 if (traceScope != null) traceScope.close();
207 }
208 }
209
210
211
212
213
214 public Stat exists(String path, Watcher watcher)
215 throws KeeperException, InterruptedException {
216 TraceScope traceScope = null;
217 try {
218 traceScope = Trace.startSpan("RecoverableZookeeper.exists");
219 RetryCounter retryCounter = retryCounterFactory.create();
220 while (true) {
221 try {
222 return checkZk().exists(path, watcher);
223 } catch (KeeperException e) {
224 switch (e.code()) {
225 case CONNECTIONLOSS:
226 case SESSIONEXPIRED:
227 case OPERATIONTIMEOUT:
228 retryOrThrow(retryCounter, e, "exists");
229 break;
230
231 default:
232 throw e;
233 }
234 }
235 retryCounter.sleepUntilNextRetry();
236 }
237 } finally {
238 if (traceScope != null) traceScope.close();
239 }
240 }
241
242
243
244
245
246 public Stat exists(String path, boolean watch)
247 throws KeeperException, InterruptedException {
248 TraceScope traceScope = null;
249 try {
250 traceScope = Trace.startSpan("RecoverableZookeeper.exists");
251 RetryCounter retryCounter = retryCounterFactory.create();
252 while (true) {
253 try {
254 return checkZk().exists(path, watch);
255 } catch (KeeperException e) {
256 switch (e.code()) {
257 case CONNECTIONLOSS:
258 case SESSIONEXPIRED:
259 case OPERATIONTIMEOUT:
260 retryOrThrow(retryCounter, e, "exists");
261 break;
262
263 default:
264 throw e;
265 }
266 }
267 retryCounter.sleepUntilNextRetry();
268 }
269 } finally {
270 if (traceScope != null) traceScope.close();
271 }
272 }
273
274 private void retryOrThrow(RetryCounter retryCounter, KeeperException e,
275 String opName) throws KeeperException {
276 LOG.debug("Possibly transient ZooKeeper, quorum=" + quorumServers + ", exception=" + e);
277 if (!retryCounter.shouldRetry()) {
278 LOG.error("ZooKeeper " + opName + " failed after "
279 + retryCounter.getMaxAttempts() + " attempts");
280 throw e;
281 }
282 }
283
284
285
286
287
288 public List<String> getChildren(String path, Watcher watcher)
289 throws KeeperException, InterruptedException {
290 TraceScope traceScope = null;
291 try {
292 traceScope = Trace.startSpan("RecoverableZookeeper.getChildren");
293 RetryCounter retryCounter = retryCounterFactory.create();
294 while (true) {
295 try {
296 return checkZk().getChildren(path, watcher);
297 } catch (KeeperException e) {
298 switch (e.code()) {
299 case CONNECTIONLOSS:
300 case SESSIONEXPIRED:
301 case OPERATIONTIMEOUT:
302 retryOrThrow(retryCounter, e, "getChildren");
303 break;
304
305 default:
306 throw e;
307 }
308 }
309 retryCounter.sleepUntilNextRetry();
310 }
311 } finally {
312 if (traceScope != null) traceScope.close();
313 }
314 }
315
316
317
318
319
320 public List<String> getChildren(String path, boolean watch)
321 throws KeeperException, InterruptedException {
322 TraceScope traceScope = null;
323 try {
324 traceScope = Trace.startSpan("RecoverableZookeeper.getChildren");
325 RetryCounter retryCounter = retryCounterFactory.create();
326 while (true) {
327 try {
328 return checkZk().getChildren(path, watch);
329 } catch (KeeperException e) {
330 switch (e.code()) {
331 case CONNECTIONLOSS:
332 case SESSIONEXPIRED:
333 case OPERATIONTIMEOUT:
334 retryOrThrow(retryCounter, e, "getChildren");
335 break;
336
337 default:
338 throw e;
339 }
340 }
341 retryCounter.sleepUntilNextRetry();
342 }
343 } finally {
344 if (traceScope != null) traceScope.close();
345 }
346 }
347
348
349
350
351
352 public byte[] getData(String path, Watcher watcher, Stat stat)
353 throws KeeperException, InterruptedException {
354 TraceScope traceScope = null;
355 try {
356 traceScope = Trace.startSpan("RecoverableZookeeper.getData");
357 RetryCounter retryCounter = retryCounterFactory.create();
358 while (true) {
359 try {
360 byte[] revData = checkZk().getData(path, watcher, stat);
361 return this.removeMetaData(revData);
362 } catch (KeeperException e) {
363 switch (e.code()) {
364 case CONNECTIONLOSS:
365 case SESSIONEXPIRED:
366 case OPERATIONTIMEOUT:
367 retryOrThrow(retryCounter, e, "getData");
368 break;
369
370 default:
371 throw e;
372 }
373 }
374 retryCounter.sleepUntilNextRetry();
375 }
376 } finally {
377 if (traceScope != null) traceScope.close();
378 }
379 }
380
381
382
383
384
385 public byte[] getData(String path, boolean watch, Stat stat)
386 throws KeeperException, InterruptedException {
387 TraceScope traceScope = null;
388 try {
389 traceScope = Trace.startSpan("RecoverableZookeeper.getData");
390 RetryCounter retryCounter = retryCounterFactory.create();
391 while (true) {
392 try {
393 byte[] revData = checkZk().getData(path, watch, stat);
394 return this.removeMetaData(revData);
395 } catch (KeeperException e) {
396 switch (e.code()) {
397 case CONNECTIONLOSS:
398 case SESSIONEXPIRED:
399 case OPERATIONTIMEOUT:
400 retryOrThrow(retryCounter, e, "getData");
401 break;
402
403 default:
404 throw e;
405 }
406 }
407 retryCounter.sleepUntilNextRetry();
408 }
409 } finally {
410 if (traceScope != null) traceScope.close();
411 }
412 }
413
414
415
416
417
418
419
420 public Stat setData(String path, byte[] data, int version)
421 throws KeeperException, InterruptedException {
422 TraceScope traceScope = null;
423 try {
424 traceScope = Trace.startSpan("RecoverableZookeeper.setData");
425 RetryCounter retryCounter = retryCounterFactory.create();
426 byte[] newData = appendMetaData(data);
427 boolean isRetry = false;
428 while (true) {
429 try {
430 return checkZk().setData(path, newData, version);
431 } catch (KeeperException e) {
432 switch (e.code()) {
433 case CONNECTIONLOSS:
434 case SESSIONEXPIRED:
435 case OPERATIONTIMEOUT:
436 retryOrThrow(retryCounter, e, "setData");
437 break;
438 case BADVERSION:
439 if (isRetry) {
440
441 try{
442 Stat stat = new Stat();
443 byte[] revData = checkZk().getData(path, false, stat);
444 if(Bytes.compareTo(revData, newData) == 0) {
445
446 return stat;
447 }
448 } catch(KeeperException keeperException){
449
450 throw keeperException;
451 }
452 }
453
454 default:
455 throw e;
456 }
457 }
458 retryCounter.sleepUntilNextRetry();
459 isRetry = true;
460 }
461 } finally {
462 if (traceScope != null) traceScope.close();
463 }
464 }
465
466
467
468
469
470 public List<ACL> getAcl(String path, Stat stat)
471 throws KeeperException, InterruptedException {
472 TraceScope traceScope = null;
473 try {
474 traceScope = Trace.startSpan("RecoverableZookeeper.getAcl");
475 RetryCounter retryCounter = retryCounterFactory.create();
476 while (true) {
477 try {
478 return checkZk().getACL(path, stat);
479 } catch (KeeperException e) {
480 switch (e.code()) {
481 case CONNECTIONLOSS:
482 case SESSIONEXPIRED:
483 case OPERATIONTIMEOUT:
484 retryOrThrow(retryCounter, e, "getAcl");
485 break;
486
487 default:
488 throw e;
489 }
490 }
491 retryCounter.sleepUntilNextRetry();
492 }
493 } finally {
494 if (traceScope != null) traceScope.close();
495 }
496 }
497
498
499
500
501
502 public Stat setAcl(String path, List<ACL> acls, int version)
503 throws KeeperException, InterruptedException {
504 TraceScope traceScope = null;
505 try {
506 traceScope = Trace.startSpan("RecoverableZookeeper.setAcl");
507 RetryCounter retryCounter = retryCounterFactory.create();
508 while (true) {
509 try {
510 return checkZk().setACL(path, acls, version);
511 } catch (KeeperException e) {
512 switch (e.code()) {
513 case CONNECTIONLOSS:
514 case SESSIONEXPIRED:
515 case OPERATIONTIMEOUT:
516 retryOrThrow(retryCounter, e, "setAcl");
517 break;
518
519 default:
520 throw e;
521 }
522 }
523 retryCounter.sleepUntilNextRetry();
524 }
525 } finally {
526 if (traceScope != null) traceScope.close();
527 }
528 }
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545 public String create(String path, byte[] data, List<ACL> acl,
546 CreateMode createMode)
547 throws KeeperException, InterruptedException {
548 TraceScope traceScope = null;
549 try {
550 traceScope = Trace.startSpan("RecoverableZookeeper.create");
551 byte[] newData = appendMetaData(data);
552 switch (createMode) {
553 case EPHEMERAL:
554 case PERSISTENT:
555 return createNonSequential(path, newData, acl, createMode);
556
557 case EPHEMERAL_SEQUENTIAL:
558 case PERSISTENT_SEQUENTIAL:
559 return createSequential(path, newData, acl, createMode);
560
561 default:
562 throw new IllegalArgumentException("Unrecognized CreateMode: " +
563 createMode);
564 }
565 } finally {
566 if (traceScope != null) traceScope.close();
567 }
568 }
569
570 private String createNonSequential(String path, byte[] data, List<ACL> acl,
571 CreateMode createMode) throws KeeperException, InterruptedException {
572 RetryCounter retryCounter = retryCounterFactory.create();
573 boolean isRetry = false;
574 while (true) {
575 try {
576 return checkZk().create(path, data, acl, createMode);
577 } catch (KeeperException e) {
578 switch (e.code()) {
579 case NODEEXISTS:
580 if (isRetry) {
581
582
583
584 byte[] currentData = checkZk().getData(path, false, null);
585 if (currentData != null &&
586 Bytes.compareTo(currentData, data) == 0) {
587
588 return path;
589 }
590 LOG.error("Node " + path + " already exists with " +
591 Bytes.toStringBinary(currentData) + ", could not write " +
592 Bytes.toStringBinary(data));
593 throw e;
594 }
595 LOG.info("Node " + path + " already exists and this is not a " +
596 "retry");
597 throw e;
598
599 case CONNECTIONLOSS:
600 case SESSIONEXPIRED:
601 case OPERATIONTIMEOUT:
602 retryOrThrow(retryCounter, e, "create");
603 break;
604
605 default:
606 throw e;
607 }
608 }
609 retryCounter.sleepUntilNextRetry();
610 isRetry = true;
611 }
612 }
613
614 private String createSequential(String path, byte[] data,
615 List<ACL> acl, CreateMode createMode)
616 throws KeeperException, InterruptedException {
617 RetryCounter retryCounter = retryCounterFactory.create();
618 boolean first = true;
619 String newPath = path+this.identifier;
620 while (true) {
621 try {
622 if (!first) {
623
624 String previousResult = findPreviousSequentialNode(newPath);
625 if (previousResult != null) {
626 return previousResult;
627 }
628 }
629 first = false;
630 return checkZk().create(newPath, data, acl, createMode);
631 } catch (KeeperException e) {
632 switch (e.code()) {
633 case CONNECTIONLOSS:
634 case SESSIONEXPIRED:
635 case OPERATIONTIMEOUT:
636 retryOrThrow(retryCounter, e, "create");
637 break;
638
639 default:
640 throw e;
641 }
642 }
643 retryCounter.sleepUntilNextRetry();
644 }
645 }
646
647
648
649
650 private Iterable<Op> prepareZKMulti(Iterable<Op> ops)
651 throws UnsupportedOperationException {
652 if(ops == null) return null;
653
654 List<Op> preparedOps = new LinkedList<Op>();
655 for (Op op : ops) {
656 if (op.getType() == ZooDefs.OpCode.create) {
657 CreateRequest create = (CreateRequest)op.toRequestRecord();
658 preparedOps.add(Op.create(create.getPath(), appendMetaData(create.getData()),
659 create.getAcl(), create.getFlags()));
660 } else if (op.getType() == ZooDefs.OpCode.delete) {
661
662 preparedOps.add(op);
663 } else if (op.getType() == ZooDefs.OpCode.setData) {
664 SetDataRequest setData = (SetDataRequest)op.toRequestRecord();
665 preparedOps.add(Op.setData(setData.getPath(), appendMetaData(setData.getData()),
666 setData.getVersion()));
667 } else {
668 throw new UnsupportedOperationException("Unexpected ZKOp type: " + op.getClass().getName());
669 }
670 }
671 return preparedOps;
672 }
673
674
675
676
677 public List<OpResult> multi(Iterable<Op> ops)
678 throws KeeperException, InterruptedException {
679 TraceScope traceScope = null;
680 try {
681 traceScope = Trace.startSpan("RecoverableZookeeper.multi");
682 RetryCounter retryCounter = retryCounterFactory.create();
683 Iterable<Op> multiOps = prepareZKMulti(ops);
684 while (true) {
685 try {
686 return checkZk().multi(multiOps);
687 } catch (KeeperException e) {
688 switch (e.code()) {
689 case CONNECTIONLOSS:
690 case SESSIONEXPIRED:
691 case OPERATIONTIMEOUT:
692 retryOrThrow(retryCounter, e, "multi");
693 break;
694
695 default:
696 throw e;
697 }
698 }
699 retryCounter.sleepUntilNextRetry();
700 }
701 } finally {
702 if (traceScope != null) traceScope.close();
703 }
704 }
705
706 private String findPreviousSequentialNode(String path)
707 throws KeeperException, InterruptedException {
708 int lastSlashIdx = path.lastIndexOf('/');
709 assert(lastSlashIdx != -1);
710 String parent = path.substring(0, lastSlashIdx);
711 String nodePrefix = path.substring(lastSlashIdx+1);
712
713 List<String> nodes = checkZk().getChildren(parent, false);
714 List<String> matching = filterByPrefix(nodes, nodePrefix);
715 for (String node : matching) {
716 String nodePath = parent + "/" + node;
717 Stat stat = checkZk().exists(nodePath, false);
718 if (stat != null) {
719 return nodePath;
720 }
721 }
722 return null;
723 }
724
725 public byte[] removeMetaData(byte[] data) {
726 if(data == null || data.length == 0) {
727 return data;
728 }
729
730 byte magic = data[0];
731 if(magic != MAGIC) {
732 return data;
733 }
734
735 int idLength = Bytes.toInt(data, ID_LENGTH_OFFSET);
736 int dataLength = data.length-MAGIC_SIZE-ID_LENGTH_SIZE-idLength;
737 int dataOffset = MAGIC_SIZE+ID_LENGTH_SIZE+idLength;
738
739 byte[] newData = new byte[dataLength];
740 System.arraycopy(data, dataOffset, newData, 0, dataLength);
741 return newData;
742 }
743
744 private byte[] appendMetaData(byte[] data) {
745 if(data == null || data.length == 0){
746 return data;
747 }
748 byte[] salt = Bytes.toBytes(salter.nextLong());
749 int idLength = id.length + salt.length;
750 byte[] newData = new byte[MAGIC_SIZE+ID_LENGTH_SIZE+idLength+data.length];
751 int pos = 0;
752 pos = Bytes.putByte(newData, pos, MAGIC);
753 pos = Bytes.putInt(newData, pos, idLength);
754 pos = Bytes.putBytes(newData, pos, id, 0, id.length);
755 pos = Bytes.putBytes(newData, pos, salt, 0, salt.length);
756 pos = Bytes.putBytes(newData, pos, data, 0, data.length);
757 return newData;
758 }
759
760 public synchronized long getSessionId() {
761 return zk == null ? null : zk.getSessionId();
762 }
763
764 public synchronized void close() throws InterruptedException {
765 if (zk != null) zk.close();
766 }
767
768 public synchronized States getState() {
769 return zk == null ? null : zk.getState();
770 }
771
772 public synchronized ZooKeeper getZooKeeper() {
773 return zk;
774 }
775
776 public synchronized byte[] getSessionPasswd() {
777 return zk == null ? null : zk.getSessionPasswd();
778 }
779
780 public void sync(String path, AsyncCallback.VoidCallback cb, Object ctx) throws KeeperException {
781 checkZk().sync(path, null, null);
782 }
783
784
785
786
787
788
789
790
791
792
793 private static List<String> filterByPrefix(List<String> nodes,
794 String... prefixes) {
795 List<String> lockChildren = new ArrayList<String>();
796 for (String child : nodes){
797 for (String prefix : prefixes){
798 if (child.startsWith(prefix)){
799 lockChildren.add(child);
800 break;
801 }
802 }
803 }
804 return lockChildren;
805 }
806
807 public String getIdentifier() {
808 return identifier;
809 }
810 }