1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.util;
20
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Arrays;
24 import java.util.Collection;
25 import java.util.HashSet;
26 import java.util.LinkedList;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Queue;
30 import java.util.Random;
31 import java.util.Set;
32
33 import org.apache.commons.cli.CommandLine;
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.apache.hadoop.conf.Configuration;
37 import org.apache.hadoop.hbase.ClusterStatus;
38 import org.apache.hadoop.hbase.HBaseCluster;
39 import org.apache.hadoop.hbase.HBaseConfiguration;
40 import org.apache.hadoop.hbase.HRegionInfo;
41 import org.apache.hadoop.hbase.HServerLoad;
42 import org.apache.hadoop.hbase.IntegrationTestingUtility;
43 import org.apache.hadoop.hbase.IntegrationTestDataIngestWithChaosMonkey;
44 import org.apache.hadoop.hbase.ServerName;
45 import org.apache.hadoop.hbase.Stoppable;
46 import org.apache.hadoop.hbase.client.HBaseAdmin;
47 import org.apache.hadoop.util.StringUtils;
48 import org.apache.hadoop.util.ToolRunner;
49
50 import com.google.common.collect.Lists;
51 import com.google.common.collect.Maps;
52 import com.google.protobuf.ServiceException;
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75 public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
76
77 private static final Log LOG = LogFactory.getLog(ChaosMonkey.class);
78
79 private static final long ONE_SEC = 1000;
80 private static final long FIVE_SEC = 5 * ONE_SEC;
81 private static final long ONE_MIN = 60 * ONE_SEC;
82 private static final long TIMEOUT = ONE_MIN;
83
84 final IntegrationTestingUtility util;
85
86
87
88
89
90
/**
 * Creates a monkey whose policies are looked up by name in NAMED_POLICIES.
 *
 * @param util the testing utility giving access to the cluster under test
 * @param policies names of the policies to run
 */
public ChaosMonkey(IntegrationTestingUtility util, String... policies) {
  // Field assignment and name resolution are independent of each other.
  this.util = util;
  this.setPoliciesByName(policies);
}
95
96
97
98
99
100
/**
 * Creates a monkey that runs exactly the given policy instances.
 *
 * @param util the testing utility giving access to the cluster under test
 * @param policies the policies to run; the array is stored as given, not copied
 */
public ChaosMonkey(IntegrationTestingUtility util, Policy... policies) {
  this.policies = policies;
  this.util = util;
}
105
106 private void setPoliciesByName(String... policies) {
107 this.policies = new Policy[policies.length];
108 for (int i=0; i < policies.length; i++) {
109 this.policies[i] = NAMED_POLICIES.get(policies[i]);
110 }
111 }
112
113
114
115
116 private static class ActionContext {
117 private IntegrationTestingUtility util;
118
119 ActionContext(IntegrationTestingUtility util) {
120 this.util = util;
121 }
122
123 IntegrationTestingUtility getHaseIntegrationTestingUtility() {
124 return util;
125 }
126
127 HBaseCluster getHBaseCluster() {
128 return util.getHBaseClusterInterface();
129 }
130 }
131
132
133
134
135 public static class Action {
136
137
138
139
140
141 protected ActionContext context;
142 protected HBaseCluster cluster;
143 protected ClusterStatus initialStatus;
144 protected ServerName[] initialServers;
145
146 void init(ActionContext context) throws Exception {
147 this.context = context;
148 cluster = context.getHBaseCluster();
149 initialStatus = cluster.getInitialClusterStatus();
150 Collection<ServerName> regionServers = initialStatus.getServers();
151 initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
152 }
153
154 void perform() throws Exception { };
155
156
157
158 protected ServerName[] getCurrentServers() throws IOException {
159 Collection<ServerName> regionServers = cluster.getClusterStatus().getServers();
160 return regionServers.toArray(new ServerName[regionServers.size()]);
161 }
162
163 protected void killMaster(ServerName server) throws IOException {
164 LOG.info("Killing master:" + server);
165 cluster.killMaster(server);
166 cluster.waitForMasterToStop(server, TIMEOUT);
167 LOG.info("Killed master server:" + server);
168 }
169
170 protected void startMaster(ServerName server) throws IOException {
171 LOG.info("Starting master:" + server.getHostname());
172 cluster.startMaster(server.getHostname());
173 cluster.waitForActiveAndReadyMaster(TIMEOUT);
174 LOG.info("Started master: " + server);
175 }
176
177 protected void killRs(ServerName server) throws IOException {
178 LOG.info("Killing region server:" + server);
179 cluster.killRegionServer(server);
180 cluster.waitForRegionServerToStop(server, TIMEOUT);
181 LOG.info("Killed region server:" + server + ". Reported num of rs:"
182 + cluster.getClusterStatus().getServersSize());
183 }
184
185 protected void startRs(ServerName server) throws IOException {
186 LOG.info("Starting region server:" + server.getHostname());
187 cluster.startRegionServer(server.getHostname());
188 cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT);
189 LOG.info("Started region server:" + server + ". Reported num of rs:"
190 + cluster.getClusterStatus().getServersSize());
191 }
192 }
193
194 private static class RestartActionBase extends Action {
195 long sleepTime;
196
197 public RestartActionBase(long sleepTime) {
198 this.sleepTime = sleepTime;
199 }
200
201 void sleep(long sleepTime) {
202 LOG.info("Sleeping for:" + sleepTime);
203 Threads.sleep(sleepTime);
204 }
205
206 void restartMaster(ServerName server, long sleepTime) throws IOException {
207 killMaster(server);
208 sleep(sleepTime);
209 startMaster(server);
210 }
211
212 void restartRs(ServerName server, long sleepTime) throws IOException {
213 killRs(server);
214 sleep(sleepTime);
215 startRs(server);
216 }
217 }
218
219 public static class RestartActiveMaster extends RestartActionBase {
220 public RestartActiveMaster(long sleepTime) {
221 super(sleepTime);
222 }
223 @Override
224 void perform() throws Exception {
225 LOG.info("Performing action: Restart active master");
226
227 ServerName master = cluster.getClusterStatus().getMaster();
228 restartMaster(master, sleepTime);
229 }
230 }
231
232 public static class RestartRandomRs extends RestartActionBase {
233 public RestartRandomRs(long sleepTime) {
234 super(sleepTime);
235 }
236
237 @Override
238 void perform() throws Exception {
239 LOG.info("Performing action: Restart random region server");
240 ServerName server = selectRandomItem(getCurrentServers());
241
242 restartRs(server, sleepTime);
243 }
244 }
245
246 public static class RestartRsHoldingMeta extends RestartRandomRs {
247 public RestartRsHoldingMeta(long sleepTime) {
248 super(sleepTime);
249 }
250 @Override
251 void perform() throws Exception {
252 LOG.info("Performing action: Restart region server holding META");
253 ServerName server = cluster.getServerHoldingMeta();
254 if (server == null) {
255 LOG.warn("No server is holding .META. right now.");
256 return;
257 }
258 restartRs(server, sleepTime);
259 }
260 }
261
262 public static class RestartRsHoldingRoot extends RestartRandomRs {
263 public RestartRsHoldingRoot(long sleepTime) {
264 super(sleepTime);
265 }
266 @Override
267 void perform() throws Exception {
268 LOG.info("Performing action: Restart region server holding ROOT");
269 ServerName server = cluster.getServerHoldingMeta();
270 if (server == null) {
271 LOG.warn("No server is holding -ROOT- right now.");
272 return;
273 }
274 restartRs(server, sleepTime);
275 }
276 }
277
278
279
280
281 public static class BatchRestartRs extends RestartActionBase {
282 float ratio;
283
284 public BatchRestartRs(long sleepTime, float ratio) {
285 super(sleepTime);
286 this.ratio = ratio;
287 }
288
289 @Override
290 void perform() throws Exception {
291 LOG.info(String.format("Performing action: Batch restarting %d%% of region servers",
292 (int)(ratio * 100)));
293 List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
294
295 for (ServerName server : selectedServers) {
296 LOG.info("Killing region server:" + server);
297 cluster.killRegionServer(server);
298 }
299
300 for (ServerName server : selectedServers) {
301 cluster.waitForRegionServerToStop(server, TIMEOUT);
302 }
303
304 LOG.info("Killed " + selectedServers.size() + " region servers. Reported num of rs:"
305 + cluster.getClusterStatus().getServersSize());
306
307 sleep(sleepTime);
308
309 for (ServerName server : selectedServers) {
310 LOG.info("Starting region server:" + server.getHostname());
311 cluster.startRegionServer(server.getHostname());
312
313 }
314 for (ServerName server : selectedServers) {
315 cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT);
316 }
317 LOG.info("Started " + selectedServers.size() +" region servers. Reported num of rs:"
318 + cluster.getClusterStatus().getServersSize());
319 }
320 }
321
322
323
324
325
326 public static class RollingBatchRestartRs extends BatchRestartRs {
327 public RollingBatchRestartRs(long sleepTime, float ratio) {
328 super(sleepTime, ratio);
329 }
330
331 @Override
332 void perform() throws Exception {
333 LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
334 (int)(ratio * 100)));
335 Random random = new Random();
336 List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
337
338 Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
339 Queue<ServerName> deadServers = new LinkedList<ServerName>();
340
341
342 while (!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) {
343 boolean action = true;
344
345 if (serversToBeKilled.isEmpty() || deadServers.isEmpty()) {
346 action = deadServers.isEmpty();
347 } else {
348 action = random.nextBoolean();
349 }
350
351 if (action) {
352 ServerName server = serversToBeKilled.remove();
353 killRs(server);
354 deadServers.add(server);
355 } else {
356 ServerName server = deadServers.remove();
357 startRs(server);
358 }
359
360 sleep(random.nextInt((int)sleepTime));
361 }
362 }
363 }
364
365 public static class UnbalanceRegionsAction extends Action {
366 private double fractionOfRegions;
367 private double fractionOfServers;
368 private Random random = new Random();
369
370
371
372
373
374
375
376 public UnbalanceRegionsAction(double fractionOfRegions, double fractionOfServers) {
377 this.fractionOfRegions = fractionOfRegions;
378 this.fractionOfServers = fractionOfServers;
379 }
380
381 @Override
382 void perform() throws Exception {
383 LOG.info("Unbalancing regions");
384 ClusterStatus status = this.cluster.getClusterStatus();
385 List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
386 int targetServerCount = (int)Math.ceil(fractionOfServers * victimServers.size());
387 List<byte[]> targetServers = new ArrayList<byte[]>(targetServerCount);
388 for (int i = 0; i < targetServerCount; ++i) {
389 int victimIx = random.nextInt(victimServers.size());
390 String serverName = victimServers.remove(victimIx).getServerName();
391 targetServers.add(Bytes.toBytes(serverName));
392 }
393
394 List<byte[]> victimRegions = new LinkedList<byte[]>();
395 for (ServerName server : victimServers) {
396 HServerLoad serverLoad = status.getLoad(server);
397
398 List<byte[]> regions = new LinkedList<byte[]>(serverLoad.getRegionsLoad().keySet());
399 int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
400 LOG.debug("Removing " + victimRegionCount + " regions from " + server.getServerName());
401 for (int i = 0; i < victimRegionCount; ++i) {
402 int victimIx = random.nextInt(regions.size());
403 String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
404 victimRegions.add(Bytes.toBytes(regionId));
405 }
406 }
407
408 LOG.info("Moving " + victimRegions.size() + " regions from " + victimServers.size()
409 + " servers to " + targetServers.size() + " different servers");
410 HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin();
411 for (byte[] victimRegion : victimRegions) {
412 int targetIx = random.nextInt(targetServers.size());
413 admin.move(victimRegion, targetServers.get(targetIx));
414 }
415 }
416 }
417
418 public static class ForceBalancerAction extends Action {
419 @Override
420 void perform() throws Exception {
421 LOG.info("Balancing regions");
422 HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin();
423 boolean result = admin.balancer();
424 if (!result) {
425 LOG.error("Balancer didn't succeed");
426 }
427 }
428 }
429
430
431
432
433 private static class PolicyContext extends ActionContext {
434 PolicyContext(IntegrationTestingUtility util) {
435 super(util);
436 }
437 }
438
439
440
441
442 public static abstract class Policy extends StoppableImplementation implements Runnable {
443 PolicyContext context;
444 public void init(PolicyContext context) throws Exception {
445 this.context = context;
446 }
447 }
448
449
450 public static class CompositeSequentialPolicy extends Policy {
451 private List<Policy> policies;
452 public CompositeSequentialPolicy(Policy... policies) {
453 this.policies = Arrays.asList(policies);
454 }
455
456 @Override
457 public void stop(String why) {
458 super.stop(why);
459 for (Policy p : policies) {
460 p.stop(why);
461 }
462 }
463
464 @Override
465 public void run() {
466 for (Policy p : policies) {
467 p.run();
468 }
469 }
470
471 @Override
472 public void init(PolicyContext context) throws Exception {
473 super.init(context);
474 for (Policy p : policies) {
475 p.init(context);
476 }
477 }
478 }
479
480
481 public static abstract class PeriodicPolicy extends Policy {
482 private long periodMs;
483
484 public PeriodicPolicy(long periodMs) {
485 this.periodMs = periodMs;
486 }
487
488 @Override
489 public void run() {
490
491 int jitter = new Random().nextInt((int)periodMs);
492 LOG.info("Sleeping for " + jitter + " to add jitter");
493 Threads.sleep(jitter);
494
495 while (!isStopped()) {
496 long start = System.currentTimeMillis();
497 runOneIteration();
498
499 if (isStopped()) return;
500 long sleepTime = periodMs - (System.currentTimeMillis() - start);
501 if (sleepTime > 0) {
502 LOG.info("Sleeping for: " + sleepTime);
503 Threads.sleep(sleepTime);
504 }
505 }
506 }
507
508 protected abstract void runOneIteration();
509
510 @Override
511 public void init(PolicyContext context) throws Exception {
512 super.init(context);
513 LOG.info("Using ChaosMonkey Policy: " + this.getClass() + ", period: " + periodMs);
514 }
515 }
516
517
518
519 public static class DoActionsOncePolicy extends PeriodicPolicy {
520 private List<Action> actions;
521
522 public DoActionsOncePolicy(long periodMs, List<Action> actions) {
523 super(periodMs);
524 this.actions = new ArrayList<ChaosMonkey.Action>(actions);
525 }
526
527 public DoActionsOncePolicy(long periodMs, Action... actions) {
528 this(periodMs, Arrays.asList(actions));
529 }
530
531 @Override
532 protected void runOneIteration() {
533 if (actions.isEmpty()) {
534 this.stop("done");
535 return;
536 }
537 Action action = actions.remove(0);
538
539 try {
540 action.perform();
541 } catch (Exception ex) {
542 LOG.warn("Exception occured during performing action: "
543 + StringUtils.stringifyException(ex));
544 }
545 }
546
547 @Override
548 public void init(PolicyContext context) throws Exception {
549 super.init(context);
550 for (Action action : actions) {
551 action.init(this.context);
552 }
553 }
554 }
555
556
557
558
559
560 public static class PeriodicRandomActionPolicy extends PeriodicPolicy {
561 private List<Pair<Action, Integer>> actions;
562
563 public PeriodicRandomActionPolicy(long periodMs, List<Pair<Action, Integer>> actions) {
564 super(periodMs);
565 this.actions = actions;
566 }
567
568 public PeriodicRandomActionPolicy(long periodMs, Pair<Action, Integer>... actions) {
569
570 this(periodMs, Arrays.asList(actions));
571 }
572
573 public PeriodicRandomActionPolicy(long periodMs, Action... actions) {
574 super(periodMs);
575 this.actions = new ArrayList<Pair<Action, Integer>>(actions.length);
576 for (Action action : actions) {
577 this.actions.add(new Pair<Action, Integer>(action, 1));
578 }
579 }
580
581 @Override
582 protected void runOneIteration() {
583 Action action = selectWeightedRandomItem(actions);
584 try {
585 action.perform();
586 } catch (Exception ex) {
587 LOG.warn("Exception occured during performing action: "
588 + StringUtils.stringifyException(ex));
589 }
590 }
591
592 @Override
593 public void init(PolicyContext context) throws Exception {
594 super.init(context);
595 for (Pair<Action, Integer> action : actions) {
596 action.getFirst().init(this.context);
597 }
598 }
599 }
600
601
602 static <T> T selectRandomItem(T[] items) {
603 Random random = new Random();
604 return items[random.nextInt(items.length)];
605 }
606
607
608 static <T> T selectWeightedRandomItem(List<Pair<T, Integer>> items) {
609 Random random = new Random();
610 int totalWeight = 0;
611 for (Pair<T, Integer> pair : items) {
612 totalWeight += pair.getSecond();
613 }
614
615 int cutoff = random.nextInt(totalWeight);
616 int cummulative = 0;
617 T item = null;
618
619
620 for (int i=0; i<items.size(); i++) {
621 int curWeight = items.get(i).getSecond();
622 if ( cutoff < cummulative + curWeight) {
623 item = items.get(i).getFirst();
624 break;
625 }
626 cummulative += curWeight;
627 }
628
629 return item;
630 }
631
632
633 static <T> List<T> selectRandomItems(T[] items, float ratio) {
634 Random random = new Random();
635 int remaining = (int)Math.ceil(items.length * ratio);
636
637 List<T> selectedItems = new ArrayList<T>(remaining);
638
639 for (int i=0; i<items.length && remaining > 0; i++) {
640 if (random.nextFloat() < ((float)remaining/(items.length-i))) {
641 selectedItems.add(items[i]);
642 remaining--;
643 }
644 }
645
646 return selectedItems;
647 }
648
649
650
651
652
653
654
655
656
657
658
/** Weighted action mix used by the named EVERY_MINUTE_RANDOM_ACTION_POLICY. */
private static final List<Pair<Action, Integer>> ALL_ACTIONS =
    new ArrayList<Pair<Action, Integer>>();
static {
  ALL_ACTIONS.add(new Pair<Action, Integer>(new RestartActiveMaster(FIVE_SEC), 2));
  ALL_ACTIONS.add(new Pair<Action, Integer>(new RestartRandomRs(FIVE_SEC), 2));
  ALL_ACTIONS.add(new Pair<Action, Integer>(new RestartRandomRs(ONE_MIN), 2));
  ALL_ACTIONS.add(new Pair<Action, Integer>(new RestartRsHoldingMeta(FIVE_SEC), 1));
  ALL_ACTIONS.add(new Pair<Action, Integer>(new RestartRsHoldingRoot(FIVE_SEC), 1));
  ALL_ACTIONS.add(new Pair<Action, Integer>(new BatchRestartRs(FIVE_SEC, 0.5f), 2));
  ALL_ACTIONS.add(new Pair<Action, Integer>(new RollingBatchRestartRs(FIVE_SEC, 1.0f), 2));
}

/** Name under which the one-minute random-action policy is registered. */
public static final String EVERY_MINUTE_RANDOM_ACTION_POLICY = "EVERY_MINUTE_RANDOM_ACTION_POLICY";

/** Policies this monkey runs; set by a constructor or by the "policy" option. */
private Policy[] policies;
/** One thread per policy, created by start(). */
private Thread[] monkeyThreads;
674
675 public void start() throws Exception {
676 monkeyThreads = new Thread[policies.length];
677
678 for (int i=0; i<policies.length; i++) {
679 policies[i].init(new PolicyContext(this.util));
680 Thread monkeyThread = new Thread(policies[i]);
681 monkeyThread.start();
682 monkeyThreads[i] = monkeyThread;
683 }
684 }
685
686 @Override
687 public void stop(String why) {
688 for (Policy policy : policies) {
689 policy.stop(why);
690 }
691 }
692
693 @Override
694 public boolean isStopped() {
695 return policies[0].isStopped();
696 }
697
698
699
700
701
702 public void waitForStop() throws InterruptedException {
703 for (Thread monkeyThread : monkeyThreads) {
704 monkeyThread.join();
705 }
706 }
707
/** Policies selectable by name, via the String constructor or the "policy" option. */
private static final Map<String, Policy> NAMED_POLICIES = Maps.newHashMap();
static {
  Policy everyMinuteRandomAction = new PeriodicRandomActionPolicy(ONE_MIN, ALL_ACTIONS);
  NAMED_POLICIES.put(EVERY_MINUTE_RANDOM_ACTION_POLICY, everyMinuteRandomAction);
}
713
714 @Override
715 protected void addOptions() {
716 addOptWithArg("policy", "a named policy defined in ChaosMonkey.java. Possible values: "
717 + NAMED_POLICIES.keySet());
718
719 }
720
721 @Override
722 protected void processOptions(CommandLine cmd) {
723 String[] policies = cmd.getOptionValues("policy");
724 if (policies != null) {
725 setPoliciesByName(policies);
726 }
727 }
728
729 @Override
730 protected int doWork() throws Exception {
731 start();
732 waitForStop();
733 return 0;
734 }
735
736 public static void main(String[] args) throws Exception {
737 Configuration conf = HBaseConfiguration.create();
738 IntegrationTestingUtility.setUseDistributedCluster(conf);
739 IntegrationTestingUtility util = new IntegrationTestingUtility(conf);
740 util.initializeCluster(1);
741
742 ChaosMonkey monkey = new ChaosMonkey(util, EVERY_MINUTE_RANDOM_ACTION_POLICY);
743 int ret = ToolRunner.run(conf, monkey, args);
744 System.exit(ret);
745 }
746
747 }