1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.util;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collection;
25  import java.util.LinkedList;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Queue;
29  import java.util.Random;
30  
31  import org.apache.commons.cli.CommandLine;
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.conf.Configuration;
35  import org.apache.hadoop.hbase.ClusterStatus;
36  import org.apache.hadoop.hbase.HBaseCluster;
37  import org.apache.hadoop.hbase.HBaseConfiguration;
38  import org.apache.hadoop.hbase.HRegionInfo;
39  import org.apache.hadoop.hbase.HServerLoad;
40  import org.apache.hadoop.hbase.IntegrationTestingUtility;
41  import org.apache.hadoop.hbase.IntegrationTestDataIngestWithChaosMonkey;
42  import org.apache.hadoop.hbase.ServerName;
43  import org.apache.hadoop.hbase.Stoppable;
44  import org.apache.hadoop.hbase.client.HBaseAdmin;
45  import org.apache.hadoop.hbase.client.HTable;
46  import org.apache.hadoop.util.StringUtils;
47  import org.apache.hadoop.util.ToolRunner;
48  
49  import com.google.common.collect.Lists;
50  import com.google.common.collect.Maps;
51  
52  /**
 * A utility that injects faults into a running cluster.
54   * <p>
55   * ChaosMonkey defines Action's and Policy's. Actions are sequences of events, like
56   *  - Select a random server to kill
57   *  - Sleep for 5 sec
58   *  - Start the server on the same host
59   * Actions can also be complex events, like rolling restart of all of the servers.
60   * <p>
61   * Policies on the other hand are responsible for executing the actions based on a strategy.
62   * The default policy is to execute a random action every minute based on predefined action
63   * weights. ChaosMonkey executes predefined named policies until it is stopped. More than one
64   * policy can be active at any time.
65   * <p>
66   * Chaos monkey can be run from the command line, or can be invoked from integration tests.
67   * See {@link IntegrationTestDataIngestWithChaosMonkey} or other integration tests that use
68   * chaos monkey for code examples.
69   * <p>
 * The ChaosMonkey class is inspired by Netflix's tool of the same name:
71   * http://techblog.netflix.com/2012/07/chaos-monkey-released-into-wild.html
72   */
73  public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
74  
/** Logger shared by ChaosMonkey and its nested Action and Policy classes. */
private static final Log LOG = LogFactory.getLog(ChaosMonkey.class);

// Durations expressed as multiples of ONE_SEC (1000); they are handed to Threads.sleep,
// to PeriodicPolicy as periodMs, and to the cluster wait-for calls as TIMEOUT.
private static final long ONE_SEC = 1000;
private static final long FIVE_SEC = 5 * ONE_SEC;
private static final long ONE_MIN = 60 * ONE_SEC;
/** Timeout passed to the cluster's waitFor...ToStart/ToStop calls made by actions. */
private static final long TIMEOUT = ONE_MIN;

/** Testing utility wrapped into the PolicyContext that start() hands to every policy. */
final IntegrationTestingUtility util;
83  
/**
 * Creates a ChaosMonkey that runs pre-defined policies selected by name.
 * @param util the already configured IntegrationTestingUtility
 * @param policies names of the pre-defined policies to run
 */
public ChaosMonkey(IntegrationTestingUtility util, String... policies) {
  this.util = util;
  this.setPoliciesByName(policies);
}
93  
/**
 * Creates a ChaosMonkey that runs caller-supplied policies.
 * @param util the already configured IntegrationTestingUtility
 * @param policies the custom policies to run
 */
public ChaosMonkey(IntegrationTestingUtility util, Policy... policies) {
  this.policies = policies;
  this.util = util;
}
103 
104   private void setPoliciesByName(String... policies) {
105     this.policies = new Policy[policies.length];
106     for (int i=0; i < policies.length; i++) {
107       this.policies[i] = NAMED_POLICIES.get(policies[i]);
108     }
109   }
110 
111   /**
112    * Context for Action's
113    */
114   public static class ActionContext {
115     private IntegrationTestingUtility util;
116 
117     public ActionContext(IntegrationTestingUtility util) {
118       this.util = util;
119     }
120 
121     public IntegrationTestingUtility getHaseIntegrationTestingUtility() {
122       return util;
123     }
124 
125     public HBaseCluster getHBaseCluster() {
126       return util.getHBaseClusterInterface();
127     }
128   }
129 
130   /**
131    * A (possibly mischievous) action that the ChaosMonkey can perform.
132    */
133   public static class Action {
134     // TODO: interesting question - should actions be implemented inside
135     //       ChaosMonkey, or outside? If they are inside (initial), the class becomes
136     //       huge and all-encompassing; if they are outside ChaosMonkey becomes just
137     //       a random task scheduler. For now, keep inside.
138 
139     protected ActionContext context;
140     protected HBaseCluster cluster;
141     protected ClusterStatus initialStatus;
142     protected ServerName[] initialServers;
143 
144     public void init(ActionContext context) throws IOException {
145       this.context = context;
146       cluster = context.getHBaseCluster();
147       initialStatus = cluster.getInitialClusterStatus();
148       Collection<ServerName> regionServers = initialStatus.getServers();
149       initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
150     }
151 
152     public void perform() throws Exception { };
153 
154     // TODO: perhaps these methods should be elsewhere?
155     /** Returns current region servers */
156     protected ServerName[] getCurrentServers() throws IOException {
157       Collection<ServerName> regionServers = cluster.getClusterStatus().getServers();
158       return regionServers.toArray(new ServerName[regionServers.size()]);
159     }
160 
161     protected void killMaster(ServerName server) throws IOException {
162       LOG.info("Killing master:" + server);
163       cluster.killMaster(server);
164       cluster.waitForMasterToStop(server, TIMEOUT);
165       LOG.info("Killed master server:" + server);
166     }
167 
168     protected void startMaster(ServerName server) throws IOException {
169       LOG.info("Starting master:" + server.getHostname());
170       cluster.startMaster(server.getHostname());
171       cluster.waitForActiveAndReadyMaster(TIMEOUT);
172       LOG.info("Started master: " + server);
173     }
174 
175     protected void killRs(ServerName server) throws IOException {
176       LOG.info("Killing region server:" + server);
177       cluster.killRegionServer(server);
178       cluster.waitForRegionServerToStop(server, TIMEOUT);
179       LOG.info("Killed region server:" + server + ". Reported num of rs:"
180           + cluster.getClusterStatus().getServersSize());
181     }
182 
183     protected void startRs(ServerName server) throws IOException {
184       LOG.info("Starting region server:" + server.getHostname());
185       cluster.startRegionServer(server.getHostname());
186       cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT);
187       LOG.info("Started region server:" + server + ". Reported num of rs:"
188           + cluster.getClusterStatus().getServersSize());
189     }
190   }
191 
192   private static class RestartActionBase extends Action {
193     long sleepTime; // how long should we sleep
194 
195     public RestartActionBase(long sleepTime) {
196       this.sleepTime = sleepTime;
197     }
198 
199     void sleep(long sleepTime) {
200       LOG.info("Sleeping for:" + sleepTime);
201       Threads.sleep(sleepTime);
202     }
203 
204     void restartMaster(ServerName server, long sleepTime) throws IOException {
205       sleepTime = Math.max(sleepTime, 1000);
206       killMaster(server);
207       sleep(sleepTime);
208       startMaster(server);
209     }
210 
211     void restartRs(ServerName server, long sleepTime) throws IOException {
212       sleepTime = Math.max(sleepTime, 1000);
213       killRs(server);
214       sleep(sleepTime);
215       startRs(server);
216     }
217   }
218 
219   public static class RestartActiveMaster extends RestartActionBase {
220     public RestartActiveMaster(long sleepTime) {
221       super(sleepTime);
222     }
223     @Override
224     public void perform() throws Exception {
225       LOG.info("Performing action: Restart active master");
226 
227       ServerName master = cluster.getClusterStatus().getMaster();
228       restartMaster(master, sleepTime);
229     }
230   }
231 
232   public static class RestartRandomRs extends RestartActionBase {
233     public RestartRandomRs(long sleepTime) {
234       super(sleepTime);
235     }
236 
237     @Override
238     public void perform() throws Exception {
239       LOG.info("Performing action: Restart random region server");
240       ServerName server = selectRandomItem(getCurrentServers());
241 
242       restartRs(server, sleepTime);
243     }
244   }
245 
246   public static class RestartRsHoldingMeta extends RestartActionBase {
247     public RestartRsHoldingMeta(long sleepTime) {
248       super(sleepTime);
249     }
250     @Override
251     public void perform() throws Exception {
252       LOG.info("Performing action: Restart region server holding META");
253       ServerName server = cluster.getServerHoldingMeta();
254       if (server == null) {
255         LOG.warn("No server is holding .META. right now.");
256         return;
257       }
258       restartRs(server, sleepTime);
259     }
260   }
261 
262   public static class RestartRsHoldingRoot extends RestartRandomRs {
263     public RestartRsHoldingRoot(long sleepTime) {
264       super(sleepTime);
265     }
266     @Override
267     public void perform() throws Exception {
268       LOG.info("Performing action: Restart region server holding ROOT");
269       ServerName server = cluster.getServerHoldingMeta();
270       if (server == null) {
271         LOG.warn("No server is holding -ROOT- right now.");
272         return;
273       }
274       restartRs(server, sleepTime);
275     }
276   }
277 
278   public static class RestartRsHoldingTable extends RestartActionBase {
279 
280     private final String tableName;
281 
282     public RestartRsHoldingTable(long sleepTime, String tableName) {
283       super(sleepTime);
284       this.tableName = tableName;
285     }
286 
287     @Override
288     public void perform() throws Exception {
289       HTable table = null;
290       Collection<ServerName> serverNames;
291       try {
292         Configuration conf = context.getHaseIntegrationTestingUtility().getConfiguration();
293         table = new HTable(conf, tableName);
294         serverNames = table.getRegionLocations().values();
295       } catch (IOException e) {
296         LOG.debug("Error creating HTable used to get list of region locations.", e);
297         return;
298       } finally {
299         if (table != null) {
300           table.close();
301         }
302       }
303       Random random = new Random();
304       ServerName[] nameArray = serverNames.toArray(new ServerName[serverNames.size()]);
305       restartRs(nameArray[random.nextInt(nameArray.length)], sleepTime);
306     }
307   }
308 
309   public static class MoveRegionsOfTable extends Action {
310     private final long sleepTime;
311     private final byte[] tableNameBytes;
312 
313     public MoveRegionsOfTable(long sleepTime, String tableName) {
314       this.sleepTime = sleepTime;
315       this.tableNameBytes = Bytes.toBytes(tableName);
316     }
317 
318     @Override
319     public void perform() throws Exception {
320       try {
321         HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin();
322         List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes);
323         Collection<ServerName> serversList = admin.getClusterStatus().getServers();
324         ServerName[] servers = serversList.toArray(new ServerName[serversList.size()]);
325         Random random = new Random();
326         for (HRegionInfo regionInfo:regions) {
327           try {
328             byte[] destServerName =
329               Bytes.toBytes(servers[random.nextInt(servers.length)].getServerName());
330             admin.move(regionInfo.getRegionName(), destServerName);
331           } catch (Exception e) {
332             LOG.debug("Error moving region", e);
333           }
334         }
335         Thread.sleep(sleepTime);
336       } catch (Exception e) {
337         LOG.debug("Error performing MoveRegionsOfTable", e);
338       }
339     }
340   }
341 
342   /**
343    * Restarts a ratio of the running regionservers at the same time
344    */
345   public static class BatchRestartRs extends RestartActionBase {
346     float ratio; //ratio of regionservers to restart
347 
348     public BatchRestartRs(long sleepTime, float ratio) {
349       super(sleepTime);
350       this.ratio = ratio;
351     }
352 
353     @Override
354     public void perform() throws Exception {
355       LOG.info(String.format("Performing action: Batch restarting %d%% of region servers",
356           (int)(ratio * 100)));
357       List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
358 
359       for (ServerName server : selectedServers) {
360         LOG.info("Killing region server:" + server);
361         cluster.killRegionServer(server);
362       }
363 
364       for (ServerName server : selectedServers) {
365         cluster.waitForRegionServerToStop(server, TIMEOUT);
366       }
367 
368       LOG.info("Killed " + selectedServers.size() + " region servers. Reported num of rs:"
369           + cluster.getClusterStatus().getServersSize());
370 
371       sleep(sleepTime);
372 
373       for (ServerName server : selectedServers) {
374         LOG.info("Starting region server:" + server.getHostname());
375         cluster.startRegionServer(server.getHostname());
376 
377       }
378       for (ServerName server : selectedServers) {
379         cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT);
380       }
381       LOG.info("Started " + selectedServers.size() +" region servers. Reported num of rs:"
382           + cluster.getClusterStatus().getServersSize());
383     }
384   }
385 
386   /**
387    * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
388    * server, or starts one, sleeping randomly (0-sleepTime) in between steps.
389    */
390   public static class RollingBatchRestartRs extends BatchRestartRs {
391     public RollingBatchRestartRs(long sleepTime, float ratio) {
392       super(sleepTime, ratio);
393     }
394 
395     @Override
396     public void perform() throws Exception {
397       Random random = new Random();
398       LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
399           (int)(ratio * 100)));
400       List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
401 
402       Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
403       Queue<ServerName> deadServers = new LinkedList<ServerName>();
404 
405       //
406       while (!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) {
407         boolean action = true; //action true = kill server, false = start server
408 
409         if (serversToBeKilled.isEmpty() || deadServers.isEmpty()) {
410           action = deadServers.isEmpty();
411         } else {
412           action = random.nextBoolean();
413         }
414 
415         if (action) {
416           ServerName server = serversToBeKilled.remove();
417           killRs(server);
418           deadServers.add(server);
419         } else {
420           ServerName server = deadServers.remove();
421           startRs(server);
422         }
423 
424         sleep(random.nextInt((int)sleepTime));
425       }
426     }
427   }
428 
429   public static class UnbalanceRegionsAction extends Action {
430     private double fractionOfRegions;
431     private double fractionOfServers;
432     private Random random = new Random();
433 
434     /**
435      * Unbalances the regions on the cluster by choosing "target" servers, and moving
436      * some regions from each of the non-target servers to random target servers.
437      * @param fractionOfRegions Fraction of regions to move from each server.
438      * @param fractionOfServers Fraction of servers to be chosen as targets.
439      */
440     public UnbalanceRegionsAction(double fractionOfRegions, double fractionOfServers) {
441       this.fractionOfRegions = fractionOfRegions;
442       this.fractionOfServers = fractionOfServers;
443     }
444 
445     @Override
446     public void perform() throws Exception {
447       LOG.info("Unbalancing regions");
448       ClusterStatus status = this.cluster.getClusterStatus();
449       List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
450       int targetServerCount = (int)Math.ceil(fractionOfServers * victimServers.size());
451       List<byte[]> targetServers = new ArrayList<byte[]>(targetServerCount);
452       for (int i = 0; i < targetServerCount; ++i) {
453         int victimIx = random.nextInt(victimServers.size());
454         String serverName = victimServers.remove(victimIx).getServerName();
455         targetServers.add(Bytes.toBytes(serverName));
456       }
457 
458       List<byte[]> victimRegions = new LinkedList<byte[]>();
459       for (ServerName server : victimServers) {
460         HServerLoad serverLoad = status.getLoad(server);
461         // Ugh.
462         List<byte[]> regions = new LinkedList<byte[]>(serverLoad.getRegionsLoad().keySet());
463         int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
464         LOG.debug("Removing " + victimRegionCount + " regions from " + server.getServerName());
465         for (int i = 0; i < victimRegionCount; ++i) {
466           int victimIx = random.nextInt(regions.size());
467           String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
468           victimRegions.add(Bytes.toBytes(regionId));
469         }
470       }
471 
472       LOG.info("Moving " + victimRegions.size() + " regions from " + victimServers.size()
473           + " servers to " + targetServers.size() + " different servers");
474       HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin();
475       for (byte[] victimRegion : victimRegions) {
476         int targetIx = random.nextInt(targetServers.size());
477         admin.move(victimRegion, targetServers.get(targetIx));
478       }
479     }
480   }
481 
482   public static class ForceBalancerAction extends Action {
483     @Override
484     public void perform() throws Exception {
485       LOG.info("Balancing regions");
486       HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin();
487       boolean result = admin.balancer();
488       if (!result) {
489         LOG.error("Balancer didn't succeed");
490       }
491     }
492   }
493 
494   /**
495    * A context for a Policy
496    */
497   public static class PolicyContext extends ActionContext {
498     public PolicyContext(IntegrationTestingUtility util) {
499       super(util);
500     }
501   }
502 
503   /**
504    * A policy to introduce chaos to the cluster
505    */
506   public static abstract class Policy extends StoppableImplementation implements Runnable {
507     protected PolicyContext context;
508     public void init(PolicyContext context) throws Exception {
509       this.context = context;
510     }
511   }
512 
513   /** A policy that runs multiple other policies one after the other */
514   public static class CompositeSequentialPolicy extends Policy {
515     private List<Policy> policies;
516     public CompositeSequentialPolicy(Policy... policies) {
517       this.policies = Arrays.asList(policies);
518     }
519 
520     @Override
521     public void stop(String why) {
522       super.stop(why);
523       for (Policy p : policies) {
524         p.stop(why);
525       }
526     }
527 
528     @Override
529     public void run() {
530       for (Policy p : policies) {
531         p.run();
532       }
533     }
534 
535     @Override
536     public void init(PolicyContext context) throws Exception {
537       super.init(context);
538       for (Policy p : policies) {
539         p.init(context);
540       }
541     }
542   }
543 
544   /** A policy which does stuff every time interval. */
545   public static abstract class PeriodicPolicy extends Policy {
546     private long periodMs;
547 
548     public PeriodicPolicy(long periodMs) {
549       this.periodMs = periodMs;
550     }
551 
552     @Override
553     public void run() {
554       // Add some jitter.
555       int jitter = new Random().nextInt((int)periodMs);
556       LOG.info("Sleeping for " + jitter + " to add jitter");
557       Threads.sleep(jitter);
558 
559       while (!isStopped()) {
560         long start = System.currentTimeMillis();
561         runOneIteration();
562 
563         if (isStopped()) return;
564         long sleepTime = periodMs - (System.currentTimeMillis() - start);
565         if (sleepTime > 0) {
566           LOG.info("Sleeping for: " + sleepTime);
567           Threads.sleep(sleepTime);
568         }
569       }
570     }
571 
572     protected abstract void runOneIteration();
573 
574     @Override
575     public void init(PolicyContext context) throws Exception {
576       super.init(context);
577       LOG.info("Using ChaosMonkey Policy: " + this.getClass() + ", period: " + periodMs);
578     }
579   }
580 
581   /** A policy which performs a sequence of actions deterministically. */
582   public static class DoActionsOncePolicy extends PeriodicPolicy {
583     private List<Action> actions;
584 
585     public DoActionsOncePolicy(long periodMs, List<Action> actions) {
586       super(periodMs);
587       this.actions = new ArrayList<ChaosMonkey.Action>(actions);
588     }
589 
590     public DoActionsOncePolicy(long periodMs, Action... actions) {
591       this(periodMs, Arrays.asList(actions));
592     }
593 
594     @Override
595     protected void runOneIteration() {
596       if (actions.isEmpty()) {
597         this.stop("done");
598         return;
599       }
600       Action action = actions.remove(0);
601 
602       try {
603         action.perform();
604       } catch (Exception ex) {
605         LOG.warn("Exception occured during performing action: "
606             + StringUtils.stringifyException(ex));
607       }
608     }
609 
610     @Override
611     public void init(PolicyContext context) throws Exception {
612       super.init(context);
613       for (Action action : actions) {
614         action.init(this.context);
615       }
616     }
617   }
618 
619   /**
620    * A policy, which picks a random action according to the given weights,
621    * and performs it every configurable period.
622    */
623   public static class PeriodicRandomActionPolicy extends PeriodicPolicy {
624     private List<Pair<Action, Integer>> actions;
625 
626     public PeriodicRandomActionPolicy(long periodMs, List<Pair<Action, Integer>> actions) {
627       super(periodMs);
628       this.actions = actions;
629     }
630 
631     public PeriodicRandomActionPolicy(long periodMs, Pair<Action, Integer>... actions) {
632       // We don't expect it to be modified.
633       this(periodMs, Arrays.asList(actions));
634     }
635 
636     public PeriodicRandomActionPolicy(long periodMs, Action... actions) {
637       super(periodMs);
638       this.actions = new ArrayList<Pair<Action, Integer>>(actions.length);
639       for (Action action : actions) {
640         this.actions.add(new Pair<Action, Integer>(action, 1));
641       }
642     }
643 
644     @Override
645     protected void runOneIteration() {
646       Action action = selectWeightedRandomItem(actions);
647       try {
648         action.perform();
649       } catch (Exception ex) {
650         LOG.warn("Exception occured during performing action: "
651             + StringUtils.stringifyException(ex));
652       }
653     }
654 
655     @Override
656     public void init(PolicyContext context) throws Exception {
657       super.init(context);
658       for (Pair<Action, Integer> action : actions) {
659         action.getFirst().init(this.context);
660       }
661     }
662   }
663 
664   /** Selects a random item from the given items */
665   static <T> T selectRandomItem(T[] items) {
666     Random random = new Random();
667     return items[random.nextInt(items.length)];
668   }
669 
670   /** Selects a random item from the given items with weights*/
671   static <T> T selectWeightedRandomItem(List<Pair<T, Integer>> items) {
672     Random random = new Random();
673     int totalWeight = 0;
674     for (Pair<T, Integer> pair : items) {
675       totalWeight += pair.getSecond();
676     }
677 
678     int cutoff = random.nextInt(totalWeight);
679     int cummulative = 0;
680     T item = null;
681 
682     //warn: O(n)
683     for (int i=0; i<items.size(); i++) {
684       int curWeight = items.get(i).getSecond();
685       if ( cutoff < cummulative + curWeight) {
686         item = items.get(i).getFirst();
687         break;
688       }
689       cummulative += curWeight;
690     }
691 
692     return item;
693   }
694 
695   /** Selects and returns ceil(ratio * items.length) random items from the given array */
696   static <T> List<T> selectRandomItems(T[] items, float ratio) {
697     Random random = new Random();
698     int remaining = (int)Math.ceil(items.length * ratio);
699 
700     List<T> selectedItems = new ArrayList<T>(remaining);
701 
702     for (int i=0; i<items.length && remaining > 0; i++) {
703       if (random.nextFloat() < ((float)remaining/(items.length-i))) {
704         selectedItems.add(items[i]);
705         remaining--;
706       }
707     }
708 
709     return selectedItems;
710   }
711 
/**
 * The default set of restart actions with the following weights (relative probabilities):
 *  - Restart active master (sleep 5 sec)                    : 2
 *  - Restart random regionserver (sleep 5 sec)              : 2
 *  - Restart random regionserver (sleep 60 sec)             : 2
 *  - Restart META regionserver (sleep 5 sec)                : 1
 *  - Restart ROOT regionserver (sleep 5 sec)                : 1
 *  - Batch restart of 50% of regionservers (sleep 5 sec)    : 2
 *  - Rolling restart of 100% of regionservers (sleep 5 sec) : 2
 * <p>
 * This list backs the policy registered under EVERY_MINUTE_RANDOM_ACTION_POLICY.
 */
@SuppressWarnings("unchecked")
private static final List<Pair<Action, Integer>> ALL_ACTIONS = Lists.newArrayList(
    new Pair<Action,Integer>(new RestartActiveMaster(FIVE_SEC), 2),
    new Pair<Action,Integer>(new RestartRandomRs(FIVE_SEC), 2),
    new Pair<Action,Integer>(new RestartRandomRs(ONE_MIN), 2),
    new Pair<Action,Integer>(new RestartRsHoldingMeta(FIVE_SEC), 1),
    new Pair<Action,Integer>(new RestartRsHoldingRoot(FIVE_SEC), 1),
    new Pair<Action,Integer>(new BatchRestartRs(FIVE_SEC, 0.5f), 2),
    new Pair<Action,Integer>(new RollingBatchRestartRs(FIVE_SEC, 1.0f), 2)
);
732 
/** Name of the pre-defined policy that performs one weighted-random action per ONE_MIN. */
public static final String EVERY_MINUTE_RANDOM_ACTION_POLICY = "EVERY_MINUTE_RANDOM_ACTION_POLICY";

/** Policies to run; set by the constructors or by setPoliciesByName. */
private Policy[] policies;
/** One thread per policy; created by start() and joined by waitForStop(). */
private Thread[] monkeyThreads;
737 
738   public void start() throws Exception {
739     monkeyThreads = new Thread[policies.length];
740 
741     for (int i=0; i<policies.length; i++) {
742       policies[i].init(new PolicyContext(this.util));
743       Thread monkeyThread = new Thread(policies[i]);
744       monkeyThread.start();
745       monkeyThreads[i] = monkeyThread;
746     }
747   }
748 
749   @Override
750   public void stop(String why) {
751     for (Policy policy : policies) {
752       policy.stop(why);
753     }
754   }
755 
756   @Override
757   public boolean isStopped() {
758     return policies[0].isStopped();
759   }
760 
761   /**
762    * Wait for ChaosMonkey to stop.
763    * @throws InterruptedException
764    */
765   public void waitForStop() throws InterruptedException {
766     for (Thread monkeyThread : monkeyThreads) {
767       monkeyThread.join();
768     }
769   }
770 
/**
 * Pre-defined policies by name; consulted by setPoliciesByName and listed in the help
 * text of the "policy" command line option.
 */
private static final Map<String, Policy> NAMED_POLICIES = Maps.newHashMap();
static {
  // One weighted-random action from ALL_ACTIONS per ONE_MIN period.
  NAMED_POLICIES.put(EVERY_MINUTE_RANDOM_ACTION_POLICY,
      new PeriodicRandomActionPolicy(ONE_MIN, ALL_ACTIONS));
}
776 
777   @Override
778   protected void addOptions() {
779     addOptWithArg("policy", "a named policy defined in ChaosMonkey.java. Possible values: "
780         + NAMED_POLICIES.keySet());
781     //we can add more options, and make policies more configurable
782   }
783 
784   @Override
785   protected void processOptions(CommandLine cmd) {
786     String[] policies = cmd.getOptionValues("policy");
787     if (policies != null) {
788       setPoliciesByName(policies);
789     }
790   }
791 
792   @Override
793   protected int doWork() throws Exception {
794     start();
795     waitForStop();
796     return 0;
797   }
798 
799   public static void main(String[] args) throws Exception {
800     Configuration conf = HBaseConfiguration.create();
801     IntegrationTestingUtility.setUseDistributedCluster(conf);
802     IntegrationTestingUtility util = new IntegrationTestingUtility(conf);
803     util.initializeCluster(1);
804 
805     ChaosMonkey monkey = new ChaosMonkey(util, EVERY_MINUTE_RANDOM_ACTION_POLICY);
806     int ret = ToolRunner.run(conf, monkey, args);
807     System.exit(ret);
808   }
809 
810 }