View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.mttr;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.concurrent.Callable;
24  import java.util.concurrent.ExecutorService;
25  import java.util.concurrent.Executors;
26  import java.util.concurrent.Future;
27  import java.util.concurrent.TimeUnit;
28  
29  import com.google.common.base.Objects;
30  import org.apache.commons.lang.RandomStringUtils;
31  import org.apache.commons.logging.Log;
32  import org.apache.commons.logging.LogFactory;
33  import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
34  import org.apache.hadoop.hbase.ClusterStatus;
35  import org.apache.hadoop.hbase.HColumnDescriptor;
36  import org.apache.hadoop.hbase.HTableDescriptor;
37  import org.apache.hadoop.hbase.IntegrationTestingUtility;
38  import org.apache.hadoop.hbase.IntegrationTests;
39  import org.apache.hadoop.hbase.TableName;
40  import org.apache.hadoop.hbase.chaos.actions.Action;
41  import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
42  import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
43  import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
44  import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingTableAction;
45  import org.apache.hadoop.hbase.client.HBaseAdmin;
46  import org.apache.hadoop.hbase.client.HTable;
47  import org.apache.hadoop.hbase.client.Put;
48  import org.apache.hadoop.hbase.client.Result;
49  import org.apache.hadoop.hbase.client.ResultScanner;
50  import org.apache.hadoop.hbase.client.Scan;
51  import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
52  import org.apache.hadoop.hbase.util.Bytes;
53  import org.apache.hadoop.hbase.util.LoadTestTool;
54  import org.cloudera.htrace.Sampler;
55  import org.cloudera.htrace.Span;
56  import org.cloudera.htrace.Trace;
57  import org.cloudera.htrace.TraceScope;
58  import org.cloudera.htrace.impl.AlwaysSampler;
59  import org.junit.AfterClass;
60  import org.junit.BeforeClass;
61  import org.junit.Test;
62  import org.junit.experimental.categories.Category;
63  
64  import static junit.framework.Assert.assertEquals;
65  
66  /**
67   * Integration test that should benchmark how fast HBase can recover from failures. This test starts
68   * different threads:
69   * <ol>
70   * <li>
71   * Load Test Tool.<br/>
72   * This runs so that all RegionServers will have some load and HLogs will be full.
73   * </li>
74   * <li>
75   * Scan thread.<br/>
76   * This thread runs a very short scan over and over again recording how long it takes to respond.
77   * The longest response is assumed to be the time it took to recover.
78   * </li>
79   * <li>
80   * Put thread.<br/>
81   * This thread is just like the scan thread except it does a very small put.
82   * </li>
83   * <li>
84   * Admin thread. <br/>
85   * This thread will continually go to the master to try and get the cluster status.  Just like the
86   * put and scan threads, the time to respond is recorded.
87   * </li>
88   * <li>
89   * Chaos Monkey thread.<br/>
90   * This thread runs a ChaosMonkey.Action.
91   * </li>
92   * </ol>
93   * <p/>
94   * The ChaosMonkey actions currently run are:
95   * <ul>
96   * <li>Restart the RegionServer holding meta.</li>
97   * <li>Restart the RegionServer holding the table the scan and put threads are targeting.</li>
98   * <li>Move the Regions of the table used by the scan and put threads.</li>
99   * <li>Restart the master.</li>
100  * </ul>
101  * <p/>
102  * At the end of the test a log line is output on the INFO level containing the timing data that was
103  * collected.
104  */
105 @Category(IntegrationTests.class)
106 public class IntegrationTestMTTR {
107   /**
108    * Constants.
109    */
110   private static final byte[] FAMILY = Bytes.toBytes("d");
111   private static final Log LOG = LogFactory.getLog(IntegrationTestMTTR.class);
112   private static final long SLEEP_TIME = 60 * 1000l;
113 
114   /**
115    * Configurable table names.
116    */
117   private static TableName tableName;
118   private static TableName loadTableName;
119 
120   /**
121    * Util to get at the cluster.
122    */
123   private static IntegrationTestingUtility util;
124 
125   /**
126    * Executor for test threads.
127    */
128   private static ExecutorService executorService;
129 
130   /**
131    * All of the chaos monkey actions used.
132    */
133   private static Action restartRSAction;
134   private static Action restartMetaAction;
135   private static Action moveRegionAction;
136   private static Action restartMasterAction;
137 
138   /**
139    * The load test tool used to create load and make sure that HLogs aren't empty.
140    */
141   private static LoadTestTool loadTool;
142 
143 
144   @BeforeClass
145   public static void setUp() throws Exception {
146     // Set up the integration test util
147     if (util == null) {
148       util = new IntegrationTestingUtility();
149     }
150 
151     // Make sure there are three servers.
152     util.initializeCluster(3);
153 
154     // Set up the load test tool.
155     loadTool = new LoadTestTool();
156     loadTool.setConf(util.getConfiguration());
157 
158     // Create executor with enough threads to restart rs's,
159     // run scans, puts, admin ops and load test tool.
160     executorService = Executors.newFixedThreadPool(8);
161 
162     // Set up the tables needed.
163     setupTables();
164 
165     // Set up the actions.
166     setupActions();
167   }
168 
169   private static void setupActions() throws IOException {
170     // Set up the action that will restart a region server holding a region from our table
171     // because this table should only have one region we should be good.
172     restartRSAction = new RestartRsHoldingTableAction(SLEEP_TIME, tableName.getNameAsString());
173 
174     // Set up the action that will kill the region holding meta.
175     restartMetaAction = new RestartRsHoldingMetaAction(SLEEP_TIME);
176 
177     // Set up the action that will move the regions of our table.
178     moveRegionAction = new MoveRegionsOfTableAction(SLEEP_TIME, tableName.getNameAsString());
179 
180     // Kill the master
181     restartMasterAction = new RestartActiveMasterAction(1000);
182 
183     // Give the action the access to the cluster.
184     Action.ActionContext actionContext = new Action.ActionContext(util);
185     restartRSAction.init(actionContext);
186     restartMetaAction.init(actionContext);
187     moveRegionAction.init(actionContext);
188     restartMasterAction.init(actionContext);
189   }
190 
191   private static void setupTables() throws IOException {
192     // Get the table name.
193     tableName = TableName.valueOf(util.getConfiguration()
194         .get("hbase.IntegrationTestMTTR.tableName", "IntegrationTestMTTR"));
195 
196     loadTableName = TableName.valueOf(util.getConfiguration()
197         .get("hbase.IntegrationTestMTTR.loadTableName", "IntegrationTestMTTRLoadTestTool"));
198 
199     if (util.getHBaseAdmin().tableExists(tableName)) {
200       util.deleteTable(tableName);
201     }
202 
203     if (util.getHBaseAdmin().tableExists(loadTableName)) {
204       util.deleteTable(loadTableName);
205     }
206 
207     // Create the table.  If this fails then fail everything.
208     HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
209 
210     // Make the max file size huge so that splits don't happen during the test.
211     tableDescriptor.setMaxFileSize(Long.MAX_VALUE);
212 
213     HColumnDescriptor descriptor = new HColumnDescriptor(FAMILY);
214     descriptor.setMaxVersions(1);
215     tableDescriptor.addFamily(descriptor);
216     util.getHBaseAdmin().createTable(tableDescriptor);
217 
218     // Setup the table for LoadTestTool
219     int ret = loadTool.run(new String[]{"-tn", loadTableName.getNameAsString(), "-init_only"});
220     assertEquals("Failed to initialize LoadTestTool", 0, ret);
221   }
222 
223   @AfterClass
224   public static void after() throws IOException {
225     // Clean everything up.
226     util.restoreCluster();
227     util = null;
228 
229     // Stop the threads so that we know everything is complete.
230     executorService.shutdown();
231     executorService = null;
232 
233     // Clean up the actions.
234     moveRegionAction = null;
235     restartMetaAction = null;
236     restartRSAction = null;
237     restartMasterAction = null;
238 
239     loadTool = null;
240   }
241 
242   @Test
243   public void testRestartRsHoldingTable() throws Exception {
244     run(new ActionCallable(restartRSAction), "RestartRsHoldingTableAction");
245   }
246 
247   @Test
248   public void testKillRsHoldingMeta() throws Exception {
249     run(new ActionCallable(restartMetaAction), "KillRsHoldingMeta");
250   }
251 
252   @Test
253   public void testMoveRegion() throws Exception {
254     run(new ActionCallable(moveRegionAction), "MoveRegion");
255   }
256 
257   @Test
258   public void testRestartMaster() throws Exception {
259     run(new ActionCallable(restartMasterAction), "RestartMaster");
260   }
261 
262   public void run(Callable<Boolean> monkeyCallable, String testName) throws Exception {
263     int maxIters = util.getHBaseClusterInterface().isDistributedCluster() ? 10 : 3;
264 
265     // Array to keep track of times.
266     ArrayList<TimingResult> resultPuts = new ArrayList<TimingResult>(maxIters);
267     ArrayList<TimingResult> resultScan = new ArrayList<TimingResult>(maxIters);
268     ArrayList<TimingResult> resultAdmin = new ArrayList<TimingResult>(maxIters);
269     long start = System.nanoTime();
270 
271     // We're going to try this multiple times
272     for (int fullIterations = 0; fullIterations < maxIters; fullIterations++) {
273       // Create and start executing a callable that will kill the servers
274       Future<Boolean> monkeyFuture = executorService.submit(monkeyCallable);
275 
276       // Pass that future to the timing Callables.
277       Future<TimingResult> putFuture = executorService.submit(new PutCallable(monkeyFuture));
278       Future<TimingResult> scanFuture = executorService.submit(new ScanCallable(monkeyFuture));
279       Future<TimingResult> adminFuture = executorService.submit(new AdminCallable(monkeyFuture));
280 
281       Future<Boolean> loadFuture = executorService.submit(new LoadCallable(monkeyFuture));
282 
283       monkeyFuture.get();
284       loadFuture.get();
285 
286       // Get the values from the futures.
287       TimingResult putTime = putFuture.get();
288       TimingResult scanTime = scanFuture.get();
289       TimingResult adminTime = adminFuture.get();
290 
291       // Store the times to display later.
292       resultPuts.add(putTime);
293       resultScan.add(scanTime);
294       resultAdmin.add(adminTime);
295 
296       // Wait some time for everything to settle down.
297       Thread.sleep(5000l);
298     }
299 
300     long runtimeMs = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
301 
302     Objects.ToStringHelper helper = Objects.toStringHelper("MTTRResults")
303         .add("putResults", resultPuts)
304         .add("scanResults", resultScan)
305         .add("adminResults", resultAdmin)
306         .add("totalRuntimeMs", runtimeMs)
307         .add("name", testName);
308 
309     // Log the info
310     LOG.info(helper.toString());
311   }
312 
313   /**
314    * Class to store results of TimingCallable.
315    *
316    * Stores times and trace id.
317    */
318   private class TimingResult {
319     DescriptiveStatistics stats = new DescriptiveStatistics();
320     ArrayList<Long> traces = new ArrayList<Long>(10);
321 
322     /**
323      * Add a result to this aggregate result.
324      * @param time Time in nanoseconds
325      * @param span Span.  To be kept if the time taken was over 1 second
326      */
327     public void addResult(long time, Span span) {
328       stats.addValue(TimeUnit.MILLISECONDS.convert(time, TimeUnit.NANOSECONDS));
329       if (TimeUnit.SECONDS.convert(time, TimeUnit.NANOSECONDS) >= 1) {
330         traces.add(span.getTraceId());
331       }
332     }
333 
334     public String toString() {
335       Objects.ToStringHelper helper = Objects.toStringHelper(this)
336           .add("numResults", stats.getN())
337           .add("minTime", stats.getMin())
338           .add("meanTime", stats.getMean())
339           .add("maxTime", stats.getMax())
340           .add("25th", stats.getPercentile(25))
341           .add("50th", stats.getPercentile(50))
342           .add("75th", stats.getPercentile(75))
343           .add("90th", stats.getPercentile(90))
344           .add("95th", stats.getPercentile(95))
345           .add("99th", stats.getPercentile(99))
346           .add("99.9th", stats.getPercentile(99.9))
347           .add("99.99th", stats.getPercentile(99.99))
348           .add("traces", traces);
349       return helper.toString();
350     }
351   }
352 
353   /**
354    * Base class for actions that need to record the time needed to recover from a failure.
355    */
356   public abstract class TimingCallable implements Callable<TimingResult> {
357     protected final Future future;
358 
359     public TimingCallable(Future f) {
360       future = f;
361     }
362 
363     @Override
364     public TimingResult call() throws Exception {
365       TimingResult result = new TimingResult();
366       int numAfterDone = 0;
367       // Keep trying until the rs is back up and we've gotten a put through
368       while (numAfterDone < 10) {
369         long start = System.nanoTime();
370         TraceScope scope = null;
371         try {
372           scope = Trace.startSpan(getSpanName(), AlwaysSampler.INSTANCE);
373           boolean actionResult = doAction();
374           if (actionResult && future.isDone()) {
375             numAfterDone ++;
376           }
377         } catch (Exception e) {
378           numAfterDone = 0;
379         } finally {
380           if (scope != null) {
381             scope.close();
382           }
383         }
384         result.addResult(System.nanoTime() - start, scope.getSpan());
385       }
386       return result;
387     }
388 
389     protected abstract boolean doAction() throws Exception;
390 
391     protected String getSpanName() {
392       return this.getClass().getSimpleName();
393     }
394   }
395 
396   /**
397    * Callable that will keep putting small amounts of data into a table
398    * until  the future supplied returns.  It keeps track of the max time.
399    */
400   public class PutCallable extends TimingCallable {
401 
402     private final HTable table;
403 
404     public PutCallable(Future f) throws IOException {
405       super(f);
406       this.table = new HTable(util.getConfiguration(), tableName);
407     }
408 
409     @Override
410     protected boolean doAction() throws Exception {
411       Put p = new Put(Bytes.toBytes(RandomStringUtils.randomAlphanumeric(5)));
412       p.add(FAMILY, Bytes.toBytes("\0"), Bytes.toBytes(RandomStringUtils.randomAscii(5)));
413       table.put(p);
414       table.flushCommits();
415       return true;
416     }
417 
418     @Override
419     protected String getSpanName() {
420       return "MTTR Put Test";
421     }
422   }
423 
424   /**
425    * Callable that will keep scanning for small amounts of data until the
426    * supplied future returns.  Returns the max time taken to scan.
427    */
428   public class ScanCallable extends TimingCallable {
429     private final HTable table;
430 
431     public ScanCallable(Future f) throws IOException {
432       super(f);
433       this.table = new HTable(util.getConfiguration(), tableName);
434     }
435 
436     @Override
437     protected boolean doAction() throws Exception {
438       ResultScanner rs = null;
439       try {
440       Scan s = new Scan();
441       s.setBatch(2);
442       s.addFamily(FAMILY);
443       s.setFilter(new KeyOnlyFilter());
444       s.setMaxVersions(1);
445 
446       rs = table.getScanner(s);
447       Result result = rs.next();
448       return rs != null && result != null && result.size() > 0;
449       } finally {
450         if (rs != null) {
451           rs.close();
452         }
453       }
454     }
455     @Override
456     protected String getSpanName() {
457       return "MTTR Scan Test";
458     }
459   }
460 
461   /**
462    * Callable that will keep going to the master for cluster status.  Returns the max time taken.
463    */
464   public class AdminCallable extends TimingCallable {
465 
466     public AdminCallable(Future f) throws IOException {
467       super(f);
468     }
469 
470     @Override
471     protected boolean doAction() throws Exception {
472       HBaseAdmin admin = new HBaseAdmin(util.getConfiguration());
473       ClusterStatus status = admin.getClusterStatus();
474       return status != null;
475     }
476 
477     @Override
478     protected String getSpanName() {
479       return "MTTR Admin Test";
480     }
481   }
482 
483 
484   public class ActionCallable implements Callable<Boolean> {
485     private final Action action;
486 
487     public ActionCallable(Action action) {
488       this.action = action;
489     }
490 
491     @Override
492     public Boolean call() throws Exception {
493       this.action.perform();
494       return true;
495     }
496   }
497 
498   /**
499    * Callable used to make sure the cluster has some load on it.
500    * This callable uses LoadTest tool to
501    */
502   public class LoadCallable implements Callable<Boolean> {
503 
504     private final Future future;
505 
506     public LoadCallable(Future f) {
507       future = f;
508     }
509 
510     @Override
511     public Boolean call() throws Exception {
512       int colsPerKey = 10;
513       int recordSize = 500;
514       int numServers = util.getHBaseClusterInterface().getInitialClusterStatus().getServersSize();
515       int numKeys = numServers * 5000;
516       int writeThreads = 10;
517 
518 
519       // Loop until the chaos monkey future is done.
520       // But always go in just in case some action completes quickly
521       do {
522         int ret = loadTool.run(new String[]{
523             "-tn", loadTableName.getNameAsString(),
524             "-write", String.format("%d:%d:%d", colsPerKey, recordSize, writeThreads),
525             "-num_keys", String.valueOf(numKeys),
526             "-skip_init"
527         });
528         assertEquals("Load failed", 0, ret);
529       } while (!future.isDone());
530 
531       return true;
532     }
533   }
534 }