1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.tool;
21  
22  import java.io.Closeable;
23  import java.io.IOException;
24  import java.util.ArrayList;
25  import java.util.Arrays;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.LinkedList;
29  import java.util.List;
30  import java.util.Map;
31  import java.util.Map.Entry;
32  import java.util.Random;
33  import java.util.Set;
34  import java.util.TreeSet;
35  import java.util.concurrent.Callable;
36  import java.util.concurrent.ExecutionException;
37  import java.util.concurrent.ExecutorService;
38  import java.util.concurrent.Future;
39  import java.util.concurrent.ScheduledThreadPoolExecutor;
40  import java.util.regex.Matcher;
41  import java.util.regex.Pattern;
42  
43  import org.apache.commons.lang.time.StopWatch;
44  import org.apache.commons.logging.Log;
45  import org.apache.commons.logging.LogFactory;
46  import org.apache.hadoop.conf.Configuration;
47  import org.apache.hadoop.hbase.AuthUtil;
48  import org.apache.hadoop.hbase.DoNotRetryIOException;
49  import org.apache.hadoop.hbase.HBaseConfiguration;
50  import org.apache.hadoop.hbase.HColumnDescriptor;
51  import org.apache.hadoop.hbase.HConstants;
52  import org.apache.hadoop.hbase.HRegionInfo;
53  import org.apache.hadoop.hbase.HRegionLocation;
54  import org.apache.hadoop.hbase.HTableDescriptor;
55  import org.apache.hadoop.hbase.NamespaceDescriptor;
56  import org.apache.hadoop.hbase.ServerName;
57  import org.apache.hadoop.hbase.TableName;
58  import org.apache.hadoop.hbase.TableNotEnabledException;
59  import org.apache.hadoop.hbase.TableNotFoundException;
60  import org.apache.hadoop.hbase.client.Get;
61  import org.apache.hadoop.hbase.client.HBaseAdmin;
62  import org.apache.hadoop.hbase.client.HConnection;
63  import org.apache.hadoop.hbase.client.HConnectionManager;
64  import org.apache.hadoop.hbase.client.HTable;
65  import org.apache.hadoop.hbase.client.HTableInterface;
66  import org.apache.hadoop.hbase.client.Put;
67  import org.apache.hadoop.hbase.client.ResultScanner;
68  import org.apache.hadoop.hbase.client.Scan;
69  import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
70  import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
71  import org.apache.hadoop.hbase.util.Bytes;
72  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
73  import org.apache.hadoop.hbase.util.ReflectionUtils;
74  import org.apache.hadoop.hbase.util.RegionSplitter;
75  import org.apache.hadoop.util.Tool;
76  import org.apache.hadoop.util.ToolRunner;
77  
78  import com.google.protobuf.ServiceException;
79  
80  /**
81   * HBase Canary Tool that can be used to do
82   * "canary monitoring" of a running HBase cluster.
83   *
84   * There are two modes:
85   * 1. region mode - for each region, tries to get one row per column family
86   * and outputs some information about failure or latency.
87   *
88   * 2. regionserver mode - for each regionserver, tries to get one row from one table
89   * selected randomly and outputs some information about failure or latency.
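     *
     * Illustrative invocations (a sketch; the options below are the ones handled by this
     * tool's parseArgs, and the regionserver pattern is a placeholder):
     *   $ bin/hbase org.apache.hadoop.hbase.tool.Canary                       # region mode, all tables
     *   $ bin/hbase org.apache.hadoop.hbase.tool.Canary -daemon -interval 60  # re-check every 60 seconds
     *   $ bin/hbase org.apache.hadoop.hbase.tool.Canary -regionserver -e rs1.*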
90   */
91  public final class Canary implements Tool {
92    // Sink interface used by the canary to output information
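      // A custom Sink implementation can be plugged in through the "hbase.canary.sink.class"
      // configuration key, which is read in main() below (StdOutSink is the default).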
93    public interface Sink {
94      public void publishReadFailure(HRegionInfo region, Exception e);
95      public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
96      public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
97      public void publishWriteFailure(HRegionInfo region, Exception e);
98      public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
99      public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
100   }
101   // new extended sink for output regionserver mode info
102   // do not change the Sink interface directly due to maintaining the API
103   public interface ExtendedSink extends Sink {
104     public void publishReadFailure(String table, String server);
105     public void publishReadTiming(String table, String server, long msTime);
106   }
107 
108   // Simple implementation of canary sink that logs timings or failures
109   // to a file or standard output.
110   public static class StdOutSink implements Sink {
111     @Override
112     public void publishReadFailure(HRegionInfo region, Exception e) {
113       LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()), e);
114     }
115 
116     @Override
117     public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
118       LOG.error(String.format("read from region %s column family %s failed",
119                 region.getRegionNameAsString(), column.getNameAsString()), e);
120     }
121 
122     @Override
123     public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime) {
124       LOG.info(String.format("read from region %s column family %s in %dms",
125                region.getRegionNameAsString(), column.getNameAsString(), msTime));
126     }
127 
128     @Override
129     public void publishWriteFailure(HRegionInfo region, Exception e) {
130       LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()), e);
131     }
132 
133     @Override
134     public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
135       LOG.error(String.format("write to region %s column family %s failed",
136         region.getRegionNameAsString(), column.getNameAsString()), e);
137     }
138 
139     @Override
140     public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime) {
141       LOG.info(String.format("write to region %s column family %s in %dms",
142         region.getRegionNameAsString(), column.getNameAsString(), msTime));
143     }
144   }
145   // an ExtendedSink implementation
146   public static class RegionServerStdOutSink extends StdOutSink implements ExtendedSink {
147 
148     @Override
149     public void publishReadFailure(String table, String server) {
150       LOG.error(String.format("Read from table:%s on region server:%s failed", table, server));
151     }
152 
153     @Override
154     public void publishReadTiming(String table, String server, long msTime) {
155       LOG.info(String.format("Read from table:%s on region server:%s in %dms",
156           table, server, msTime));
157     }
158   }
159 
160   /**
161    * For each column family of the region, tries to get one row and outputs the latency, or the
162    * failure.
163    */
164   public static class RegionTask implements Callable<Void> {
165     public enum TaskType{
166       READ, WRITE
167     }
168     private HConnection connection;
169     private HRegionInfo region;
170     private Sink sink;
171     private TaskType taskType;
172 
173     RegionTask(HConnection connection, HRegionInfo region, Sink sink, TaskType taskType) {
174       this.connection = connection;
175       this.region = region;
176       this.sink = sink;
177       this.taskType = taskType;
178     }
179 
180     @Override
181     public Void call() {
182       switch (taskType) {
183       case READ:
184         return read();
185       case WRITE:
186         return write();
187       default:
188         return read();
189       }
190     }
191 
192     public Void read() {
193       HTableInterface table = null;
194       HTableDescriptor tableDesc = null;
195       try {
196         table = connection.getTable(region.getTable());
197         tableDesc = table.getTableDescriptor();
198       } catch (IOException e) {
199         LOG.debug("sniffRegion failed", e);
200         sink.publishReadFailure(region, e);
201         if (table != null) {
202           try {
203             table.close();
204           } catch (IOException ioe) {
205             LOG.error("Close table failed", ioe);
206           }
207         }
208         return null;
209       }
210 
211       byte[] startKey = null;
212       Get get = null;
213       Scan scan = null;
214       ResultScanner rs = null;
215       StopWatch stopWatch = new StopWatch();
216       for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
217         stopWatch.reset();
218         startKey = region.getStartKey();
219         // Can't do a get on empty start row so do a Scan of first element if any instead.
220         if (startKey.length > 0) {
221           get = new Get(startKey);
222           get.setCacheBlocks(false);
223           get.setFilter(new FirstKeyOnlyFilter());
224           get.addFamily(column.getName());
225         } else {
226           scan = new Scan();
227           scan.setCaching(1);
228           scan.setCacheBlocks(false);
229           scan.setFilter(new FirstKeyOnlyFilter());
230           scan.addFamily(column.getName());
231           scan.setMaxResultSize(1L);
232         }
233 
234         try {
235           if (startKey.length > 0) {
236             stopWatch.start();
237             table.get(get);
238             stopWatch.stop();
239             sink.publishReadTiming(region, column, stopWatch.getTime());
240           } else {
241             stopWatch.start();
242             rs = table.getScanner(scan);
243             stopWatch.stop();
244             sink.publishReadTiming(region, column, stopWatch.getTime());
245           }
246         } catch (Exception e) {
247           sink.publishReadFailure(region, column, e);
248         } finally {
249           if (rs != null) {
250             rs.close();
251           }
252           scan = null;
253           get = null;
254           startKey = null;
255         }
256       }
257       try {
258         table.close();
259       } catch (IOException e) {
260         LOG.error("Close table failed", e);
261       }
262       return null;
263     }
264 
265     /**
266      * Check writes for the canary table
267      * @return null always; write failures are published to the sink
268      */
269     private Void write() {
270       HTableInterface table = null;
271       HTableDescriptor tableDesc = null;
272       try {
273         table = connection.getTable(region.getTable());
274         tableDesc = table.getTableDescriptor();
275         byte[] rowToCheck = region.getStartKey();
276         if (rowToCheck.length == 0) {
277           rowToCheck = new byte[]{0x0};
278         }
279         int writeValueSize =
280             connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY, 10);
281         for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
282           Put put = new Put(rowToCheck);
283           byte[] value = new byte[writeValueSize];
284           Bytes.random(value);
285           put.add(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
286           try {
287             long startTime = System.currentTimeMillis();
288             table.put(put);
289             long time = System.currentTimeMillis() - startTime;
290             sink.publishWriteTiming(region, column, time);
291           } catch (Exception e) {
292             sink.publishWriteFailure(region, column, e);
293           }
294         }
295         table.close();
296       } catch (IOException e) {
297         sink.publishWriteFailure(region, e);
298       }
299       return null;
300     }
301   }
302 
303   /**
304    * Get one row from a region on the regionserver and outputs the latency, or the failure.
305    */
306   static class RegionServerTask implements Callable<Void> {
307     private HConnection connection;
308     private String serverName;
309     private HRegionInfo region;
310     private ExtendedSink sink;
311 
312     RegionServerTask(HConnection connection, String serverName, HRegionInfo region,
313         ExtendedSink sink) {
314       this.connection = connection;
315       this.serverName = serverName;
316       this.region = region;
317       this.sink = sink;
318     }
319 
320     @Override
321     public Void call() {
322       TableName tableName = null;
323       HTableInterface table = null;
324       Get get = null;
325       byte[] startKey = null;
326       Scan scan = null;
327       StopWatch stopWatch = new StopWatch();
328       // monitor one region on every region server
329       stopWatch.reset();
330       try {
331         tableName = region.getTable();
332         table = connection.getTable(tableName);
333         startKey = region.getStartKey();
334         // Can't do a get on empty start row so do a Scan of first element if any instead.
335         if (startKey.length > 0) {
336           get = new Get(startKey);
337           get.setCacheBlocks(false);
338           get.setFilter(new FirstKeyOnlyFilter());
339           stopWatch.start();
340           table.get(get);
341           stopWatch.stop();
342         } else {
343           scan = new Scan();
344           scan.setCacheBlocks(false);
345           scan.setFilter(new FirstKeyOnlyFilter());
346           scan.setCaching(1);
347           scan.setMaxResultSize(1L);
348           stopWatch.start();
349           ResultScanner s = table.getScanner(scan);
350           s.close();
351           stopWatch.stop();
352         }
353         sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
354       } catch (TableNotFoundException tnfe) {
355         LOG.error("Table may be deleted", tnfe);
356         // This is ignored because it doesn't imply that the regionserver is dead
357       } catch (TableNotEnabledException tnee) {
358         // This is considered a success since we got a response.
359         LOG.debug("The targeted table was disabled.  Assuming success.");
360       } catch (DoNotRetryIOException dnrioe) {
361         sink.publishReadFailure(tableName.getNameAsString(), serverName);
362         LOG.error(dnrioe);
363       } catch (IOException e) {
364         sink.publishReadFailure(tableName.getNameAsString(), serverName);
365         LOG.error(e);
366       } finally {
367         if (table != null) {
368           try {
369             table.close();
370           } catch (IOException e) {
371             LOG.error("Close table failed", e);
372           }
373         }
374         scan = null;
375         get = null;
376         startKey = null;
377       }
378       return null;
379     }
380   }
381 
382   private static final int USAGE_EXIT_CODE = 1;
383   private static final int INIT_ERROR_EXIT_CODE = 2;
384   private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
385   private static final int ERROR_EXIT_CODE = 4;
386 
387   private static final long DEFAULT_INTERVAL = 6000;
388 
389   private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
390   private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
391 
392   private static final Log LOG = LogFactory.getLog(Canary.class);
393 
394   public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
395     NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
396 
397   private static final String CANARY_TABLE_FAMILY_NAME = "Test";
398 
399   private Configuration conf = null;
400   private long interval = 0;
401   private Sink sink = null;
402 
403   private boolean useRegExp;
404   private long timeout = DEFAULT_TIMEOUT;
405   private boolean failOnError = true;
406   private boolean regionServerMode = false;
407   private boolean writeSniffing = false;
408   private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
409 
410   private ExecutorService executor; // threads to retrieve data from regionservers
411 
412   public Canary() {
413     this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink());
414   }
415 
416   public Canary(ExecutorService executor, Sink sink) {
417     this.executor = executor;
418     this.sink = sink;
419   }
420 
421   @Override
422   public Configuration getConf() {
423     return conf;
424   }
425 
426   @Override
427   public void setConf(Configuration conf) {
428     this.conf = conf;
429   }
430 
431   private int parseArgs(String[] args) {
432     int index = -1;
433     // Process command line args
434     for (int i = 0; i < args.length; i++) {
435       String cmd = args[i];
436 
437       if (cmd.startsWith("-")) {
438         if (index >= 0) {
439           // command line args must be in the form: [opts] [table 1 [table 2 ...]]
440           System.err.println("Invalid command line options");
441           printUsageAndExit();
442         }
443 
444         if (cmd.equals("-help")) {
445           // user asked for help, print the help and quit.
446           printUsageAndExit();
447         } else if (cmd.equals("-daemon") && interval == 0) {
448           // user asked for daemon mode, set a default interval between checks
449           interval = DEFAULT_INTERVAL;
450         } else if (cmd.equals("-interval")) {
451           // user has specified an interval for canary breaths (-interval N)
452           i++;
453 
454           if (i == args.length) {
455             System.err.println("-interval needs a numeric value argument.");
456             printUsageAndExit();
457           }
458 
459           try {
460             interval = Long.parseLong(args[i]) * 1000;
461           } catch (NumberFormatException e) {
462             System.err.println("-interval needs a numeric value argument.");
463             printUsageAndExit();
464           }
465         } else if(cmd.equals("-regionserver")) {
466           this.regionServerMode = true;
467         } else if(cmd.equals("-writeSniffing")) {
468           this.writeSniffing = true;
469         } else if (cmd.equals("-e")) {
470           this.useRegExp = true;
471         } else if (cmd.equals("-t")) {
472           i++;
473 
474           if (i == args.length) {
475             System.err.println("-t needs a numeric value argument.");
476             printUsageAndExit();
477           }
478 
479           try {
480             this.timeout = Long.parseLong(args[i]);
481           } catch (NumberFormatException e) {
482             System.err.println("-t needs a numeric value argument.");
483             printUsageAndExit();
484           }
485         } else if (cmd.equals("-writeTable")) {
486           i++;
487 
488           if (i == args.length) {
489             System.err.println("-writeTable needs a string value argument.");
490             printUsageAndExit();
491           }
492           this.writeTableName = TableName.valueOf(args[i]);
493         } else if (cmd.equals("-f")) {
494           i++;
495 
496           if (i == args.length) {
497             System.err
498                 .println("-f needs a boolean value argument (true|false).");
499             printUsageAndExit();
500           }
501 
502           this.failOnError = Boolean.parseBoolean(args[i]);
503         } else {
504           // no options match
505           System.err.println(cmd + " option is invalid.");
506           printUsageAndExit();
507         }
508       } else if (index < 0) {
509         // keep track of first table name specified by the user
510         index = i;
511       }
512     }
513     return index;
514   }
515 
516   @Override
517   public int run(String[] args) throws Exception {
518     int index = parseArgs(args);
519 
520     // Launches chore for refreshing kerberos credentials if security is enabled.
521     // Please see http://hbase.apache.org/book.html#_running_canary_in_a_kerberos_enabled_cluster
522     // for more details.
523     AuthUtil.launchAuthChore(conf);
524 
525     // Prepare what we need for monitoring
526     Monitor monitor = null;
527     Thread monitorThread = null;
528     long startTime = 0;
529     long currentTimeLength = 0;
530     // Get a connection to use below.
531     HConnection connection = HConnectionManager.createConnection(this.conf);
532     try {
533       do {
534         // Do monitor !!
535         try {
536           monitor = this.newMonitor(connection, index, args);
537           monitorThread = new Thread(monitor);
538           startTime = System.currentTimeMillis();
539           monitorThread.start();
540           while (!monitor.isDone()) {
541             // wait for 1 sec
542             Thread.sleep(1000);
543             // exit if any error occurs
544             if (this.failOnError && monitor.hasError()) {
545               monitorThread.interrupt();
546               if (monitor.initialized) {
547                 System.exit(monitor.errorCode);
548               } else {
549                 System.exit(INIT_ERROR_EXIT_CODE);
550               }
551             }
552             currentTimeLength = System.currentTimeMillis() - startTime;
553             if (currentTimeLength > this.timeout) {
554               LOG.error("The monitor has been running too long (" + currentTimeLength
555                   + "ms), exceeding the timeout limit of " + this.timeout
556                   + "ms; it will be killed");
557               if (monitor.initialized) {
558                 System.exit(TIMEOUT_ERROR_EXIT_CODE);
559               } else {
560                 System.exit(INIT_ERROR_EXIT_CODE);
561               }
562               break;
563             }
564           }
565 
566           if (this.failOnError && monitor.hasError()) {
567             monitorThread.interrupt();
568             System.exit(monitor.errorCode);
569           }
570         } finally {
571           if (monitor != null) monitor.close();
572         }
573 
574         Thread.sleep(interval);
575       } while (interval > 0);
576     } finally {
577       connection.close();
578     }
579 
580     return(monitor.errorCode);
581   }
582 
583   private void printUsageAndExit() {
584     System.err.printf(
585       "Usage: bin/hbase %s [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]%n",
586         getClass().getName());
587     System.err.println(" where [opts] are:");
588     System.err.println("   -help          Show this help and exit.");
589     System.err.println("   -regionserver  treat the table arguments as regionserver names,");
590     System.err.println("      which means to enable regionserver mode");
591     System.err.println("   -daemon        Continuous check at defined intervals.");
592     System.err.println("   -interval <N>  Interval between checks (sec)");
593     System.err.println("   -e             Use table/regionserver as a regular expression,");
594     System.err.println("      which means the table/regionserver argument is a regex pattern");
595     System.err.println("   -f <B>         stop the whole program when the first error occurs," +
596         " default is true");
597     System.err.println("   -t <N>         timeout for a check, default is 600000 (milliseconds)");
598     System.err.println("   -writeSniffing enable write sniffing in the canary");
599     System.err.println("   -writeTable    The table used for write sniffing."
600         + " Default is hbase:canary");
601     System.exit(USAGE_EXIT_CODE);
602   }
603 
604   /**
605    * A Factory method for {@link Monitor}.
606    * Can be overridden by user.
607    * @param index a start index for monitor target
608    * @param args args passed from user
609    * @return a Monitor instance
610    */
611   public Monitor newMonitor(final HConnection connection, int index, String[] args) {
612     Monitor monitor = null;
613     String[] monitorTargets = null;
614 
615     if(index >= 0) {
616       int length = args.length - index;
617       monitorTargets = new String[length];
618       System.arraycopy(args, index, monitorTargets, 0, length);
619     }
620 
621     if (this.regionServerMode) {
622       monitor =
623           new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
624               (ExtendedSink) this.sink, this.executor);
625     } else {
626       monitor =
627           new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor,
628               this.writeSniffing, this.writeTableName);
629     }
630     return monitor;
631   }
632 
633   // A Monitor superclass that can be extended by users
634   public static abstract class Monitor implements Runnable, Closeable {
635 
636     protected HConnection connection;
637     protected HBaseAdmin admin;
638     protected String[] targets;
639     protected boolean useRegExp;
640     protected boolean initialized = false;
641 
642     protected boolean done = false;
643     protected int errorCode = 0;
644     protected Sink sink;
645     protected ExecutorService executor;
646 
647     public boolean isDone() {
648       return done;
649     }
650 
651     public boolean hasError() {
652       return errorCode != 0;
653     }
654 
655     @Override
656     public void close() throws IOException {
657       if (this.admin != null) this.admin.close();
658     }
659 
660     protected Monitor(HConnection connection, String[] monitorTargets, boolean useRegExp, Sink sink,
661         ExecutorService executor) {
662       if (null == connection) throw new IllegalArgumentException("connection shall not be null");
663 
664       this.connection = connection;
665       this.targets = monitorTargets;
666       this.useRegExp = useRegExp;
667       this.sink = sink;
668       this.executor = executor;
669     }
670 
671     public abstract void run();
672 
673     protected boolean initAdmin() {
674       if (null == this.admin) {
675         try {
676           this.admin = new HBaseAdmin(connection);
677         } catch (Exception e) {
678           LOG.error("Initializing HBaseAdmin failed...", e);
679           this.errorCode = INIT_ERROR_EXIT_CODE;
680         }
681       } else if (admin.isAborted()) {
682         LOG.error("HBaseAdmin aborted");
683         this.errorCode = INIT_ERROR_EXIT_CODE;
684       }
685       return !this.hasError();
686     }
687   }
688 
689   // a monitor for region mode
690   private static class RegionMonitor extends Monitor {
691     // 10 minutes
692     private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
693     // 1 day
694     private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;
695 
696     private long lastCheckTime = -1;
697     private boolean writeSniffing;
698     private TableName writeTableName;
699     private int writeDataTTL;
700     private float regionsLowerLimit;
701     private float regionsUpperLimit;
702     private int checkPeriod;
703 
704     public RegionMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
705         Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName) {
706       super(connection, monitorTargets, useRegExp, sink, executor);
707       Configuration conf = connection.getConfiguration();
708       this.writeSniffing = writeSniffing;
709       this.writeTableName = writeTableName;
710       this.writeDataTTL =
711           conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
712       this.regionsLowerLimit =
713           conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
714       this.regionsUpperLimit =
715           conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
716       this.checkPeriod =
717           conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
718             DEFAULT_WRITE_TABLE_CHECK_PERIOD);
719     }
720 
721     @Override
722     public void run() {
723       if (this.initAdmin()) {
724         try {
725           List<Future<Void>> taskFutures = new LinkedList<Future<Void>>();
726           if (this.targets != null && this.targets.length > 0) {
727             String[] tables = generateMonitorTables(this.targets);
728             this.initialized = true;
729             for (String table : tables) {
730               taskFutures.addAll(Canary.sniff(connection, sink, table, executor, TaskType.READ));
731             }
732           } else {
733             taskFutures.addAll(sniff(TaskType.READ));
734           }
735 
736           if (writeSniffing) {
737             if (EnvironmentEdgeManager.currentTimeMillis() - lastCheckTime > checkPeriod) {
738               try {
739                 checkWriteTableDistribution();
740               } catch (IOException e) {
741                 LOG.error("Check canary table distribution failed!", e);
742               }
743               lastCheckTime = EnvironmentEdgeManager.currentTimeMillis();
744             }
745             // sniff canary table with write operation
746             taskFutures.addAll(Canary.sniff(connection, sink,
747               writeTableName.getNameAsString(), executor, TaskType.WRITE));
748           }
749 
750           for (Future<Void> future : taskFutures) {
751             try {
752               future.get();
753             } catch (ExecutionException e) {
754               LOG.error("Sniff region failed!", e);
755             }
756           }
757         } catch (Exception e) {
758           LOG.error("Run regionMonitor failed", e);
759           this.errorCode = ERROR_EXIT_CODE;
760         }
761       }
762       this.done = true;
763     }
764 
765     private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
766       String[] returnTables = null;
767 
768       if (this.useRegExp) {
769         Pattern pattern = null;
770         HTableDescriptor[] tds = null;
771         Set<String> tmpTables = new TreeSet<String>();
772         try {
773           for (String monitorTarget : monitorTargets) {
774             pattern = Pattern.compile(monitorTarget);
775             tds = this.admin.listTables(pattern);
776             if (tds != null) {
777               for (HTableDescriptor td : tds) {
778                 tmpTables.add(td.getNameAsString());
779               }
780             }
781           }
782         } catch (IOException e) {
783           LOG.error("Communicate with admin failed", e);
784           throw e;
785         }
786 
787         if (tmpTables.size() > 0) {
788           returnTables = tmpTables.toArray(new String[tmpTables.size()]);
789         } else {
790           String msg = "No HTable found, tablePattern:" + Arrays.toString(monitorTargets);
791           LOG.error(msg);
792           this.errorCode = INIT_ERROR_EXIT_CODE;
793           throw new TableNotFoundException(msg);
794         }
795       } else {
796         returnTables = monitorTargets;
797       }
798 
799       return returnTables;
800     }
801 
802     /*
803      * canary entry point to monitor all the tables.
804      */
805     private List<Future<Void>> sniff(TaskType taskType) throws Exception {
806       List<Future<Void>> taskFutures = new LinkedList<Future<Void>>();
807       for (HTableDescriptor table : admin.listTables()) {
808         if (admin.isTableEnabled(table.getTableName())
809             && (!table.getTableName().equals(writeTableName))) {
810           taskFutures.addAll(Canary.sniff(connection, sink, table.getTableName(), executor,
811             taskType));
812         }
813       }
814       return taskFutures;
815     }
816 
817     private void checkWriteTableDistribution() throws IOException, ServiceException {
818       if (!admin.tableExists(writeTableName)) {
819         int numberOfServers = admin.getClusterStatus().getServers().size();
820         if (numberOfServers == 0) {
821           throw new IllegalStateException("No live regionservers");
822         }
823         createWriteTable(numberOfServers);
824       }
825 
826       if (!admin.isTableEnabled(writeTableName)) {
827         admin.enableTable(writeTableName);
828       }
829 
830       int numberOfServers = admin.getClusterStatus().getServers().size();
831       List<HRegionLocation> locations = connection.locateRegions(writeTableName);
832       int numberOfRegions = locations.size();
833       if (numberOfRegions < numberOfServers * regionsLowerLimit
834           || numberOfRegions > numberOfServers * regionsUpperLimit) {
835         admin.disableTable(writeTableName);
836         admin.deleteTable(writeTableName);
837         createWriteTable(numberOfServers);
838       }
839       HashSet<ServerName> serverSet = new HashSet<ServerName>();
840       for (HRegionLocation location: locations) {
841         serverSet.add(location.getServerName());
842       }
843       int numberOfCoveredServers = serverSet.size();
844       if (numberOfCoveredServers < numberOfServers) {
845         admin.balancer();
846       }
847     }
848 
849     private void createWriteTable(int numberOfServers) throws IOException {
850       int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
851       LOG.info("Number of live regionservers: " + numberOfServers + ", "
852           + "pre-splitting the canary table into " + numberOfRegions + " regions "
853           + "(current lower limit of regions per server is " + regionsLowerLimit
854           + " and you can change it by config: "
855           + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
856       HTableDescriptor desc = new HTableDescriptor(writeTableName);
857       HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
858       family.setMaxVersions(1);
859       family.setTimeToLive(writeDataTTL);
860 
861       desc.addFamily(family);
862       byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
863       admin.createTable(desc, splits);
864     }
865   }
866 
867   /**
868    * Canary entry point for specified table.
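      * <p>Illustrative programmatic use (a minimal sketch; the table name is a placeholder):
      * <pre>
      *   HConnection conn = HConnectionManager.createConnection(HBaseConfiguration.create());
      *   try {
      *     Canary.sniff(conn, TableName.valueOf("myTable"), TaskType.READ);
      *   } finally {
      *     conn.close();
      *   }
      * </pre>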
869    * @throws Exception
870    */
871   public static void sniff(final HConnection connection, TableName tableName, TaskType taskType)
872       throws Exception {
873     List<Future<Void>> taskFutures =
874         Canary.sniff(connection, new StdOutSink(), tableName.getNameAsString(),
875           new ScheduledThreadPoolExecutor(1), taskType);
876     for (Future<Void> future : taskFutures) {
877       future.get();
878     }
879   }
880 
881   /**
882    * Canary entry point for specified table.
883    * @throws Exception
884    */
885   private static List<Future<Void>> sniff(final HConnection connection, final Sink sink,
886     String tableName, ExecutorService executor, TaskType taskType) throws Exception {
887     HBaseAdmin admin = new HBaseAdmin(connection);
888     try {
889       if (admin.isTableEnabled(TableName.valueOf(tableName))) {
890         return Canary.sniff(connection, sink, TableName.valueOf(tableName), executor,
891           taskType);
892       } else {
893         LOG.warn(String.format("Table %s is not enabled", tableName));
894       }
895       return new LinkedList<Future<Void>>();
896     } finally {
897       admin.close();
898     }
899   }
900 
901   /*
902    * Loops over the regions belonging to this table, and outputs some information about their state.
903    */
904   private static List<Future<Void>> sniff(final HConnection connection, final Sink sink,
905       TableName tableName, ExecutorService executor, TaskType taskType) throws Exception {
906     HTableInterface table = null;
907     try {
908       table = connection.getTable(tableName);
909     } catch (TableNotFoundException e) {
910       return new ArrayList<Future<Void>>();
911     }
912     List<RegionTask> tasks = new ArrayList<RegionTask>();
913     try {
914       for (HRegionInfo region : ((HTable)table).getRegionLocations().keySet()) {
915         tasks.add(new RegionTask(connection, region, sink, taskType));
916       }
917     } finally {
918       table.close();
919     }
920     return executor.invokeAll(tasks);
921   }
922 
923   /*
924    * For each column family of the region, tries to get one row and outputs the latency, or the
925    * failure.
926    */
927   private static void sniffRegion(
928       final HBaseAdmin admin,
929       final Sink sink,
930       HRegionInfo region,
931       HTableInterface table) throws Exception {
932     HTableDescriptor tableDesc = table.getTableDescriptor();
933     byte[] startKey = null;
934     Get get = null;
935     Scan scan = null;
936     ResultScanner rs = null;
937     StopWatch stopWatch = new StopWatch();
938     for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
939       stopWatch.reset();
940       startKey = region.getStartKey();
941       // Can't do a get on empty start row so do a Scan of first element if any instead.
942       if (startKey.length > 0) {
943         get = new Get(startKey);
944         get.setCacheBlocks(false);
945         get.setFilter(new FirstKeyOnlyFilter());
946         get.addFamily(column.getName());
947       } else {
948         scan = new Scan();
949         scan.setRaw(true);
950         scan.setCaching(1);
951         scan.setCacheBlocks(false);
952         scan.setFilter(new FirstKeyOnlyFilter());
953         scan.addFamily(column.getName());
954         scan.setMaxResultSize(1L);
955       }
956 
957       try {
958         if (startKey.length > 0) {
959           stopWatch.start();
960           table.get(get);
961           stopWatch.stop();
962           sink.publishReadTiming(region, column, stopWatch.getTime());
963         } else {
964           stopWatch.start();
965           rs = table.getScanner(scan);
966           stopWatch.stop();
967           sink.publishReadTiming(region, column, stopWatch.getTime());
968         }
969       } catch (Exception e) {
970         sink.publishReadFailure(region, column, e);
971       } finally {
972         if (rs != null) {
973           rs.close();
974         }
975         scan = null;
976         get = null;
977         startKey = null;
978       }
979     }
980   }
981   // a monitor for regionserver mode
982   private static class RegionServerMonitor extends Monitor {
983 
984     public RegionServerMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
985         ExtendedSink sink, ExecutorService executor) {
986       super(connection, monitorTargets, useRegExp, sink, executor);
987     }
988 
989     private ExtendedSink getSink() {
990       return (ExtendedSink) this.sink;
991     }
992 
993     @Override
994     public void run() {
995       if (this.initAdmin() && this.checkNoTableNames()) {
996         Map<String, List<HRegionInfo>> rsAndRMap = this.filterRegionServerByName();
997         this.initialized = true;
998         this.monitorRegionServers(rsAndRMap);
999       }
1000       this.done = true;
1001     }
1002 
1003     private boolean checkNoTableNames() {
1004       List<String> foundTableNames = new ArrayList<String>();
1005       TableName[] tableNames = null;
1006 
1007       try {
1008         tableNames = this.admin.listTableNames();
1009       } catch (IOException e) {
1010         LOG.error("Get listTableNames failed", e);
1011         this.errorCode = INIT_ERROR_EXIT_CODE;
1012         return false;
1013       }
1014 
1015       if (this.targets == null || this.targets.length == 0) return true;
1016 
1017       for (String target : this.targets) {
1018         for (TableName tableName : tableNames) {
1019           if (target.equals(tableName.getNameAsString())) {
1020             foundTableNames.add(target);
1021           }
1022         }
1023       }
1024 
1025       if (foundTableNames.size() > 0) {
1026         System.err.println("Cannot pass a tablename when using the -regionserver " +
1027             "option, tablenames:" + foundTableNames.toString());
1028         this.errorCode = USAGE_EXIT_CODE;
1029       }
1030       return foundTableNames.size() == 0;
1031     }
1032 
1033     private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap) {
1034       List<RegionServerTask> tasks = new ArrayList<RegionServerTask>();
1035       Random rand = new Random();
1036       // monitor one region on every region server
1037       for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
1038         String serverName = entry.getKey();
1039         // randomly select a region
1040         HRegionInfo region = entry.getValue().get(rand.nextInt(entry.getValue().size()));
1041         tasks.add(new RegionServerTask(this.connection, serverName, region, getSink()));
1042       }
1043       try {
1044         for (Future<Void> future : this.executor.invokeAll(tasks)) {
1045           try {
1046             future.get();
1047           } catch (ExecutionException e) {
1048             LOG.error("Sniff regionserver failed!", e);
1049             this.errorCode = ERROR_EXIT_CODE;
1050           }
1051         }
1052       } catch (InterruptedException e) {
1053         this.errorCode = ERROR_EXIT_CODE;
1054         LOG.error("Sniff regionserver failed!", e);
1055       }
1056     }
1057 
1058     private Map<String, List<HRegionInfo>> filterRegionServerByName() {
1059       Map<String, List<HRegionInfo>> regionServerAndRegionsMap = this.getAllRegionServerByName();
1060       regionServerAndRegionsMap = this.doFilterRegionServerByName(regionServerAndRegionsMap);
1061       return regionServerAndRegionsMap;
1062     }
1063 
1064     private Map<String, List<HRegionInfo>> getAllRegionServerByName() {
1065       Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<String, List<HRegionInfo>>();
1066       HTableInterface table = null;
1067       try {
1068         HTableDescriptor[] tableDescs = this.admin.listTables();
1069         List<HRegionInfo> regions = null;
1070         for (HTableDescriptor tableDesc : tableDescs) {
1071           table = this.admin.getConnection().getTable(tableDesc.getTableName());
1072           for (Entry<HRegionInfo, ServerName> e: ((HTable)table).getRegionLocations().entrySet()) {
1073             HRegionInfo r = e.getKey();
1074             ServerName rs = e.getValue();
1075             String rsName = rs.getHostname();
1076 
1077             if (rsAndRMap.containsKey(rsName)) {
1078               regions = rsAndRMap.get(rsName);
1079             } else {
1080               regions = new ArrayList<HRegionInfo>();
1081               rsAndRMap.put(rsName, regions);
1082             }
1083             regions.add(r);
1084           }
1085           table.close();
1086         }
1087 
1088       } catch (IOException e) {
1089         String msg = "Get HTables info failed";
1090         LOG.error(msg, e);
1091         this.errorCode = INIT_ERROR_EXIT_CODE;
1092       } finally {
1093         if (table != null) {
1094           try {
1095             table.close();
1096           } catch (IOException e) {
1097             LOG.warn("Close table failed", e);
1098           }
1099         }
1100       }
1101 
1102       return rsAndRMap;
1103     }
1104 
1105     private Map<String, List<HRegionInfo>> doFilterRegionServerByName(
1106         Map<String, List<HRegionInfo>> fullRsAndRMap) {
1107 
1108       Map<String, List<HRegionInfo>> filteredRsAndRMap = null;
1109 
1110       if (this.targets != null && this.targets.length > 0) {
1111         filteredRsAndRMap = new HashMap<String, List<HRegionInfo>>();
1112         Pattern pattern = null;
1113         Matcher matcher = null;
1114         boolean regExpFound = false;
1115         for (String rsName : this.targets) {
1116           if (this.useRegExp) {
1117             regExpFound = false;
1118             pattern = Pattern.compile(rsName);
1119             for (Map.Entry<String, List<HRegionInfo>> entry : fullRsAndRMap.entrySet()) {
1120               matcher = pattern.matcher(entry.getKey());
1121               if (matcher.matches()) {
1122                 filteredRsAndRMap.put(entry.getKey(), entry.getValue());
1123                 regExpFound = true;
1124               }
1125             }
1126             if (!regExpFound) {
1127               LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName);
1128             }
1129           } else {
1130             if (fullRsAndRMap.containsKey(rsName)) {
1131               filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
1132             } else {
1133               LOG.info("No RegionServerInfo found, regionServerName:" + rsName);
1134             }
1135           }
1136         }
1137       } else {
1138         filteredRsAndRMap = fullRsAndRMap;
1139       }
1140       return filteredRsAndRMap;
1141     }
1142   }
1143 
1144   public static void main(String[] args) throws Exception {
1145     final Configuration conf = HBaseConfiguration.create();
1146     AuthUtil.launchAuthChore(conf);
1147     int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
1148     ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
1149 
1150     Class<? extends Sink> sinkClass =
1151         conf.getClass("hbase.canary.sink.class", StdOutSink.class, Sink.class);
1152     Sink sink = ReflectionUtils.newInstance(sinkClass);
1153 
1154     int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
1155     executor.shutdown();
1156     System.exit(exitCode);
1157   }
1158 }