1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.tool;
21  
22  import java.io.Closeable;
23  import java.io.IOException;
24  import java.util.ArrayList;
25  import java.util.Arrays;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.LinkedList;
29  import java.util.List;
30  import java.util.Map;
31  import java.util.Map.Entry;
32  import java.util.Random;
33  import java.util.Set;
34  import java.util.TreeSet;
35  import java.util.concurrent.Callable;
36  import java.util.concurrent.ExecutionException;
37  import java.util.concurrent.ExecutorService;
38  import java.util.concurrent.Future;
39  import java.util.concurrent.ScheduledThreadPoolExecutor;
40  import java.util.concurrent.atomic.AtomicLong;
41  import java.util.regex.Matcher;
42  import java.util.regex.Pattern;
43  
44  import org.apache.commons.lang.time.StopWatch;
45  import org.apache.commons.logging.Log;
46  import org.apache.commons.logging.LogFactory;
47  import org.apache.hadoop.conf.Configuration;
48  import org.apache.hadoop.hbase.AuthUtil;
49  import org.apache.hadoop.hbase.DoNotRetryIOException;
50  import org.apache.hadoop.hbase.HBaseConfiguration;
51  import org.apache.hadoop.hbase.HColumnDescriptor;
52  import org.apache.hadoop.hbase.HConstants;
53  import org.apache.hadoop.hbase.HRegionInfo;
54  import org.apache.hadoop.hbase.HRegionLocation;
55  import org.apache.hadoop.hbase.HTableDescriptor;
56  import org.apache.hadoop.hbase.NamespaceDescriptor;
57  import org.apache.hadoop.hbase.ServerName;
58  import org.apache.hadoop.hbase.TableName;
59  import org.apache.hadoop.hbase.TableNotEnabledException;
60  import org.apache.hadoop.hbase.TableNotFoundException;
61  import org.apache.hadoop.hbase.client.Get;
62  import org.apache.hadoop.hbase.client.HBaseAdmin;
63  import org.apache.hadoop.hbase.client.HConnection;
64  import org.apache.hadoop.hbase.client.HConnectionManager;
65  import org.apache.hadoop.hbase.client.HTable;
66  import org.apache.hadoop.hbase.client.HTableInterface;
67  import org.apache.hadoop.hbase.client.Put;
68  import org.apache.hadoop.hbase.client.ResultScanner;
69  import org.apache.hadoop.hbase.client.Scan;
70  import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
71  import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
72  import org.apache.hadoop.hbase.util.Bytes;
73  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
74  import org.apache.hadoop.hbase.util.ReflectionUtils;
75  import org.apache.hadoop.hbase.util.RegionSplitter;
76  import org.apache.hadoop.util.GenericOptionsParser;
77  import org.apache.hadoop.util.Tool;
78  import org.apache.hadoop.util.ToolRunner;
79  
80  import com.google.protobuf.ServiceException;
81  
82  /**
83   * HBase Canary Tool that can be used to do
84   * "canary monitoring" of a running HBase cluster.
85   *
86   * There are two modes:
87   * 1. region mode - for each region, tries to get one row per column family
88   * and outputs some information about failures or latency.
89   *
90   * 2. regionserver mode - for each regionserver, tries to get one row from one
91   * randomly selected table and outputs some information about failures or latency.
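 *
 * For example, the canary might be invoked like this (illustrative invocations only;
 * the table and regionserver names are placeholders, and the flags correspond to the
 * options handled by parseArgs() below):
 *   bin/hbase org.apache.hadoop.hbase.tool.Canary -daemon -interval 60 myTable
 *   bin/hbase org.apache.hadoop.hbase.tool.Canary -regionserver rs1.example.com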
92   */
93  public final class Canary implements Tool {
94    // Sink interface used by the canary to output information
95    public interface Sink {
96      public long getReadFailureCount();
97      public long incReadFailureCount();
98      public void publishReadFailure(HRegionInfo region, Exception e);
99      public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
100     public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
101     public long getWriteFailureCount();
102     public void publishWriteFailure(HRegionInfo region, Exception e);
103     public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
104     public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
105   }
106   // An extended sink for outputting regionserver mode info.
107   // Do not change the Sink interface directly, in order to maintain the existing API.
108   public interface ExtendedSink extends Sink {
109     public void publishReadFailure(String table, String server);
110     public void publishReadTiming(String table, String server, long msTime);
111   }
112 
113   // Simple implementation of the canary sink that logs timings or
114   // failures to a file or to standard output.
115   public static class StdOutSink implements Sink {
116     private AtomicLong readFailureCount = new AtomicLong(0),
117         writeFailureCount = new AtomicLong(0);
118 
119     @Override
120     public long getReadFailureCount() {
121       return readFailureCount.get();
122     }
123 
124     @Override
125     public long incReadFailureCount() {
126       return readFailureCount.incrementAndGet();
127     }
128 
129     @Override
130     public void publishReadFailure(HRegionInfo region, Exception e) {
131       readFailureCount.incrementAndGet();
132       LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()), e);
133     }
134 
135     @Override
136     public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
137       readFailureCount.incrementAndGet();
138       LOG.error(String.format("read from region %s column family %s failed",
139                 region.getRegionNameAsString(), column.getNameAsString()), e);
140     }
141 
142     @Override
143     public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime) {
144       LOG.info(String.format("read from region %s column family %s in %dms",
145                region.getRegionNameAsString(), column.getNameAsString(), msTime));
146     }
147 
148     @Override
149     public long getWriteFailureCount() {
150       return writeFailureCount.get();
151     }
152 
153     @Override
154     public void publishWriteFailure(HRegionInfo region, Exception e) {
155       writeFailureCount.incrementAndGet();
156       LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()), e);
157     }
158 
159     @Override
160     public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
161       writeFailureCount.incrementAndGet();
162       LOG.error(String.format("write to region %s column family %s failed",
163         region.getRegionNameAsString(), column.getNameAsString()), e);
164     }
165 
166     @Override
167     public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime) {
168       LOG.info(String.format("write to region %s column family %s in %dms",
169         region.getRegionNameAsString(), column.getNameAsString(), msTime));
170     }
171   }
172   // An ExtendedSink implementation
173   public static class RegionServerStdOutSink extends StdOutSink implements ExtendedSink {
174 
175     @Override
176     public void publishReadFailure(String table, String server) {
177       incReadFailureCount();
178       LOG.error(String.format("Read from table:%s on region server:%s", table, server));
179     }
180 
181     @Override
182     public void publishReadTiming(String table, String server, long msTime) {
183       LOG.info(String.format("Read from table:%s on region server:%s in %dms",
184           table, server, msTime));
185     }
186   }
187 
188   /**
189    * For each column family of the region, tries to get one row and outputs the latency, or the
190    * failure.
191    */
192   public static class RegionTask implements Callable<Void> {
193     public enum TaskType{
194       READ, WRITE
195     }
196     private HConnection connection;
197     private HRegionInfo region;
198     private Sink sink;
199     private TaskType taskType;
200 
201     RegionTask(HConnection connection, HRegionInfo region, Sink sink, TaskType taskType) {
202       this.connection = connection;
203       this.region = region;
204       this.sink = sink;
205       this.taskType = taskType;
206     }
207 
208     @Override
209     public Void call() {
210       switch (taskType) {
211       case READ:
212         return read();
213       case WRITE:
214         return write();
215       default:
216         return read();
217       }
218     }
219 
220     public Void read() {
221       HTableInterface table = null;
222       HTableDescriptor tableDesc = null;
223       try {
224         if (LOG.isDebugEnabled()) {
225           LOG.debug(String.format("reading table descriptor for table %s",
226             region.getTable()));
227         }
228         table = connection.getTable(region.getTable());
229         tableDesc = table.getTableDescriptor();
230       } catch (IOException e) {
231         LOG.debug("sniffRegion failed", e);
232         sink.publishReadFailure(region, e);
233         if (table != null) {
234           try {
235             table.close();
236           } catch (IOException ioe) {
237             LOG.error("Close table failed", e);
238           }
239         }
240         return null;
241       }
242 
243       byte[] startKey = null;
244       Get get = null;
245       Scan scan = null;
246       ResultScanner rs = null;
247       StopWatch stopWatch = new StopWatch();
248       for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
249         stopWatch.reset();
250         startKey = region.getStartKey();
251         // Can't do a get on empty start row so do a Scan of first element if any instead.
252         if (startKey.length > 0) {
253           get = new Get(startKey);
254           get.setCacheBlocks(false);
255           get.setFilter(new FirstKeyOnlyFilter());
256           get.addFamily(column.getName());
257         } else {
258           scan = new Scan();
259           scan.setCaching(1);
260           scan.setCacheBlocks(false);
261           scan.setFilter(new FirstKeyOnlyFilter());
262           scan.addFamily(column.getName());
263           scan.setMaxResultSize(1L);
264           scan.setSmall(true);
265         }
266 
267         if (LOG.isDebugEnabled()) {
268           LOG.debug(String.format("reading from table %s region %s column family %s and key %s",
269             tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
270             Bytes.toStringBinary(startKey)));
271         }
272         try {
273           stopWatch.start();
274           if (startKey.length > 0) {
275             table.get(get);
276           } else {
277             rs = table.getScanner(scan);
278             rs.next();
279           }
280           stopWatch.stop();
281           sink.publishReadTiming(region, column, stopWatch.getTime());
282         } catch (Exception e) {
283           sink.publishReadFailure(region, column, e);
284         } finally {
285           if (rs != null) {
286             rs.close();
287           }
288           scan = null;
289           get = null;
290           startKey = null;
291         }
292       }
293       try {
294         table.close();
295       } catch (IOException e) {
296         LOG.error("Close table failed", e);
297       }
298       return null;
299     }
300 
301     /**
302      * Check writes for the canary table.
303      * @return always null; write failures are reported through the sink
304      */
305     private Void write() {
306       HTableInterface table = null;
307       HTableDescriptor tableDesc = null;
308       try {
309         table = connection.getTable(region.getTable());
310         tableDesc = table.getTableDescriptor();
311         byte[] rowToCheck = region.getStartKey();
312         if (rowToCheck.length == 0) {
313           rowToCheck = new byte[]{0x0};
314         }
315         int writeValueSize =
316             connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY, 10);
317         for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
318           Put put = new Put(rowToCheck);
319           byte[] value = new byte[writeValueSize];
320           Bytes.random(value);
321           put.add(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
322 
323           if (LOG.isDebugEnabled()) {
324             LOG.debug(String.format("writing to table %s region %s column family %s and key %s",
325               tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
326               Bytes.toStringBinary(rowToCheck)));
327           }
328           try {
329             long startTime = System.currentTimeMillis();
330             table.put(put);
331             long time = System.currentTimeMillis() - startTime;
332             sink.publishWriteTiming(region, column, time);
333           } catch (Exception e) {
334             sink.publishWriteFailure(region, column, e);
335           }
336         }
337         table.close();
338       } catch (IOException e) {
339         sink.publishWriteFailure(region, e);
340       }
341       return null;
342     }
343   }
344 
345   /**
346    * Gets one row from a region on the regionserver and outputs the latency, or the failure.
347    */
348   static class RegionServerTask implements Callable<Void> {
349     private HConnection connection;
350     private String serverName;
351     private HRegionInfo region;
352     private ExtendedSink sink;
353 
354     RegionServerTask(HConnection connection, String serverName, HRegionInfo region,
355         ExtendedSink sink) {
356       this.connection = connection;
357       this.serverName = serverName;
358       this.region = region;
359       this.sink = sink;
360     }
361 
362     @Override
363     public Void call() {
364       TableName tableName = null;
365       HTableInterface table = null;
366       Get get = null;
367       byte[] startKey = null;
368       Scan scan = null;
369       StopWatch stopWatch = new StopWatch();
370       // monitor one region on every region server
371       stopWatch.reset();
372       try {
373         tableName = region.getTable();
374         table = connection.getTable(tableName);
375         startKey = region.getStartKey();
376         // Can't do a get on empty start row so do a Scan of first element if any instead.
377         if (LOG.isDebugEnabled()) {
378           LOG.debug(String.format("reading from region server %s table %s region %s and key %s",
379             serverName, region.getTable(), region.getRegionNameAsString(),
380             Bytes.toStringBinary(startKey)));
381         }
382         if (startKey.length > 0) {
383           get = new Get(startKey);
384           get.setCacheBlocks(false);
385           get.setFilter(new FirstKeyOnlyFilter());
386           stopWatch.start();
387           table.get(get);
388           stopWatch.stop();
389         } else {
390           scan = new Scan();
391           scan.setCacheBlocks(false);
392           scan.setFilter(new FirstKeyOnlyFilter());
393           scan.setCaching(1);
394           scan.setMaxResultSize(1L);
395           scan.setSmall(true);
396           stopWatch.start();
397           ResultScanner s = table.getScanner(scan);
398           s.next();
399           s.close();
400           stopWatch.stop();
401         }
402         sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
403       } catch (TableNotFoundException tnfe) {
404         LOG.error("Table may be deleted", tnfe);
405         // This is ignored because it doesn't imply that the regionserver is dead
406       } catch (TableNotEnabledException tnee) {
407         // This is considered a success since we got a response.
408         LOG.debug("The targeted table was disabled.  Assuming success.");
409       } catch (DoNotRetryIOException dnrioe) {
410         sink.publishReadFailure(tableName.getNameAsString(), serverName);
411         LOG.error(dnrioe);
412       } catch (IOException e) {
413         sink.publishReadFailure(tableName.getNameAsString(), serverName);
414         LOG.error(e);
415       } finally {
416         if (table != null) {
417           try {
418             table.close();
419           } catch (IOException e) {
420             LOG.error("Close table failed", e);
421           }
422         }
423         scan = null;
424         get = null;
425         startKey = null;
426       }
427       return null;
428     }
429   }
430 
431   private static final int USAGE_EXIT_CODE = 1;
432   private static final int INIT_ERROR_EXIT_CODE = 2;
433   private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
434   private static final int ERROR_EXIT_CODE = 4;
435   private static final int FAILURE_EXIT_CODE = 5;
436 
437   private static final long DEFAULT_INTERVAL = 6000;
438 
439   private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
440   private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
441 
442   private static final Log LOG = LogFactory.getLog(Canary.class);
443 
444   public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
445     NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
446 
447   private static final String CANARY_TABLE_FAMILY_NAME = "Test";
448 
449   private Configuration conf = null;
450   private long interval = 0;
451   private Sink sink = null;
452 
453   private boolean useRegExp;
454   private long timeout = DEFAULT_TIMEOUT;
455   private boolean failOnError = true;
456   private boolean regionServerMode = false;
457   private boolean writeSniffing = false;
458   private boolean treatFailureAsError = false;
459   private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
460 
461   private ExecutorService executor; // threads to retrieve data from regionservers
462 
463   public Canary() {
464     this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink());
465   }
466 
467   public Canary(ExecutorService executor, Sink sink) {
468     this.executor = executor;
469     this.sink = sink;
470   }
471 
472   @Override
473   public Configuration getConf() {
474     return conf;
475   }
476 
477   @Override
478   public void setConf(Configuration conf) {
479     this.conf = conf;
480   }
481 
482   private int parseArgs(String[] args) {
483     int index = -1;
484     // Process command line args
485     for (int i = 0; i < args.length; i++) {
486       String cmd = args[i];
487 
488       if (cmd.startsWith("-")) {
489         if (index >= 0) {
490           // command line args must be in the form: [opts] [table 1 [table 2 ...]]
491           System.err.println("Invalid command line options");
492           printUsageAndExit();
493         }
494 
495         if (cmd.equals("-help")) {
496           // user asked for help, print the help and quit.
497           printUsageAndExit();
498         } else if (cmd.equals("-daemon") && interval == 0) {
499           // user asked for daemon mode, set a default interval between checks
500           interval = DEFAULT_INTERVAL;
501         } else if (cmd.equals("-interval")) {
502           // user has specified an interval for canary breaths (-interval N)
503           i++;
504 
505           if (i == args.length) {
506             System.err.println("-interval needs a numeric value argument.");
507             printUsageAndExit();
508           }
509 
510           try {
511             interval = Long.parseLong(args[i]) * 1000;
512           } catch (NumberFormatException e) {
513             System.err.println("-interval needs a numeric value argument.");
514             printUsageAndExit();
515           }
516         } else if(cmd.equals("-regionserver")) {
517           this.regionServerMode = true;
518         } else if(cmd.equals("-writeSniffing")) {
519           this.writeSniffing = true;
520         } else if(cmd.equals("-treatFailureAsError")) {
521           this.treatFailureAsError = true;
522         } else if (cmd.equals("-e")) {
523           this.useRegExp = true;
524         } else if (cmd.equals("-t")) {
525           i++;
526 
527           if (i == args.length) {
528             System.err.println("-t needs a numeric value argument.");
529             printUsageAndExit();
530           }
531 
532           try {
533             this.timeout = Long.parseLong(args[i]);
534           } catch (NumberFormatException e) {
535             System.err.println("-t needs a numeric value argument.");
536             printUsageAndExit();
537           }
538         } else if (cmd.equals("-writeTable")) {
539           i++;
540 
541           if (i == args.length) {
542             System.err.println("-writeTable needs a string value argument.");
543             printUsageAndExit();
544           }
545           this.writeTableName = TableName.valueOf(args[i]);
546         } else if (cmd.equals("-f")) {
547           i++;
548 
549           if (i == args.length) {
550             System.err
551                 .println("-f needs a boolean value argument (true|false).");
552             printUsageAndExit();
553           }
554 
555           this.failOnError = Boolean.parseBoolean(args[i]);
556         } else {
557           // no options match
558           System.err.println(cmd + " is not a valid option.");
559           printUsageAndExit();
560         }
561       } else if (index < 0) {
562         // keep track of first table name specified by the user
563         index = i;
564       }
565     }
566     return index;
567   }
568 
569   @Override
570   public int run(String[] args) throws Exception {
571     int index = parseArgs(args);
572 
573     // Launches chore for refreshing kerberos credentials if security is enabled.
574     // Please see http://hbase.apache.org/book.html#_running_canary_in_a_kerberos_enabled_cluster
575     // for more details.
576     AuthUtil.launchAuthChore(conf);
577 
578     // Start to prepare the monitor
579     Monitor monitor = null;
580     Thread monitorThread = null;
581     long startTime = 0;
582     long currentTimeLength = 0;
583     // Get a connection to use below.
584     HConnection connection = HConnectionManager.createConnection(this.conf);
585     try {
586       do {
587         // Do monitor !!
588         try {
589           monitor = this.newMonitor(connection, index, args);
590           monitorThread = new Thread(monitor);
591           startTime = System.currentTimeMillis();
592           monitorThread.start();
593           while (!monitor.isDone()) {
594             // wait for 1 sec
595             Thread.sleep(1000);
596             // exit if any error occurs
597             if (this.failOnError && monitor.hasError()) {
598               monitorThread.interrupt();
599               if (monitor.initialized) {
600                 return monitor.errorCode;
601               } else {
602                 return INIT_ERROR_EXIT_CODE;
603               }
604             }
605             currentTimeLength = System.currentTimeMillis() - startTime;
606             if (currentTimeLength > this.timeout) {
607               LOG.error("The monitor is running too long (" + currentTimeLength
608                   + ") after timeout limit:" + this.timeout
609                   + " will be killed itself !!");
610               if (monitor.initialized) {
611                 return TIMEOUT_ERROR_EXIT_CODE;
612               } else {
613                 return INIT_ERROR_EXIT_CODE;
614               }
615             }
616           }
617 
618           if (this.failOnError && monitor.finalCheckForErrors()) {
619             monitorThread.interrupt();
620             return monitor.errorCode;
621           }
622         } finally {
623           if (monitor != null) monitor.close();
624         }
625 
626         Thread.sleep(interval);
627       } while (interval > 0);
628     } finally {
629       connection.close();
630     }
631 
632     return monitor.errorCode;
633   }
634 
635   private void printUsageAndExit() {
636     System.err.printf(
637       "Usage: bin/hbase %s [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]%n",
638         getClass().getName());
639     System.err.println(" where [opts] are:");
640     System.err.println("   -help          Show this help and exit.");
641     System.err.println("   -regionserver  replace the table argument to regionserver,");
642     System.err.println("      which means to enable regionserver mode");
643     System.err.println("   -daemon        Continuous check at defined intervals.");
644     System.err.println("   -interval <N>  Interval between checks (sec)");
645     System.err.println("   -e             Use table/regionserver as regular expression");
646     System.err.println("      which means the table/regionserver is regular expression pattern");
647     System.err.println("   -f <B>         stop whole program if first error occurs," +
648         " default is true");
649     System.err.println("   -t <N>         timeout for a check, default is 600000 (milisecs)");
650     System.err.println("   -writeSniffing enable the write sniffing in canary");
651     System.err.println("   -treatFailureAsError treats read / write failure as error");
652     System.err.println("   -writeTable    The table used for write sniffing."
653         + " Default is hbase:canary");
654     System.err
655         .println("   -D<configProperty>=<value> set or override configuration properties");
656     System.exit(USAGE_EXIT_CODE);
657   }
658 
659   /**
660    * A factory method for {@link Monitor}.
661    * Can be overridden by the user.
662    * @param index a start index for monitor target
663    * @param args args passed from user
664    * @return a Monitor instance
665    */
666   public Monitor newMonitor(final HConnection connection, int index, String[] args) {
667     Monitor monitor = null;
668     String[] monitorTargets = null;
669 
670     if(index >= 0) {
671       int length = args.length - index;
672       monitorTargets = new String[length];
673       System.arraycopy(args, index, monitorTargets, 0, length);
674     }
675 
676     if (this.regionServerMode) {
677       monitor =
678           new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
679               (ExtendedSink) this.sink, this.executor, this.treatFailureAsError);
680     } else {
681       monitor =
682           new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor,
683               this.writeSniffing, this.writeTableName, this.treatFailureAsError);
684     }
685     return monitor;
686   }
687 
688   // A Monitor superclass that can be extended by users
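  //
  // A minimal sketch (hypothetical, not part of this file) of such a user-supplied
  // subclass, showing only the required constructor delegation and run() override:
  //
  //   public class NoOpMonitor extends Canary.Monitor {
  //     public NoOpMonitor(HConnection connection, String[] targets, boolean useRegExp,
  //         Sink sink, ExecutorService executor, boolean treatFailureAsError) {
  //       super(connection, targets, useRegExp, sink, executor, treatFailureAsError);
  //     }
  //     @Override
  //     public void run() {
  //       // a real monitor would sniff regions here, then flag completion
  //       this.initialized = true;
  //       this.done = true;
  //     }
  //   }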
689   public static abstract class Monitor implements Runnable, Closeable {
690 
691     protected HConnection connection;
692     protected HBaseAdmin admin;
693     protected String[] targets;
694     protected boolean useRegExp;
695     protected boolean treatFailureAsError;
696     protected boolean initialized = false;
697 
698     protected boolean done = false;
699     protected int errorCode = 0;
700     protected Sink sink;
701     protected ExecutorService executor;
702 
703     public boolean isDone() {
704       return done;
705     }
706 
707     public boolean hasError() {
708       return errorCode != 0;
709     }
710 
711     public boolean finalCheckForErrors() {
712       if (errorCode != 0) {
713         return true;
714       }
715       if (treatFailureAsError &&
716           (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0)) {
717         errorCode = FAILURE_EXIT_CODE;
718         return true;
719       }
720       return false;
721     }
722 
723     @Override
724     public void close() throws IOException {
725       if (this.admin != null) this.admin.close();
726     }
727 
728     protected Monitor(HConnection connection, String[] monitorTargets, boolean useRegExp, Sink sink,
729         ExecutorService executor, boolean treatFailureAsError) {
730       if (null == connection) throw new IllegalArgumentException("connection shall not be null");
731 
732       this.connection = connection;
733       this.targets = monitorTargets;
734       this.useRegExp = useRegExp;
735       this.sink = sink;
736       this.executor = executor;
737     }
738 
739     @Override
740     public abstract void run();
741 
742     protected boolean initAdmin() {
743       if (null == this.admin) {
744         try {
745           this.admin = new HBaseAdmin(connection);
746         } catch (Exception e) {
747           LOG.error("Initial HBaseAdmin failed...", e);
748           this.errorCode = INIT_ERROR_EXIT_CODE;
749         }
750       } else if (admin.isAborted()) {
751         LOG.error("HBaseAdmin aborted");
752         this.errorCode = INIT_ERROR_EXIT_CODE;
753       }
754       return !this.hasError();
755     }
756   }
757 
758   // a monitor for region mode
759   private static class RegionMonitor extends Monitor {
760     // 10 minutes
761     private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
762     // 1 day
763     private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;
764 
765     private long lastCheckTime = -1;
766     private boolean writeSniffing;
767     private TableName writeTableName;
768     private int writeDataTTL;
769     private float regionsLowerLimit;
770     private float regionsUpperLimit;
771     private int checkPeriod;
772 
773     public RegionMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
774         Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
775         boolean treatFailureAsError) {
776       super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError);
777       Configuration conf = connection.getConfiguration();
778       this.writeSniffing = writeSniffing;
779       this.writeTableName = writeTableName;
780       this.writeDataTTL =
781           conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
782       this.regionsLowerLimit =
783           conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
784       this.regionsUpperLimit =
785           conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
786       this.checkPeriod =
787           conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
788             DEFAULT_WRITE_TABLE_CHECK_PERIOD);
789     }
790 
791     @Override
792     public void run() {
793       if (this.initAdmin()) {
794         try {
795           List<Future<Void>> taskFutures = new LinkedList<Future<Void>>();
796           if (this.targets != null && this.targets.length > 0) {
797             String[] tables = generateMonitorTables(this.targets);
798             this.initialized = true;
799             for (String table : tables) {
800               taskFutures.addAll(Canary.sniff(connection, sink, table, executor, TaskType.READ));
801             }
802           } else {
803             taskFutures.addAll(sniff(TaskType.READ));
804           }
805 
806           if (writeSniffing) {
807             if (EnvironmentEdgeManager.currentTimeMillis() - lastCheckTime > checkPeriod) {
808               try {
809                 checkWriteTableDistribution();
810               } catch (IOException e) {
811                 LOG.error("Check canary table distribution failed!", e);
812               }
813               lastCheckTime = EnvironmentEdgeManager.currentTimeMillis();
814             }
815             // sniff canary table with write operation
816             taskFutures.addAll(Canary.sniff(connection, sink,
817               writeTableName.getNameAsString(), executor, TaskType.WRITE));
818           }
819 
820           for (Future<Void> future : taskFutures) {
821             try {
822               future.get();
823             } catch (ExecutionException e) {
824               LOG.error("Sniff region failed!", e);
825             }
826           }
827         } catch (Exception e) {
828           LOG.error("Run regionMonitor failed", e);
829           this.errorCode = ERROR_EXIT_CODE;
830         }
831       }
832       this.done = true;
833     }
834 
835     private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
836       String[] returnTables = null;
837 
838       if (this.useRegExp) {
839         Pattern pattern = null;
840         HTableDescriptor[] tds = null;
841         Set<String> tmpTables = new TreeSet<String>();
842         try {
843           if (LOG.isDebugEnabled()) {
844             LOG.debug(String.format("reading list of tables"));
845           }
846           tds = this.admin.listTables(pattern);
847           if (tds == null) {
848             tds = new HTableDescriptor[0];
849           }
850           for (String monitorTarget : monitorTargets) {
851             pattern = Pattern.compile(monitorTarget);
852             for (HTableDescriptor td : tds) {
853               if (pattern.matcher(td.getNameAsString()).matches()) {
854                 tmpTables.add(td.getNameAsString());
855               }
856             }
857           }
858         } catch (IOException e) {
859           LOG.error("Communicate with admin failed", e);
860           throw e;
861         }
862 
863         if (tmpTables.size() > 0) {
864           returnTables = tmpTables.toArray(new String[tmpTables.size()]);
865         } else {
866           String msg = "No HTable found, tablePattern:" + Arrays.toString(monitorTargets);
867           LOG.error(msg);
868           this.errorCode = INIT_ERROR_EXIT_CODE;
869           throw new TableNotFoundException(msg);
870         }
871       } else {
872         returnTables = monitorTargets;
873       }
874 
875       return returnTables;
876     }
877 
878     /*
879      * canary entry point to monitor all the tables.
880      */
881     private List<Future<Void>> sniff(TaskType taskType) throws Exception {
882       if (LOG.isDebugEnabled()) {
883         LOG.debug(String.format("reading list of tables"));
884       }
885       List<Future<Void>> taskFutures = new LinkedList<Future<Void>>();
886       for (HTableDescriptor table : admin.listTables()) {
887         if (admin.isTableEnabled(table.getTableName())
888             && (!table.getTableName().equals(writeTableName))) {
889           taskFutures.addAll(Canary.sniff(connection, sink, table.getTableName(), executor,
890             taskType));
891         }
892       }
893       return taskFutures;
894     }
895 
896     private void checkWriteTableDistribution() throws IOException, ServiceException {
897       if (!admin.tableExists(writeTableName)) {
898         int numberOfServers = admin.getClusterStatus().getServers().size();
899         if (numberOfServers == 0) {
900           throw new IllegalStateException("No live regionservers");
901         }
902         createWriteTable(numberOfServers);
903       }
904 
905       if (!admin.isTableEnabled(writeTableName)) {
906         admin.enableTable(writeTableName);
907       }
908 
909       int numberOfServers = admin.getClusterStatus().getServers().size();
910       List<HRegionLocation> locations = connection.locateRegions(writeTableName);
911       int numberOfRegions = locations.size();
912       if (numberOfRegions < numberOfServers * regionsLowerLimit
913           || numberOfRegions > numberOfServers * regionsUpperLimit) {
914         admin.disableTable(writeTableName);
915         admin.deleteTable(writeTableName);
916         createWriteTable(numberOfServers);
917       }
918       HashSet<ServerName> serverSet = new HashSet<ServerName>();
919       for (HRegionLocation location: locations) {
920         serverSet.add(location.getServerName());
921       }
922       int numberOfCoveredServers = serverSet.size();
923       if (numberOfCoveredServers < numberOfServers) {
924         admin.balancer();
925       }
926     }
927 
928     private void createWriteTable(int numberOfServers) throws IOException {
929       int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
930       LOG.info("Number of live regionservers: " + numberOfServers + ", "
931           + "pre-splitting the canary table into " + numberOfRegions + " regions "
932           + "(current lower limit of regions per server is " + regionsLowerLimit
933           + " and you can change it by config: "
934           + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
935       HTableDescriptor desc = new HTableDescriptor(writeTableName);
936       HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
937       family.setMaxVersions(1);
938       family.setTimeToLive(writeDataTTL);
939 
940       desc.addFamily(family);
941       byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
942       admin.createTable(desc, splits);
943     }
944   }
945 
946   /**
947    * Canary entry point for a specified table.
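   *
   * For example, a caller might sniff a single table for reads roughly like this
   * (an illustrative sketch only; "myTable" is a placeholder table name):
   *   HConnection connection = HConnectionManager.createConnection(conf);
   *   Canary.sniff(connection, TableName.valueOf("myTable"), TaskType.READ);
   *   connection.close();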
948    * @throws Exception
949    */
950   public static void sniff(final HConnection connection, TableName tableName, TaskType taskType)
951       throws Exception {
952     List<Future<Void>> taskFutures =
953         Canary.sniff(connection, new StdOutSink(), tableName.getNameAsString(),
954           new ScheduledThreadPoolExecutor(1), taskType);
955     for (Future<Void> future : taskFutures) {
956       future.get();
957     }
958   }
959 
960   /**
961    * Canary entry point for a specified table.
962    * @throws Exception
963    */
964   private static List<Future<Void>> sniff(final HConnection connection, final Sink sink,
965     String tableName, ExecutorService executor, TaskType taskType) throws Exception {
966     if (LOG.isDebugEnabled()) {
967       LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s",
968         tableName));
969     }
970     HBaseAdmin admin = new HBaseAdmin(connection);
971     try {
972       if (admin.isTableEnabled(TableName.valueOf(tableName))) {
973         return Canary.sniff(connection, sink, TableName.valueOf(tableName), executor,
974           taskType);
975       } else {
976         LOG.warn(String.format("Table %s is not enabled", tableName));
977       }
978       return new LinkedList<Future<Void>>();
979     } finally {
980       admin.close();
981     }
982   }
983 
984   /*
985    * Loops over the regions of this table, and outputs some information about their state.
986    */
987   private static List<Future<Void>> sniff(final HConnection connection, final Sink sink,
988       TableName tableName, ExecutorService executor, TaskType taskType) throws Exception {
989     if (LOG.isDebugEnabled()) {
990       LOG.debug(String.format("reading list of regions for table %s", tableName));
991     }
992     HTableInterface table = null;
993     try {
994       table = connection.getTable(tableName);
995     } catch (TableNotFoundException e) {
996       return new ArrayList<Future<Void>>();
997     }
998     List<RegionTask> tasks = new ArrayList<RegionTask>();
999     try {
1000       for (HRegionInfo region : ((HTable)table).getRegionLocations().keySet()) {
1001         tasks.add(new RegionTask(connection, region, sink, taskType));
1002       }
1003     } finally {
1004       table.close();
1005     }
1006     return executor.invokeAll(tasks);
1007   }
1008 
1009   /*
1010    * For each column family of the region tries to get one row and outputs the latency, or the
1011    * failure.
1012    */
1013   private static void sniffRegion(
1014       final HBaseAdmin admin,
1015       final Sink sink,
1016       HRegionInfo region,
1017       HTableInterface table) throws Exception {
1018     HTableDescriptor tableDesc = table.getTableDescriptor();
1019     byte[] startKey = null;
1020     Get get = null;
1021     Scan scan = null;
1022     ResultScanner rs = null;
1023     StopWatch stopWatch = new StopWatch();
1024     for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
1025       stopWatch.reset();
1026       startKey = region.getStartKey();
1027       // Can't do a get on empty start row so do a Scan of first element if any instead.
1028       if (startKey.length > 0) {
1029         get = new Get(startKey);
1030         get.setCacheBlocks(false);
1031         get.setFilter(new FirstKeyOnlyFilter());
1032         get.addFamily(column.getName());
1033       } else {
1034         scan = new Scan();
1035         scan.setRaw(true);
1036         scan.setCaching(1);
1037         scan.setCacheBlocks(false);
1038         scan.setFilter(new FirstKeyOnlyFilter());
1039         scan.addFamily(column.getName());
1040         scan.setMaxResultSize(1L);
1041       }
1042 
1043       try {
1044         if (startKey.length > 0) {
1045           stopWatch.start();
1046           table.get(get);
1047           stopWatch.stop();
1048           sink.publishReadTiming(region, column, stopWatch.getTime());
1049         } else {
1050           stopWatch.start();
1051           rs = table.getScanner(scan);
1052           stopWatch.stop();
1053           sink.publishReadTiming(region, column, stopWatch.getTime());
1054         }
1055       } catch (Exception e) {
1056         sink.publishReadFailure(region, column, e);
1057       } finally {
1058         if (rs != null) {
1059           rs.close();
1060         }
1061         scan = null;
1062         get = null;
1063         startKey = null;
1064       }
1065     }
1066   }
1067   // a monitor for regionserver mode
1068   private static class RegionServerMonitor extends Monitor {
1069 
1070     public RegionServerMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
1071         ExtendedSink sink, ExecutorService executor, boolean treatFailureAsError) {
1072       super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError);
1073     }
1074 
1075     private ExtendedSink getSink() {
1076       return (ExtendedSink) this.sink;
1077     }
1078 
1079     @Override
1080     public void run() {
1081       if (this.initAdmin() && this.checkNoTableNames()) {
1082         Map<String, List<HRegionInfo>> rsAndRMap = this.filterRegionServerByName();
1083         this.initialized = true;
1084         this.monitorRegionServers(rsAndRMap);
1085       }
1086       this.done = true;
1087     }
1088 
1089     private boolean checkNoTableNames() {
1090       List<String> foundTableNames = new ArrayList<String>();
1091       TableName[] tableNames = null;
1092 
1093       if (LOG.isDebugEnabled()) {
1094         LOG.debug(String.format("reading list of tables"));
1095       }
1096       try {
1097         tableNames = this.admin.listTableNames();
1098       } catch (IOException e) {
1099         LOG.error("Get listTableNames failed", e);
1100         this.errorCode = INIT_ERROR_EXIT_CODE;
1101         return false;
1102       }
1103 
1104       if (this.targets == null || this.targets.length == 0) return true;
1105 
1106       for (String target : this.targets) {
1107         for (TableName tableName : tableNames) {
1108           if (target.equals(tableName.getNameAsString())) {
1109             foundTableNames.add(target);
1110           }
1111         }
1112       }
1113 
1114       if (foundTableNames.size() > 0) {
1115         System.err.println("Cannot pass a tablename when using the -regionserver " +
1116             "option, tablenames:" + foundTableNames.toString());
1117         this.errorCode = USAGE_EXIT_CODE;
1118       }
1119       return foundTableNames.size() == 0;
1120     }
1121 
1122     private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap) {
1123       List<RegionServerTask> tasks = new ArrayList<RegionServerTask>();
1124       Random rand = new Random();
1125       // monitor one region on every region server
1126       for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
1127         String serverName = entry.getKey();
1128         // randomly select a region
1129         HRegionInfo region = entry.getValue().get(rand.nextInt(entry.getValue().size()));
1130         tasks.add(new RegionServerTask(this.connection, serverName, region, getSink()));
1131       }
1132       try {
1133         for (Future<Void> future : this.executor.invokeAll(tasks)) {
1134           try {
1135             future.get();
1136           } catch (ExecutionException e) {
1137             LOG.error("Sniff regionserver failed!", e);
1138             this.errorCode = ERROR_EXIT_CODE;
1139           }
1140         }
1141       } catch (InterruptedException e) {
1142         this.errorCode = ERROR_EXIT_CODE;
1143         LOG.error("Sniff regionserver interrupted!", e);
1144       }
1145     }
1146 
1147     private Map<String, List<HRegionInfo>> filterRegionServerByName() {
1148       Map<String, List<HRegionInfo>> regionServerAndRegionsMap = this.getAllRegionServerByName();
1149       regionServerAndRegionsMap = this.doFilterRegionServerByName(regionServerAndRegionsMap);
1150       return regionServerAndRegionsMap;
1151     }
1152 
1153     private Map<String, List<HRegionInfo>> getAllRegionServerByName() {
1154       Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<String, List<HRegionInfo>>();
1155       HTableInterface table = null;
1156       try {
1157         if (LOG.isDebugEnabled()) {
1158           LOG.debug(String.format("reading list of tables and locations"));
1159         }
1160         HTableDescriptor[] tableDescs = this.admin.listTables();
1161         List<HRegionInfo> regions = null;
1162         for (HTableDescriptor tableDesc : tableDescs) {
1163           table = this.admin.getConnection().getTable(tableDesc.getTableName());
1164           for (Entry<HRegionInfo, ServerName> e: ((HTable)table).getRegionLocations().entrySet()) {
1165             HRegionInfo r = e.getKey();
1166             ServerName rs = e.getValue();
1167             String rsName = rs.getHostname();
1168 
1169             if (rsAndRMap.containsKey(rsName)) {
1170               regions = rsAndRMap.get(rsName);
1171             } else {
1172               regions = new ArrayList<HRegionInfo>();
1173               rsAndRMap.put(rsName, regions);
1174             }
1175             regions.add(r);
1176           }
1177           table.close();
1178         }
1179 
1180       } catch (IOException e) {
1181         String msg = "Get HTables info failed";
1182         LOG.error(msg, e);
1183         this.errorCode = INIT_ERROR_EXIT_CODE;
1184       } finally {
1185         if (table != null) {
1186           try {
1187             table.close();
1188           } catch (IOException e) {
1189             LOG.warn("Close table failed", e);
1190           }
1191         }
1192       }
1193 
1194       return rsAndRMap;
1195     }
1196 
1197     private Map<String, List<HRegionInfo>> doFilterRegionServerByName(
1198         Map<String, List<HRegionInfo>> fullRsAndRMap) {
1199 
1200       Map<String, List<HRegionInfo>> filteredRsAndRMap = null;
1201 
1202       if (this.targets != null && this.targets.length > 0) {
1203         filteredRsAndRMap = new HashMap<String, List<HRegionInfo>>();
1204         Pattern pattern = null;
1205         Matcher matcher = null;
1206         boolean regExpFound = false;
1207         for (String rsName : this.targets) {
1208           if (this.useRegExp) {
1209             regExpFound = false;
1210             pattern = Pattern.compile(rsName);
1211             for (Map.Entry<String, List<HRegionInfo>> entry : fullRsAndRMap.entrySet()) {
1212               matcher = pattern.matcher(entry.getKey());
1213               if (matcher.matches()) {
1214                 filteredRsAndRMap.put(entry.getKey(), entry.getValue());
1215                 regExpFound = true;
1216               }
1217             }
1218             if (!regExpFound) {
1219               LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName);
1220             }
1221           } else {
1222             if (fullRsAndRMap.containsKey(rsName)) {
1223               filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
1224             } else {
1225               LOG.info("No RegionServerInfo found, regionServerName:" + rsName);
1226             }
1227           }
1228         }
1229       } else {
1230         filteredRsAndRMap = fullRsAndRMap;
1231       }
1232       return filteredRsAndRMap;
1233     }
1234   }
1235 
1236   public static void main(String[] args) throws Exception {
1237     final Configuration conf = HBaseConfiguration.create();
1238     
1239     // load the generic options into conf
1240     new GenericOptionsParser(conf, args);
1241     
1242     AuthUtil.launchAuthChore(conf);
1243 
1244     int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
1245     LOG.info("Number of exection threads " + numThreads);
1246 
1247     ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
1248 
1249     Class<? extends Sink> sinkClass =
1250         conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class);
1251     Sink sink = ReflectionUtils.newInstance(sinkClass);
1252 
1253     int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
1254     executor.shutdown();
1255     System.exit(exitCode);
1256   }
1257 }