1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.replication;
20  
21  
22  import org.apache.commons.logging.Log;
23  import org.apache.commons.logging.LogFactory;
24  import org.apache.hadoop.hbase.HBaseTestingUtility;
25  import org.apache.hadoop.hbase.LargeTests;
26  import org.apache.hadoop.hbase.UnknownScannerException;
27  import org.apache.hadoop.hbase.client.Result;
28  import org.apache.hadoop.hbase.client.ResultScanner;
29  import org.apache.hadoop.hbase.client.Scan;
30  import org.junit.Test;
31  import org.junit.experimental.categories.Category;
32  
33  import static org.junit.Assert.fail;
34  
35  @Category(LargeTests.class)
36  public class TestReplicationQueueFailover extends TestReplicationBase {
37  
38    private static final Log LOG = LogFactory.getLog(TestReplicationQueueFailover.class);
39  
40    /**
41     * Load up multiple tables over 2 region servers and kill a source during
42     * the upload. The failover happens internally.
43     *
44     * WARNING this test sometimes fails because of HBASE-3515
45     *
46     * @throws Exception
47     */
48    @Test(timeout=300000)
49    public void queueFailover() throws Exception {
50      // killing the RS with .META. can result into failed puts until we solve
51      // IO fencing
52      int rsToKill1 =
53          utility1.getHBaseCluster().getServerWithMeta() == 0 ? 1 : 0;
54      int rsToKill2 =
55          utility2.getHBaseCluster().getServerWithMeta() == 0 ? 1 : 0;
56  
57      // Takes about 20 secs to run the full loading, kill around the middle
58      Thread killer1 = killARegionServer(utility1, 7500, rsToKill1);
59      Thread killer2 = killARegionServer(utility2, 10000, rsToKill2);
60  
61      LOG.info("Start loading table");
62      int initialCount = utility1.loadTable(htable1, famName);
63      LOG.info("Done loading table");
64      killer1.join(5000);
65      killer2.join(5000);
66      LOG.info("Done waiting for threads");
67  
68      Result[] res;
69      while (true) {
70        try {
71          Scan scan = new Scan();
72          ResultScanner scanner = htable1.getScanner(scan);
73          res = scanner.next(initialCount);
74          scanner.close();
75          break;
76        } catch (UnknownScannerException ex) {
77          LOG.info("Cluster wasn't ready yet, restarting scanner");
78        }
79      }
80      // Test we actually have all the rows, we may miss some because we
81      // don't have IO fencing.
82      if (res.length != initialCount) {
83        LOG.warn("We lost some rows on the master cluster!");
84        // We don't really expect the other cluster to have more rows
85        initialCount = res.length;
86      }
87  
88      int lastCount = 0;
89  
90      final long start = System.currentTimeMillis();
91      int i = 0;
92      while (true) {
93        if (i==NB_RETRIES-1) {
94          fail("Waited too much time for queueFailover replication. " +
95              "Waited "+(System.currentTimeMillis() - start)+"ms.");
96        }
97        Scan scan2 = new Scan();
98        ResultScanner scanner2 = htable2.getScanner(scan2);
99        Result[] res2 = scanner2.next(initialCount * 2);
100       scanner2.close();
101       if (res2.length < initialCount) {
102         if (lastCount < res2.length) {
103           i--; // Don't increment timeout if we make progress
104         } else {
105           i++;
106         }
107         lastCount = res2.length;
108         LOG.info("Only got " + lastCount + " rows instead of " +
109             initialCount + " current i=" + i);
110         Thread.sleep(SLEEP_TIME*2);
111       } else {
112         break;
113       }
114     }
115   }
116 
117   private static Thread killARegionServer(final HBaseTestingUtility utility,
118                                           final long timeout, final int rs) {
119     Thread killer = new Thread() {
120       public void run() {
121         try {
122           Thread.sleep(timeout);
123           utility.expireRegionServerSession(rs);
124         } catch (Exception e) {
125           LOG.error("Couldn't kill a region server", e);
126         }
127       }
128     };
129     killer.setDaemon(true);
130     killer.start();
131     return killer;
132   }
133 }