1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.master;
21  
22  import java.io.IOException;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.HBaseTestingUtility;
27  import org.apache.hadoop.hbase.HConstants;
28  import org.apache.hadoop.hbase.HRegionInfo;
29  import org.apache.hadoop.hbase.client.HTable;
30  import org.apache.hadoop.hbase.client.Put;
31  import org.apache.hadoop.hbase.client.Result;
32  import org.apache.hadoop.hbase.client.ResultScanner;
33  import org.apache.hadoop.hbase.client.Scan;
34  import org.apache.hadoop.hbase.util.Bytes;
35  import org.apache.hadoop.hbase.util.Writables;
36  import org.junit.AfterClass;
37  import org.junit.Assert;
38  import org.junit.Before;
39  import org.junit.BeforeClass;
40  import org.junit.Ignore;
41  import org.junit.Test;
42  
43  /**
44   * Test transitions of state across the master.  Sets up the cluster once and
45   * then runs a couple of tests.
46   */
47  public class TestMasterTransitions {
48    private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
49    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
50    private static final String TABLENAME = "master_transitions";
51    private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
52      Bytes.toBytes("b"), Bytes.toBytes("c")};
53  
54    /**
55     * Start up a mini cluster and put a small table of many empty regions into it.
56     * @throws Exception
57     */
58    @BeforeClass public static void beforeAllTests() throws Exception {
59      TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
60      TEST_UTIL.startMiniCluster(2);
61      // Create a table of three families.  This will assign a region.
62      TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES);
63      HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
64      int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
65      TEST_UTIL.waitUntilAllRegionsAssigned(countOfRegions);
66      addToEachStartKey(countOfRegions);
67    }
68  
69    @AfterClass public static void afterAllTests() throws IOException {
70      TEST_UTIL.shutdownMiniCluster();
71    }
72  
73    @Before public void setup() throws IOException {
74      TEST_UTIL.ensureSomeRegionServersAvailable(2);
75    }
76  
77    /**
78     * Listener for regionserver events testing hbase-2428 (Infinite loop of
79     * region closes if META region is offline).  In particular, listen
80     * for the close of the 'metaServer' and when it comes in, requeue it with a
81     * delay as though there were an issue processing the shutdown.  As part of
82     * the requeuing,  send over a close of a region on 'otherServer' so it comes
83     * into a master that has its meta region marked as offline.
84     */
85    /*
86    static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
88      private final Set<RegionServerOperation> postponed =
89        new CopyOnWriteArraySet<RegionServerOperation>();
90      private boolean done = false;;
91      private boolean metaShutdownReceived = false;
92      private final HServerAddress metaAddress;
93      private final MiniHBaseCluster cluster;
94      private final int otherServerIndex;
95      private final HRegionInfo hri;
96      private int closeCount = 0;
97      static final int SERVER_DURATION = 3 * 1000;
98      static final int CLOSE_DURATION = 1 * 1000;
99   
100     HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
101         final HRegionInfo closingHRI, final int otherServerIndex) {
102       this.cluster = c;
103       this.metaAddress = metaAddress;
104       this.hri = closingHRI;
105       this.otherServerIndex = otherServerIndex;
106     }
107 
108     @Override
109     public boolean process(final RegionServerOperation op) throws IOException {
110       // If a regionserver shutdown and its of the meta server, then we want to
111       // delay the processing of the shutdown and send off a close of a region on
112       // the 'otherServer.
113       boolean result = true;
114       if (op instanceof ProcessServerShutdown) {
115         ProcessServerShutdown pss = (ProcessServerShutdown)op;
116         if (pss.getDeadServerAddress().equals(this.metaAddress)) {
117           // Don't postpone more than once.
118           if (!this.postponed.contains(pss)) {
119             // Close some region.
120             this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
121               new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
122               Bytes.toBytes("Forcing close in test")));
123             this.postponed.add(pss);
124             // Put off the processing of the regionserver shutdown processing.
125             pss.setDelay(SERVER_DURATION);
126             this.metaShutdownReceived = true;
127             // Return false.  This will add this op to the delayed queue.
128             result = false;
129           }
130         }
131       } else {
132         // Have the close run frequently.
133         if (isWantedCloseOperation(op) != null) {
134           op.setDelay(CLOSE_DURATION);
135           // Count how many times it comes through here.
136           this.closeCount++;
137         }
138       }
139       return result;
140     }
141 
142     public void processed(final RegionServerOperation op) {
143       if (isWantedCloseOperation(op) != null) return;
144       this.done = true;
145     }
146 */
147     /*
148      * @param op
149      * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
150      * cast as a ProcessRegionClose.
151      */
152   /*
153     private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
154       // Count every time we get a close operation.
155       if (op instanceof ProcessRegionClose) {
156         ProcessRegionClose c = (ProcessRegionClose)op;
157         if (c.regionInfo.equals(hri)) {
158           return c;
159         }
160       }
161       return null;
162     }
163 
164     boolean isDone() {
165       return this.done;
166     }
167 
168     boolean isMetaShutdownReceived() {
169       return metaShutdownReceived;
170     }
171 
172     int getCloseCount() {
173       return this.closeCount;
174     }
175 
176     @Override
177     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
178       return true;
179     }
180   }
181 */
182   /**
183    * In 2428, the meta region has just been set offline and then a close comes
184    * in.
185    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a> 
186    */
187   @Ignore @Test  (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
188   throws Exception {
189     /*
190     LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
191     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
192     final HMaster master = cluster.getMaster();
193     int metaIndex = cluster.getServerWithMeta();
194     // Figure the index of the server that is not server the .META.
195     int otherServerIndex = -1;
196     for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
197       if (i == metaIndex) continue;
198       otherServerIndex = i;
199       break;
200     }
201     final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
202     final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
203 
204     // Get a region out on the otherServer.
205     final HRegionInfo hri =
206       otherServer.getOnlineRegions().iterator().next().getRegionInfo();
207  
208     // Add our RegionServerOperationsListener
209     HBase2428Listener listener = new HBase2428Listener(cluster,
210       metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
211     master.getRegionServerOperationQueue().
212       registerRegionServerOperationListener(listener);
213     try {
214       // Now close the server carrying meta.
215       cluster.abortRegionServer(metaIndex);
216 
217       // First wait on receipt of meta server shutdown message.
218       while(!listener.metaShutdownReceived) Threads.sleep(100);
219       while(!listener.isDone()) Threads.sleep(10);
220       // We should not have retried the close more times than it took for the
221       // server shutdown message to exit the delay queue and get processed
222       // (Multiple by two to add in some slop in case of GC or something).
223       assertTrue(listener.getCloseCount() > 1);
224       assertTrue(listener.getCloseCount() <
225         ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
226 
227       // Assert the closed region came back online
228       assertRegionIsBackOnline(hri);
229     } finally {
230       master.getRegionServerOperationQueue().
231         unregisterRegionServerOperationListener(listener);
232     }
233     */
234   }
235 
236   /**
237    * Test adding in a new server before old one on same host+port is dead.
238    * Make the test more onerous by having the server under test carry the meta.
239    * If confusion between old and new, purportedly meta never comes back.  Test
240    * that meta gets redeployed.
241    */
242   @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
243   throws IOException {
244     /*
245     LOG.info("Running testAddingServerBeforeOldIsDead2413");
246     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
247     int count = count();
248     int metaIndex = cluster.getServerWithMeta();
249     MiniHBaseClusterRegionServer metaHRS =
250       (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
251     int port = metaHRS.getServerInfo().getServerAddress().getPort();
252     Configuration c = TEST_UTIL.getConfiguration();
253     String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
254     try {
255       LOG.info("KILLED=" + metaHRS);
256       metaHRS.kill();
257       c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
258       // Try and start new regionserver.  It might clash with the old
259       // regionserver port so keep trying to get past the BindException.
260       HRegionServer hrs = null;
261       while (true) {
262         try {
263           hrs = cluster.startRegionServer().getRegionServer();
264           break;
265         } catch (IOException e) {
266           if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
267             InvocationTargetException ee = (InvocationTargetException)e.getCause();
268             if (ee.getCause() != null && ee.getCause() instanceof BindException) {
269               LOG.info("BindException; retrying: " + e.toString());
270             }
271           }
272         }
273       }
274       LOG.info("STARTED=" + hrs);
275       // Wait until he's been given at least 3 regions before we go on to try
276       // and count rows in table.
277       while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
278       LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
279         " regions");
280       assertEquals(count, count());
281     } finally {
282       c.set(HConstants.REGIONSERVER_PORT, oldPort);
283     }
284     */
285   }
286 
287   /**
288    * HBase2482 is about outstanding region openings.  If any are outstanding
289    * when a regionserver goes down, then they'll never deploy.  They'll be
   * stuck in the regions-in-transition list forever.  This listener looks
   * for a region opening HMsg and if it's from the server passed on construction,
292    * then we kill it.  It also looks out for a close message on the victim
293    * server because that signifies start of the fireworks.
294    */
295   /*
296   static class HBase2482Listener implements RegionServerOperationListener {
297     private final HRegionServer victim;
298     private boolean abortSent = false;
299     // We closed regions on new server.
300     private volatile boolean closed = false;
301     // Copy of regions on new server
302     private final Collection<HRegion> copyOfOnlineRegions;
303     // This is the region that was in transition on the server we aborted. Test
304     // passes if this region comes back online successfully.
305     private HRegionInfo regionToFind;
306 
307     HBase2482Listener(final HRegionServer victim) {
308       this.victim = victim;
309       // Copy regions currently open on this server so I can notice when
310       // there is a close.
311       this.copyOfOnlineRegions =
312         this.victim.getCopyOfOnlineRegionsSortedBySize().values();
313     }
314  
315     @Override
316     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
317       if (!victim.getServerInfo().equals(serverInfo) ||
318           this.abortSent || !this.closed) {
319         return true;
320       }
321       if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
322       // Save the region that is in transition so can test later it came back.
323       this.regionToFind = incomingMsg.getRegionInfo();
324       String msg = "ABORTING " + this.victim + " because got a " +
325         HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
326         incomingMsg.getRegionInfo().getRegionNameAsString();
327       this.victim.abort(msg);
328       this.abortSent = true;
329       return true;
330     }
331 
332     @Override
333     public boolean process(RegionServerOperation op) throws IOException {
334       return true;
335     }
336 
337     @Override
338     public void processed(RegionServerOperation op) {
339       if (this.closed || !(op instanceof ProcessRegionClose)) return;
340       ProcessRegionClose close = (ProcessRegionClose)op;
341       for (HRegion r: this.copyOfOnlineRegions) {
342         if (r.getRegionInfo().equals(close.regionInfo)) {
343           // We've closed one of the regions that was on the victim server.
344           // Now can start testing for when all regions are back online again
345           LOG.info("Found close of " +
346             r.getRegionInfo().getRegionNameAsString() +
347             "; setting close happened flag");
348           this.closed = true;
349           break;
350         }
351       }
352     }
353   }
354 */
355   /**
356    * In 2482, a RS with an opening region on it dies.  The said region is then
357    * stuck in the master's regions-in-transition and never leaves it.  This
358    * test works by bringing up a new regionserver, waiting for the load
359    * balancer to give it some regions.  Then, we close all on the new server.
360    * After sending all the close messages, we send the new regionserver the
361    * special blocking message so it can not process any more messages.
362    * Meantime reopening of the just-closed regions is backed up on the new
363    * server.  Soon as master gets an opening region from the new regionserver,
364    * we kill it.  We then wait on all regions to come back on line.  If bug
365    * is fixed, this should happen soon as the processing of the killed server is
366    * done.
367    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a> 
368    */
369   @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
370   throws Exception {
371     /*
372     LOG.info("Running testKillRSWithOpeningRegion2482");
373     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
374     if (cluster.getLiveRegionServerThreads().size() < 2) {
375       // Need at least two servers.
376       cluster.startRegionServer();
377     }
378     // Count how many regions are online.  They need to be all back online for
379     // this test to succeed.
380     int countOfMetaRegions = countOfMetaRegions();
381     // Add a listener on the server.
382     HMaster m = cluster.getMaster();
383     // Start new regionserver.
384     MiniHBaseClusterRegionServer hrs =
385       (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
386     LOG.info("Started new regionserver: " + hrs.toString());
387     // Wait until has some regions before proceeding.  Balancer will give it some.
388     int minimumRegions =
389       countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
390     while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
391     // Set the listener only after some regions have been opened on new server.
392     HBase2482Listener listener = new HBase2482Listener(hrs);
393     m.getRegionServerOperationQueue().
394       registerRegionServerOperationListener(listener);
395     try {
396       // Go close all non-catalog regions on this new server
397       closeAllNonCatalogRegions(cluster, hrs);
398       // After all closes, add blocking message before the region opens start to
399       // come in.
400       cluster.addMessageToSendRegionServer(hrs,
401         new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
402       // Wait till one of the above close messages has an effect before we start
403       // wait on all regions back online.
404       while (!listener.closed) Threads.sleep(100);
405       LOG.info("Past close");
406       // Make sure the abort server message was sent.
407       while(!listener.abortSent) Threads.sleep(100);
408       LOG.info("Past abort send; waiting on all regions to redeploy");
409       // Now wait for regions to come back online.
410       assertRegionIsBackOnline(listener.regionToFind);
411     } finally {
412       m.getRegionServerOperationQueue().
413         unregisterRegionServerOperationListener(listener);
414     }
415     */
416   }
417 
418   /*
419    * @return Count of all non-catalog regions on the designated server
420    */
421 /*
422   private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
423     final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
424   throws IOException {
425     int countOfRegions = 0;
426     for (HRegion r: hrs.getOnlineRegions()) {
427       if (r.getRegionInfo().isMetaRegion()) continue;
428       cluster.addMessageToSendRegionServer(hrs,
429         new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
430       LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
431         " on " + hrs.toString());
432       countOfRegions++;
433     }
434     return countOfRegions;
435   }
436 
437   private void assertRegionIsBackOnline(final HRegionInfo hri)
438   throws IOException {
    // Region should have an entry in its startkey because of addToEachStartKey.
440     byte [] row = getStartKey(hri);
441     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
442     Get g =  new Get(row);
443     assertTrue((t.get(g)).size() > 0);
444   }
445 
446   /*
447    * @return Count of regions in meta table.
448    * @throws IOException
449    */
450   /*
451   private static int countOfMetaRegions()
452   throws IOException {
453     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
454       HConstants.META_TABLE_NAME);
455     int rows = 0;
456     Scan scan = new Scan();
457     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
458     ResultScanner s = meta.getScanner(scan);
459     for (Result r = null; (r = s.next()) != null;) {
460       byte [] b =
461         r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
462       if (b == null || b.length <= 0) break;
463       rows++;
464     }
465     s.close();
466     return rows;
467   }
468 */
469   /*
470    * Add to each of the regions in .META. a value.  Key is the startrow of the
471    * region (except its 'aaa' for first region).  Actual value is the row name.
472    * @param expected
473    * @return
474    * @throws IOException
475    */
476   private static int addToEachStartKey(final int expected) throws IOException {
477     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
478     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
479         HConstants.META_TABLE_NAME);
480     int rows = 0;
481     Scan scan = new Scan();
482     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
483     ResultScanner s = meta.getScanner(scan);
484     for (Result r = null; (r = s.next()) != null;) {
485       byte [] b =
486         r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
487       if (b == null || b.length <= 0) break;
488       HRegionInfo hri = Writables.getHRegionInfo(b);
489       // If start key, add 'aaa'.
490       byte [] row = getStartKey(hri);
491       Put p = new Put(row);
492       p.add(getTestFamily(), getTestQualifier(), row);
493       t.put(p);
494       rows++;
495     }
496     s.close();
497     Assert.assertEquals(expected, rows);
498     return rows;
499   }
500 
501   /*
502    * @return Count of rows in TABLENAME
503    * @throws IOException
504    */
505   private static int count() throws IOException {
506     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
507     int rows = 0;
508     Scan scan = new Scan();
509     ResultScanner s = t.getScanner(scan);
510     for (Result r = null; (r = s.next()) != null;) {
511       rows++;
512     }
513     s.close();
514     LOG.info("Counted=" + rows);
515     return rows;
516   }
517 
518   /*
519    * @param hri
520    * @return Start key for hri (If start key is '', then return 'aaa'.
521    */
522   private static byte [] getStartKey(final HRegionInfo hri) {
523     return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
524         Bytes.toBytes("aaa"): hri.getStartKey();
525   }
526 
  /*
   * @return The column family the helpers in this class write to: the first
   * entry of FAMILIES.
   */
  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }
530 
  /*
   * @return The column qualifier the helpers in this class write to.  It is
   * the same byte array that getTestFamily() returns.
   */
  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
534 }