1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.master;
21  
22  import java.io.IOException;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.*;
27  import org.apache.hadoop.hbase.client.HTable;
28  import org.apache.hadoop.hbase.client.Put;
29  import org.apache.hadoop.hbase.client.Result;
30  import org.apache.hadoop.hbase.client.ResultScanner;
31  import org.apache.hadoop.hbase.client.Scan;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.apache.hadoop.hbase.util.Writables;
34  import org.junit.AfterClass;
35  import org.junit.Assert;
36  import org.junit.Before;
37  import org.junit.BeforeClass;
38  import org.junit.Ignore;
39  import org.junit.Test;
40  import org.junit.experimental.categories.Category;
41  
42  /**
43   * Test transitions of state across the master.  Sets up the cluster once and
44   * then runs a couple of tests.
45   */
46  @Category(LargeTests.class)
47  public class TestMasterTransitions {
48    private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
49    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
50    private static final String TABLENAME = "master_transitions";
51    private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
52      Bytes.toBytes("b"), Bytes.toBytes("c")};
53  
54    /**
55     * Start up a mini cluster and put a small table of many empty regions into it.
56     * @throws Exception
57     */
58    @BeforeClass public static void beforeAllTests() throws Exception {
59      TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
60      TEST_UTIL.startMiniCluster(2);
61      // Create a table of three families.  This will assign a region.
62      TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES);
63      HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
64      int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
65      TEST_UTIL.waitUntilAllRegionsAssigned(countOfRegions);
66      addToEachStartKey(countOfRegions);
67      t.close();
68    }
69  
70    @AfterClass public static void afterAllTests() throws Exception {
71      TEST_UTIL.shutdownMiniCluster();
72    }
73  
74    @Before public void setup() throws IOException {
75      TEST_UTIL.ensureSomeRegionServersAvailable(2);
76    }
77  
78    /**
79     * Listener for regionserver events testing hbase-2428 (Infinite loop of
80     * region closes if META region is offline).  In particular, listen
81     * for the close of the 'metaServer' and when it comes in, requeue it with a
82     * delay as though there were an issue processing the shutdown.  As part of
83     * the requeuing,  send over a close of a region on 'otherServer' so it comes
84     * into a master that has its meta region marked as offline.
85     */
86    /*
87    static class HBase2428Listener implements RegionServerOperationListener {
88      // Map of what we've delayed so we don't do do repeated delays.
89      private final Set<RegionServerOperation> postponed =
90        new CopyOnWriteArraySet<RegionServerOperation>();
91      private boolean done = false;;
92      private boolean metaShutdownReceived = false;
93      private final HServerAddress metaAddress;
94      private final MiniHBaseCluster cluster;
95      private final int otherServerIndex;
96      private final HRegionInfo hri;
97      private int closeCount = 0;
98      static final int SERVER_DURATION = 3 * 1000;
99      static final int CLOSE_DURATION = 1 * 1000;
100  
101     HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
102         final HRegionInfo closingHRI, final int otherServerIndex) {
103       this.cluster = c;
104       this.metaAddress = metaAddress;
105       this.hri = closingHRI;
106       this.otherServerIndex = otherServerIndex;
107     }
108 
109     @Override
110     public boolean process(final RegionServerOperation op) throws IOException {
111       // If a regionserver shutdown and its of the meta server, then we want to
112       // delay the processing of the shutdown and send off a close of a region on
113       // the 'otherServer.
114       boolean result = true;
115       if (op instanceof ProcessServerShutdown) {
116         ProcessServerShutdown pss = (ProcessServerShutdown)op;
117         if (pss.getDeadServerAddress().equals(this.metaAddress)) {
118           // Don't postpone more than once.
119           if (!this.postponed.contains(pss)) {
120             // Close some region.
121             this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
122               new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
123               Bytes.toBytes("Forcing close in test")));
124             this.postponed.add(pss);
125             // Put off the processing of the regionserver shutdown processing.
126             pss.setDelay(SERVER_DURATION);
127             this.metaShutdownReceived = true;
128             // Return false.  This will add this op to the delayed queue.
129             result = false;
130           }
131         }
132       } else {
133         // Have the close run frequently.
134         if (isWantedCloseOperation(op) != null) {
135           op.setDelay(CLOSE_DURATION);
136           // Count how many times it comes through here.
137           this.closeCount++;
138         }
139       }
140       return result;
141     }
142 
143     public void processed(final RegionServerOperation op) {
144       if (isWantedCloseOperation(op) != null) return;
145       this.done = true;
146     }
147 */
148     /*
149      * @param op
150      * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
151      * cast as a ProcessRegionClose.
152      */
153   /*
154     private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
155       // Count every time we get a close operation.
156       if (op instanceof ProcessRegionClose) {
157         ProcessRegionClose c = (ProcessRegionClose)op;
158         if (c.regionInfo.equals(hri)) {
159           return c;
160         }
161       }
162       return null;
163     }
164 
165     boolean isDone() {
166       return this.done;
167     }
168 
169     boolean isMetaShutdownReceived() {
170       return metaShutdownReceived;
171     }
172 
173     int getCloseCount() {
174       return this.closeCount;
175     }
176 
177     @Override
178     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
179       return true;
180     }
181   }
182 */
183   /**
184    * In 2428, the meta region has just been set offline and then a close comes
185    * in.
186    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a> 
187    */
188   @Ignore @Test  (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
189   throws Exception {
190     /*
191     LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
192     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
193     final HMaster master = cluster.getMaster();
194     int metaIndex = cluster.getServerWithMeta();
195     // Figure the index of the server that is not server the .META.
196     int otherServerIndex = -1;
197     for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
198       if (i == metaIndex) continue;
199       otherServerIndex = i;
200       break;
201     }
202     final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
203     final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
204 
205     // Get a region out on the otherServer.
206     final HRegionInfo hri =
207       otherServer.getOnlineRegions().iterator().next().getRegionInfo();
208  
209     // Add our RegionServerOperationsListener
210     HBase2428Listener listener = new HBase2428Listener(cluster,
211       metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
212     master.getRegionServerOperationQueue().
213       registerRegionServerOperationListener(listener);
214     try {
215       // Now close the server carrying meta.
216       cluster.abortRegionServer(metaIndex);
217 
218       // First wait on receipt of meta server shutdown message.
219       while(!listener.metaShutdownReceived) Threads.sleep(100);
220       while(!listener.isDone()) Threads.sleep(10);
221       // We should not have retried the close more times than it took for the
222       // server shutdown message to exit the delay queue and get processed
223       // (Multiple by two to add in some slop in case of GC or something).
224       assertTrue(listener.getCloseCount() > 1);
225       assertTrue(listener.getCloseCount() <
226         ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
227 
228       // Assert the closed region came back online
229       assertRegionIsBackOnline(hri);
230     } finally {
231       master.getRegionServerOperationQueue().
232         unregisterRegionServerOperationListener(listener);
233     }
234     */
235   }
236 
237   /**
238    * Test adding in a new server before old one on same host+port is dead.
239    * Make the test more onerous by having the server under test carry the meta.
240    * If confusion between old and new, purportedly meta never comes back.  Test
241    * that meta gets redeployed.
242    */
243   @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
244   throws IOException {
245     /*
246     LOG.info("Running testAddingServerBeforeOldIsDead2413");
247     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
248     int count = count();
249     int metaIndex = cluster.getServerWithMeta();
250     MiniHBaseClusterRegionServer metaHRS =
251       (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
252     int port = metaHRS.getServerInfo().getServerAddress().getPort();
253     Configuration c = TEST_UTIL.getConfiguration();
254     String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
255     try {
256       LOG.info("KILLED=" + metaHRS);
257       metaHRS.kill();
258       c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
259       // Try and start new regionserver.  It might clash with the old
260       // regionserver port so keep trying to get past the BindException.
261       HRegionServer hrs = null;
262       while (true) {
263         try {
264           hrs = cluster.startRegionServer().getRegionServer();
265           break;
266         } catch (IOException e) {
267           if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
268             InvocationTargetException ee = (InvocationTargetException)e.getCause();
269             if (ee.getCause() != null && ee.getCause() instanceof BindException) {
270               LOG.info("BindException; retrying: " + e.toString());
271             }
272           }
273         }
274       }
275       LOG.info("STARTED=" + hrs);
276       // Wait until he's been given at least 3 regions before we go on to try
277       // and count rows in table.
278       while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
279       LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
280         " regions");
281       assertEquals(count, count());
282     } finally {
283       c.set(HConstants.REGIONSERVER_PORT, oldPort);
284     }
285     */
286   }
287 
288   /**
289    * HBase2482 is about outstanding region openings.  If any are outstanding
290    * when a regionserver goes down, then they'll never deploy.  They'll be
291    * stuck in the regions-in-transition list for ever.  This listener looks
292    * for a region opening HMsg and if its from the server passed on construction,
293    * then we kill it.  It also looks out for a close message on the victim
294    * server because that signifies start of the fireworks.
295    */
296   /*
297   static class HBase2482Listener implements RegionServerOperationListener {
298     private final HRegionServer victim;
299     private boolean abortSent = false;
300     // We closed regions on new server.
301     private volatile boolean closed = false;
302     // Copy of regions on new server
303     private final Collection<HRegion> copyOfOnlineRegions;
304     // This is the region that was in transition on the server we aborted. Test
305     // passes if this region comes back online successfully.
306     private HRegionInfo regionToFind;
307 
308     HBase2482Listener(final HRegionServer victim) {
309       this.victim = victim;
310       // Copy regions currently open on this server so I can notice when
311       // there is a close.
312       this.copyOfOnlineRegions =
313         this.victim.getCopyOfOnlineRegionsSortedBySize().values();
314     }
315  
316     @Override
317     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
318       if (!victim.getServerInfo().equals(serverInfo) ||
319           this.abortSent || !this.closed) {
320         return true;
321       }
322       if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
323       // Save the region that is in transition so can test later it came back.
324       this.regionToFind = incomingMsg.getRegionInfo();
325       String msg = "ABORTING " + this.victim + " because got a " +
326         HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
327         incomingMsg.getRegionInfo().getRegionNameAsString();
328       this.victim.abort(msg);
329       this.abortSent = true;
330       return true;
331     }
332 
333     @Override
334     public boolean process(RegionServerOperation op) throws IOException {
335       return true;
336     }
337 
338     @Override
339     public void processed(RegionServerOperation op) {
340       if (this.closed || !(op instanceof ProcessRegionClose)) return;
341       ProcessRegionClose close = (ProcessRegionClose)op;
342       for (HRegion r: this.copyOfOnlineRegions) {
343         if (r.getRegionInfo().equals(close.regionInfo)) {
344           // We've closed one of the regions that was on the victim server.
345           // Now can start testing for when all regions are back online again
346           LOG.info("Found close of " +
347             r.getRegionInfo().getRegionNameAsString() +
348             "; setting close happened flag");
349           this.closed = true;
350           break;
351         }
352       }
353     }
354   }
355 */
356   /**
357    * In 2482, a RS with an opening region on it dies.  The said region is then
358    * stuck in the master's regions-in-transition and never leaves it.  This
359    * test works by bringing up a new regionserver, waiting for the load
360    * balancer to give it some regions.  Then, we close all on the new server.
361    * After sending all the close messages, we send the new regionserver the
362    * special blocking message so it can not process any more messages.
363    * Meantime reopening of the just-closed regions is backed up on the new
364    * server.  Soon as master gets an opening region from the new regionserver,
365    * we kill it.  We then wait on all regions to come back on line.  If bug
366    * is fixed, this should happen soon as the processing of the killed server is
367    * done.
368    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a> 
369    */
370   @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
371   throws Exception {
372     /*
373     LOG.info("Running testKillRSWithOpeningRegion2482");
374     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
375     if (cluster.getLiveRegionServerThreads().size() < 2) {
376       // Need at least two servers.
377       cluster.startRegionServer();
378     }
379     // Count how many regions are online.  They need to be all back online for
380     // this test to succeed.
381     int countOfMetaRegions = countOfMetaRegions();
382     // Add a listener on the server.
383     HMaster m = cluster.getMaster();
384     // Start new regionserver.
385     MiniHBaseClusterRegionServer hrs =
386       (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
387     LOG.info("Started new regionserver: " + hrs.toString());
388     // Wait until has some regions before proceeding.  Balancer will give it some.
389     int minimumRegions =
390       countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
391     while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
392     // Set the listener only after some regions have been opened on new server.
393     HBase2482Listener listener = new HBase2482Listener(hrs);
394     m.getRegionServerOperationQueue().
395       registerRegionServerOperationListener(listener);
396     try {
397       // Go close all non-catalog regions on this new server
398       closeAllNonCatalogRegions(cluster, hrs);
399       // After all closes, add blocking message before the region opens start to
400       // come in.
401       cluster.addMessageToSendRegionServer(hrs,
402         new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
403       // Wait till one of the above close messages has an effect before we start
404       // wait on all regions back online.
405       while (!listener.closed) Threads.sleep(100);
406       LOG.info("Past close");
407       // Make sure the abort server message was sent.
408       while(!listener.abortSent) Threads.sleep(100);
409       LOG.info("Past abort send; waiting on all regions to redeploy");
410       // Now wait for regions to come back online.
411       assertRegionIsBackOnline(listener.regionToFind);
412     } finally {
413       m.getRegionServerOperationQueue().
414         unregisterRegionServerOperationListener(listener);
415     }
416     */
417   }
418 
419   /*
420    * @return Count of all non-catalog regions on the designated server
421    */
422 /*
423   private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
424     final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
425   throws IOException {
426     int countOfRegions = 0;
427     for (HRegion r: hrs.getOnlineRegions()) {
428       if (r.getRegionInfo().isMetaRegion()) continue;
429       cluster.addMessageToSendRegionServer(hrs,
430         new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
431       LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
432         " on " + hrs.toString());
433       countOfRegions++;
434     }
435     return countOfRegions;
436   }
437 
438   private void assertRegionIsBackOnline(final HRegionInfo hri)
439   throws IOException {
440     // Region should have an entry in its startkey because of addRowToEachRegion.
441     byte [] row = getStartKey(hri);
442     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
443     Get g =  new Get(row);
444     assertTrue((t.get(g)).size() > 0);
445   }
446 
447   /*
448    * @return Count of regions in meta table.
449    * @throws IOException
450    */
451   /*
452   private static int countOfMetaRegions()
453   throws IOException {
454     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
455       HConstants.META_TABLE_NAME);
456     int rows = 0;
457     Scan scan = new Scan();
458     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
459     ResultScanner s = meta.getScanner(scan);
460     for (Result r = null; (r = s.next()) != null;) {
461       byte [] b =
462         r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
463       if (b == null || b.length <= 0) break;
464       rows++;
465     }
466     s.close();
467     return rows;
468   }
469 */
470   /*
471    * Add to each of the regions in .META. a value.  Key is the startrow of the
472    * region (except its 'aaa' for first region).  Actual value is the row name.
473    * @param expected
474    * @return
475    * @throws IOException
476    */
477   private static int addToEachStartKey(final int expected) throws IOException {
478     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
479     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
480         HConstants.META_TABLE_NAME);
481     int rows = 0;
482     Scan scan = new Scan();
483     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
484     ResultScanner s = meta.getScanner(scan);
485     for (Result r = null; (r = s.next()) != null;) {
486       byte [] b =
487         r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
488       if (b == null || b.length <= 0) break;
489       HRegionInfo hri = Writables.getHRegionInfo(b);
490       // If start key, add 'aaa'.
491       byte [] row = getStartKey(hri);
492       Put p = new Put(row);
493       p.setWriteToWAL(false);
494       p.add(getTestFamily(), getTestQualifier(), row);
495       t.put(p);
496       rows++;
497     }
498     s.close();
499     Assert.assertEquals(expected, rows);
500     t.close();
501     meta.close();
502     return rows;
503   }
504 
505   /*
506    * @return Count of rows in TABLENAME
507    * @throws IOException
508    */
509   private static int count() throws IOException {
510     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
511     int rows = 0;
512     Scan scan = new Scan();
513     ResultScanner s = t.getScanner(scan);
514     for (Result r = null; (r = s.next()) != null;) {
515       rows++;
516     }
517     s.close();
518     LOG.info("Counted=" + rows);
519     t.close();
520     return rows;
521   }
522 
523   /*
524    * @param hri
525    * @return Start key for hri (If start key is '', then return 'aaa'.
526    */
527   private static byte [] getStartKey(final HRegionInfo hri) {
528     return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
529         Bytes.toBytes("aaa"): hri.getStartKey();
530   }
531 
532   private static byte [] getTestFamily() {
533     return FAMILIES[0];
534   }
535 
536   private static byte [] getTestQualifier() {
537     return getTestFamily();
538   }
539 
540   @org.junit.Rule
541   public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
542     new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
543 }
544