1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.master;
21  
22  import java.io.IOException;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.*;
27  import org.apache.hadoop.hbase.client.HTable;
28  import org.apache.hadoop.hbase.client.Put;
29  import org.apache.hadoop.hbase.client.Result;
30  import org.apache.hadoop.hbase.client.ResultScanner;
31  import org.apache.hadoop.hbase.client.Scan;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.apache.hadoop.hbase.util.Writables;
34  import org.junit.AfterClass;
35  import org.junit.Assert;
36  import org.junit.Before;
37  import org.junit.BeforeClass;
38  import org.junit.Ignore;
39  import org.junit.Test;
40  import org.junit.experimental.categories.Category;
41  
42  /**
43   * Test transitions of state across the master.  Sets up the cluster once and
44   * then runs a couple of tests.
45   */
46  @Category(LargeTests.class)
47  public class TestMasterTransitions {
48    private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
49    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
50    private static final String TABLENAME = "master_transitions";
51    private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
52      Bytes.toBytes("b"), Bytes.toBytes("c")};
53  
54    /**
55     * Start up a mini cluster and put a small table of many empty regions into it.
56     * @throws Exception
57     */
58    @BeforeClass public static void beforeAllTests() throws Exception {
59      TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
60      TEST_UTIL.startMiniCluster(2);
61      // Create a table of three families.  This will assign a region.
62      byte[] tableName = Bytes.toBytes(TABLENAME);
63      TEST_UTIL.createTable(tableName, FAMILIES);
64      HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
65      int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
66      TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
67      addToEachStartKey(countOfRegions);
68      t.close();
69    }
70  
71    @AfterClass public static void afterAllTests() throws Exception {
72      TEST_UTIL.shutdownMiniCluster();
73    }
74  
75    @Before public void setup() throws IOException {
76      TEST_UTIL.ensureSomeRegionServersAvailable(2);
77    }
78  
79    /**
80     * Listener for regionserver events testing hbase-2428 (Infinite loop of
81     * region closes if META region is offline).  In particular, listen
82     * for the close of the 'metaServer' and when it comes in, requeue it with a
83     * delay as though there were an issue processing the shutdown.  As part of
84     * the requeuing,  send over a close of a region on 'otherServer' so it comes
85     * into a master that has its meta region marked as offline.
86     */
87    /*
88    static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
90      private final Set<RegionServerOperation> postponed =
91        new CopyOnWriteArraySet<RegionServerOperation>();
92      private boolean done = false;;
93      private boolean metaShutdownReceived = false;
94      private final HServerAddress metaAddress;
95      private final MiniHBaseCluster cluster;
96      private final int otherServerIndex;
97      private final HRegionInfo hri;
98      private int closeCount = 0;
99      static final int SERVER_DURATION = 3 * 1000;
100     static final int CLOSE_DURATION = 1 * 1000;
101  
102     HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
103         final HRegionInfo closingHRI, final int otherServerIndex) {
104       this.cluster = c;
105       this.metaAddress = metaAddress;
106       this.hri = closingHRI;
107       this.otherServerIndex = otherServerIndex;
108     }
109 
110     @Override
111     public boolean process(final RegionServerOperation op) throws IOException {
112       // If a regionserver shutdown and its of the meta server, then we want to
113       // delay the processing of the shutdown and send off a close of a region on
114       // the 'otherServer.
115       boolean result = true;
116       if (op instanceof ProcessServerShutdown) {
117         ProcessServerShutdown pss = (ProcessServerShutdown)op;
118         if (pss.getDeadServerAddress().equals(this.metaAddress)) {
119           // Don't postpone more than once.
120           if (!this.postponed.contains(pss)) {
121             // Close some region.
122             this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
123               new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
124               Bytes.toBytes("Forcing close in test")));
125             this.postponed.add(pss);
126             // Put off the processing of the regionserver shutdown processing.
127             pss.setDelay(SERVER_DURATION);
128             this.metaShutdownReceived = true;
129             // Return false.  This will add this op to the delayed queue.
130             result = false;
131           }
132         }
133       } else {
134         // Have the close run frequently.
135         if (isWantedCloseOperation(op) != null) {
136           op.setDelay(CLOSE_DURATION);
137           // Count how many times it comes through here.
138           this.closeCount++;
139         }
140       }
141       return result;
142     }
143 
144     public void processed(final RegionServerOperation op) {
145       if (isWantedCloseOperation(op) != null) return;
146       this.done = true;
147     }
148 */
149     /*
150      * @param op
151      * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
152      * cast as a ProcessRegionClose.
153      */
154   /*
155     private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
156       // Count every time we get a close operation.
157       if (op instanceof ProcessRegionClose) {
158         ProcessRegionClose c = (ProcessRegionClose)op;
159         if (c.regionInfo.equals(hri)) {
160           return c;
161         }
162       }
163       return null;
164     }
165 
166     boolean isDone() {
167       return this.done;
168     }
169 
170     boolean isMetaShutdownReceived() {
171       return metaShutdownReceived;
172     }
173 
174     int getCloseCount() {
175       return this.closeCount;
176     }
177 
178     @Override
179     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
180       return true;
181     }
182   }
183 */
184   /**
185    * In 2428, the meta region has just been set offline and then a close comes
186    * in.
187    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a> 
188    */
189   @Ignore @Test  (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
190   throws Exception {
191     /*
192     LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
193     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
194     final HMaster master = cluster.getMaster();
195     int metaIndex = cluster.getServerWithMeta();
196     // Figure the index of the server that is not server the .META.
197     int otherServerIndex = -1;
198     for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
199       if (i == metaIndex) continue;
200       otherServerIndex = i;
201       break;
202     }
203     final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
204     final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
205 
206     // Get a region out on the otherServer.
207     final HRegionInfo hri =
208       otherServer.getOnlineRegions().iterator().next().getRegionInfo();
209  
210     // Add our RegionServerOperationsListener
211     HBase2428Listener listener = new HBase2428Listener(cluster,
212       metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
213     master.getRegionServerOperationQueue().
214       registerRegionServerOperationListener(listener);
215     try {
216       // Now close the server carrying meta.
217       cluster.abortRegionServer(metaIndex);
218 
219       // First wait on receipt of meta server shutdown message.
220       while(!listener.metaShutdownReceived) Threads.sleep(100);
221       while(!listener.isDone()) Threads.sleep(10);
222       // We should not have retried the close more times than it took for the
223       // server shutdown message to exit the delay queue and get processed
224       // (Multiple by two to add in some slop in case of GC or something).
225       assertTrue(listener.getCloseCount() > 1);
226       assertTrue(listener.getCloseCount() <
227         ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
228 
229       // Assert the closed region came back online
230       assertRegionIsBackOnline(hri);
231     } finally {
232       master.getRegionServerOperationQueue().
233         unregisterRegionServerOperationListener(listener);
234     }
235     */
236   }
237 
238   /**
239    * Test adding in a new server before old one on same host+port is dead.
240    * Make the test more onerous by having the server under test carry the meta.
241    * If confusion between old and new, purportedly meta never comes back.  Test
242    * that meta gets redeployed.
243    */
244   @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
245   throws IOException {
246     /*
247     LOG.info("Running testAddingServerBeforeOldIsDead2413");
248     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
249     int count = count();
250     int metaIndex = cluster.getServerWithMeta();
251     MiniHBaseClusterRegionServer metaHRS =
252       (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
253     int port = metaHRS.getServerInfo().getServerAddress().getPort();
254     Configuration c = TEST_UTIL.getConfiguration();
255     String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
256     try {
257       LOG.info("KILLED=" + metaHRS);
258       metaHRS.kill();
259       c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
260       // Try and start new regionserver.  It might clash with the old
261       // regionserver port so keep trying to get past the BindException.
262       HRegionServer hrs = null;
263       while (true) {
264         try {
265           hrs = cluster.startRegionServer().getRegionServer();
266           break;
267         } catch (IOException e) {
268           if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
269             InvocationTargetException ee = (InvocationTargetException)e.getCause();
270             if (ee.getCause() != null && ee.getCause() instanceof BindException) {
271               LOG.info("BindException; retrying: " + e.toString());
272             }
273           }
274         }
275       }
276       LOG.info("STARTED=" + hrs);
277       // Wait until he's been given at least 3 regions before we go on to try
278       // and count rows in table.
279       while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
280       LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
281         " regions");
282       assertEquals(count, count());
283     } finally {
284       c.set(HConstants.REGIONSERVER_PORT, oldPort);
285     }
286     */
287   }
288 
  /**
   * HBase2482 is about outstanding region openings.  If any are outstanding
   * when a regionserver goes down, then they'll never deploy.  They'll be
   * stuck in the regions-in-transition list forever.  This listener looks
   * for a region opening HMsg and, if it is from the server passed on
   * construction, aborts that server.  It also looks out for a close message
   * on the victim server because that signifies start of the fireworks.
   */
297   /*
298   static class HBase2482Listener implements RegionServerOperationListener {
299     private final HRegionServer victim;
300     private boolean abortSent = false;
301     // We closed regions on new server.
302     private volatile boolean closed = false;
303     // Copy of regions on new server
304     private final Collection<HRegion> copyOfOnlineRegions;
305     // This is the region that was in transition on the server we aborted. Test
306     // passes if this region comes back online successfully.
307     private HRegionInfo regionToFind;
308 
309     HBase2482Listener(final HRegionServer victim) {
310       this.victim = victim;
311       // Copy regions currently open on this server so I can notice when
312       // there is a close.
313       this.copyOfOnlineRegions =
314         this.victim.getCopyOfOnlineRegionsSortedBySize().values();
315     }
316  
317     @Override
318     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
319       if (!victim.getServerInfo().equals(serverInfo) ||
320           this.abortSent || !this.closed) {
321         return true;
322       }
323       if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
324       // Save the region that is in transition so can test later it came back.
325       this.regionToFind = incomingMsg.getRegionInfo();
326       String msg = "ABORTING " + this.victim + " because got a " +
327         HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
328         incomingMsg.getRegionInfo().getRegionNameAsString();
329       this.victim.abort(msg);
330       this.abortSent = true;
331       return true;
332     }
333 
334     @Override
335     public boolean process(RegionServerOperation op) throws IOException {
336       return true;
337     }
338 
339     @Override
340     public void processed(RegionServerOperation op) {
341       if (this.closed || !(op instanceof ProcessRegionClose)) return;
342       ProcessRegionClose close = (ProcessRegionClose)op;
343       for (HRegion r: this.copyOfOnlineRegions) {
344         if (r.getRegionInfo().equals(close.regionInfo)) {
345           // We've closed one of the regions that was on the victim server.
346           // Now can start testing for when all regions are back online again
347           LOG.info("Found close of " +
348             r.getRegionInfo().getRegionNameAsString() +
349             "; setting close happened flag");
350           this.closed = true;
351           break;
352         }
353       }
354     }
355   }
356 */
357   /**
358    * In 2482, a RS with an opening region on it dies.  The said region is then
359    * stuck in the master's regions-in-transition and never leaves it.  This
360    * test works by bringing up a new regionserver, waiting for the load
361    * balancer to give it some regions.  Then, we close all on the new server.
362    * After sending all the close messages, we send the new regionserver the
363    * special blocking message so it can not process any more messages.
364    * Meantime reopening of the just-closed regions is backed up on the new
365    * server.  Soon as master gets an opening region from the new regionserver,
366    * we kill it.  We then wait on all regions to come back on line.  If bug
367    * is fixed, this should happen soon as the processing of the killed server is
368    * done.
369    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a> 
370    */
371   @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
372   throws Exception {
373     /*
374     LOG.info("Running testKillRSWithOpeningRegion2482");
375     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
376     if (cluster.getLiveRegionServerThreads().size() < 2) {
377       // Need at least two servers.
378       cluster.startRegionServer();
379     }
380     // Count how many regions are online.  They need to be all back online for
381     // this test to succeed.
382     int countOfMetaRegions = countOfMetaRegions();
383     // Add a listener on the server.
384     HMaster m = cluster.getMaster();
385     // Start new regionserver.
386     MiniHBaseClusterRegionServer hrs =
387       (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
388     LOG.info("Started new regionserver: " + hrs.toString());
389     // Wait until has some regions before proceeding.  Balancer will give it some.
390     int minimumRegions =
391       countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
392     while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
393     // Set the listener only after some regions have been opened on new server.
394     HBase2482Listener listener = new HBase2482Listener(hrs);
395     m.getRegionServerOperationQueue().
396       registerRegionServerOperationListener(listener);
397     try {
398       // Go close all non-catalog regions on this new server
399       closeAllNonCatalogRegions(cluster, hrs);
400       // After all closes, add blocking message before the region opens start to
401       // come in.
402       cluster.addMessageToSendRegionServer(hrs,
403         new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
404       // Wait till one of the above close messages has an effect before we start
405       // wait on all regions back online.
406       while (!listener.closed) Threads.sleep(100);
407       LOG.info("Past close");
408       // Make sure the abort server message was sent.
409       while(!listener.abortSent) Threads.sleep(100);
410       LOG.info("Past abort send; waiting on all regions to redeploy");
411       // Now wait for regions to come back online.
412       assertRegionIsBackOnline(listener.regionToFind);
413     } finally {
414       m.getRegionServerOperationQueue().
415         unregisterRegionServerOperationListener(listener);
416     }
417     */
418   }
419 
420   /*
421    * @return Count of all non-catalog regions on the designated server
422    */
423 /*
424   private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
425     final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
426   throws IOException {
427     int countOfRegions = 0;
428     for (HRegion r: hrs.getOnlineRegions()) {
429       if (r.getRegionInfo().isMetaRegion()) continue;
430       cluster.addMessageToSendRegionServer(hrs,
431         new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
432       LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
433         " on " + hrs.toString());
434       countOfRegions++;
435     }
436     return countOfRegions;
437   }
438 
439   private void assertRegionIsBackOnline(final HRegionInfo hri)
440   throws IOException {
    // Region should have an entry in its startkey because of addToEachStartKey.
442     byte [] row = getStartKey(hri);
443     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
444     Get g =  new Get(row);
445     assertTrue((t.get(g)).size() > 0);
446   }
447 
448   /*
449    * @return Count of regions in meta table.
450    * @throws IOException
451    */
452   /*
453   private static int countOfMetaRegions()
454   throws IOException {
455     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
456       HConstants.META_TABLE_NAME);
457     int rows = 0;
458     Scan scan = new Scan();
459     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
460     ResultScanner s = meta.getScanner(scan);
461     for (Result r = null; (r = s.next()) != null;) {
462       byte [] b =
463         r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
464       if (b == null || b.length <= 0) break;
465       rows++;
466     }
467     s.close();
468     return rows;
469   }
470 */
471   /*
472    * Add to each of the regions in .META. a value.  Key is the startrow of the
473    * region (except its 'aaa' for first region).  Actual value is the row name.
474    * @param expected
475    * @return
476    * @throws IOException
477    */
478   private static int addToEachStartKey(final int expected) throws IOException {
479     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
480     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
481         HConstants.META_TABLE_NAME);
482     int rows = 0;
483     Scan scan = new Scan();
484     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
485     ResultScanner s = meta.getScanner(scan);
486     for (Result r = null; (r = s.next()) != null;) {
487       byte [] b =
488         r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
489       if (b == null || b.length <= 0) break;
490       HRegionInfo hri = Writables.getHRegionInfo(b);
491       // If start key, add 'aaa'.
492       byte [] row = getStartKey(hri);
493       Put p = new Put(row);
494       p.setWriteToWAL(false);
495       p.add(getTestFamily(), getTestQualifier(), row);
496       t.put(p);
497       rows++;
498     }
499     s.close();
500     Assert.assertEquals(expected, rows);
501     t.close();
502     meta.close();
503     return rows;
504   }
505 
506   /*
507    * @return Count of rows in TABLENAME
508    * @throws IOException
509    */
510   private static int count() throws IOException {
511     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
512     int rows = 0;
513     Scan scan = new Scan();
514     ResultScanner s = t.getScanner(scan);
515     for (Result r = null; (r = s.next()) != null;) {
516       rows++;
517     }
518     s.close();
519     LOG.info("Counted=" + rows);
520     t.close();
521     return rows;
522   }
523 
524   /*
525    * @param hri
526    * @return Start key for hri (If start key is '', then return 'aaa'.
527    */
528   private static byte [] getStartKey(final HRegionInfo hri) {
529     return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
530         Bytes.toBytes("aaa"): hri.getStartKey();
531   }
532 
533   private static byte [] getTestFamily() {
534     return FAMILIES[0];
535   }
536 
537   private static byte [] getTestQualifier() {
538     return getTestFamily();
539   }
540 
541   @org.junit.Rule
542   public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
543     new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
544 }
545