/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Writables;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Test transitions of state across the master.  Sets up the cluster once and
 * then runs a couple of tests.
 *
 * <p>All three test methods in this class are currently marked
 * {@code @Ignore} and have their bodies commented out, together with the
 * listener classes and helpers they relied on.  Only the cluster
 * setup/teardown and the table-population helpers are live code.
 */
@Category(LargeTests.class)
public class TestMasterTransitions {
  private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final String TABLENAME = "master_transitions";
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
    Bytes.toBytes("b"), Bytes.toBytes("c")};

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   * A row is then written at the start key of every region (see
   * {@link #addToEachStartKey(int)}).
   * @throws Exception if the cluster cannot be started or the table cannot be
   * created and populated
   */
  @BeforeClass public static void beforeAllTests() throws Exception {
    TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families.  This will assign a region.
    byte[] tableName = Bytes.toBytes(TABLENAME);
    TEST_UTIL.createTable(tableName, FAMILIES);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    try {
      int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
      TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
      addToEachStartKey(countOfRegions);
    } finally {
      // Release the table even when region creation or population fails.
      t.close();
    }
  }

  /**
   * Shut down the mini cluster started in {@link #beforeAllTests()}.
   * @throws Exception if shutdown fails
   */
  @AfterClass public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Runs before each test; asks the testing utility to make sure two region
   * servers are available.
   * @throws IOException if the testing utility fails
   */
  @Before public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of
   * region closes if META region is offline).  In particular, listen
   * for the close of the 'metaServer' and when it comes in, requeue it with a
   * delay as though there were an issue processing the shutdown.  As part of
   * the requeuing,  send over a close of a region on 'otherServer' so it comes
   * into a master that has its meta region marked as offline.
   */
  /*
  static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
    private final Set<RegionServerOperation> postponed =
      new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;
    private boolean metaShutdownReceived = false;
    private final HServerAddress metaAddress;
    private final MiniHBaseCluster cluster;
    private final int otherServerIndex;
    private final HRegionInfo hri;
    private int closeCount = 0;
    static final int SERVER_DURATION = 3 * 1000;
    static final int CLOSE_DURATION = 1 * 1000;

    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
        final HRegionInfo closingHRI, final int otherServerIndex) {
      this.cluster = c;
      this.metaAddress = metaAddress;
      this.hri = closingHRI;
      this.otherServerIndex = otherServerIndex;
    }

    @Override
    public boolean process(final RegionServerOperation op) throws IOException {
      // If a regionserver shutdown and its of the meta server, then we want to
      // delay the processing of the shutdown and send off a close of a region on
      // the 'otherServer.
      boolean result = true;
      if (op instanceof ProcessServerShutdown) {
        ProcessServerShutdown pss = (ProcessServerShutdown)op;
        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
          // Don't postpone more than once.
          if (!this.postponed.contains(pss)) {
            // Close some region.
            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
              Bytes.toBytes("Forcing close in test")));
            this.postponed.add(pss);
            // Put off the processing of the regionserver shutdown processing.
            pss.setDelay(SERVER_DURATION);
            this.metaShutdownReceived = true;
            // Return false.  This will add this op to the delayed queue.
            result = false;
          }
        }
      } else {
        // Have the close run frequently.
        if (isWantedCloseOperation(op) != null) {
          op.setDelay(CLOSE_DURATION);
          // Count how many times it comes through here.
          this.closeCount++;
        }
      }
      return result;
    }

    public void processed(final RegionServerOperation op) {
      if (isWantedCloseOperation(op) != null) return;
      this.done = true;
    }
*/
    /*
     * @param op
     * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
     * cast as a ProcessRegionClose.
     */
  /*
    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
      // Count every time we get a close operation.
      if (op instanceof ProcessRegionClose) {
        ProcessRegionClose c = (ProcessRegionClose)op;
        if (c.regionInfo.equals(hri)) {
          return c;
        }
      }
      return null;
    }

    boolean isDone() {
      return this.done;
    }

    boolean isMetaShutdownReceived() {
      return metaShutdownReceived;
    }

    int getCloseCount() {
      return this.closeCount;
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      return true;
    }
  }
*/
  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   *
   * <p>Currently disabled: the method is {@code @Ignore}d and its body is
   * commented out, so it does nothing.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    /*
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not serving the .META.
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final HRegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while(!listener.metaShutdownReceived) Threads.sleep(100);
      while(!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (Multiply by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /**
   * Test adding in a new server before old one on same host+port is dead.
   * Make the test more onerous by having the server under test carry the meta.
   * If confusion between old and new, purportedly meta never comes back.  Test
   * that meta gets redeployed.
   *
   * <p>Currently disabled: the method is {@code @Ignore}d and its body is
   * commented out, so it does nothing.
   */
  @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    /*
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start new regionserver.  It might clash with the old
      // regionserver port so keep trying to get past the BindException.
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until he's been given at least 3 regions before we go on to try
      // and count rows in table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
    */
  }

  /**
   * HBase2482 is about outstanding region openings.  If any are outstanding
   * when a regionserver goes down, then they'll never deploy.  They'll be
   * stuck in the regions-in-transition list for ever.  This listener looks
   * for a region opening HMsg and if its from the server passed on construction,
   * then we kill it.  It also looks out for a close message on the victim
   * server because that signifies start of the fireworks.
   */
  /*
  static class HBase2482Listener implements RegionServerOperationListener {
    private final HRegionServer victim;
    private boolean abortSent = false;
    // We closed regions on new server.
    private volatile boolean closed = false;
    // Copy of regions on new server
    private final Collection<HRegion> copyOfOnlineRegions;
    // This is the region that was in transition on the server we aborted. Test
    // passes if this region comes back online successfully.
    private HRegionInfo regionToFind;

    HBase2482Listener(final HRegionServer victim) {
      this.victim = victim;
      // Copy regions currently open on this server so I can notice when
      // there is a close.
      this.copyOfOnlineRegions =
        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      if (!victim.getServerInfo().equals(serverInfo) ||
          this.abortSent || !this.closed) {
        return true;
      }
      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
      // Save the region that is in transition so can test later it came back.
      this.regionToFind = incomingMsg.getRegionInfo();
      String msg = "ABORTING " + this.victim + " because got a " +
        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
        incomingMsg.getRegionInfo().getRegionNameAsString();
      this.victim.abort(msg);
      this.abortSent = true;
      return true;
    }

    @Override
    public boolean process(RegionServerOperation op) throws IOException {
      return true;
    }

    @Override
    public void processed(RegionServerOperation op) {
      if (this.closed || !(op instanceof ProcessRegionClose)) return;
      ProcessRegionClose close = (ProcessRegionClose)op;
      for (HRegion r: this.copyOfOnlineRegions) {
        if (r.getRegionInfo().equals(close.regionInfo)) {
          // We've closed one of the regions that was on the victim server.
          // Now can start testing for when all regions are back online again
          LOG.info("Found close of " +
            r.getRegionInfo().getRegionNameAsString() +
            "; setting close happened flag");
          this.closed = true;
          break;
        }
      }
    }
  }
*/
  /**
   * In 2482, a RS with an opening region on it dies.  The said region is then
   * stuck in the master's regions-in-transition and never leaves it.  This
   * test works by bringing up a new regionserver, waiting for the load
   * balancer to give it some regions.  Then, we close all on the new server.
   * After sending all the close messages, we send the new regionserver the
   * special blocking message so it can not process any more messages.
   * Meantime reopening of the just-closed regions is backed up on the new
   * server.  Soon as master gets an opening region from the new regionserver,
   * we kill it.  We then wait on all regions to come back on line.  If bug
   * is fixed, this should happen soon as the processing of the killed server is
   * done.
   *
   * <p>Currently disabled: the method is {@code @Ignore}d and its body is
   * commented out, so it does nothing.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
  throws Exception {
    /*
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online.  They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until has some regions before proceeding.  Balancer will give it some.
    int minimumRegions =
      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on new server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add blocking message before the region opens start to
      // come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we start
      // wait on all regions back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while(!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /*
   * @return Count of all non-catalog regions on the designated server
   */
/*
  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
    final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
  throws IOException {
    int countOfRegions = 0;
    for (HRegion r: hrs.getOnlineRegions()) {
      if (r.getRegionInfo().isMetaRegion()) continue;
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
        " on " + hrs.toString());
      countOfRegions++;
    }
    return countOfRegions;
  }

  private void assertRegionIsBackOnline(final HRegionInfo hri)
  throws IOException {
    // Region should have an entry in its startkey because of addRowToEachRegion.
    byte [] row = getStartKey(hri);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Get g =  new Get(row);
    assertTrue((t.get(g)).size() > 0);
  }
*/
  /*
   * @return Count of regions in meta table.
   * @throws IOException
   */
/*
  private static int countOfMetaRegions()
  throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
      if (b == null || b.length <= 0) break;
      rows++;
    }
    s.close();
    return rows;
  }
*/
  /*
   * For each region listed in .META., write one row into TABLENAME.  The row
   * key is the start key of the region (except it is 'aaa' for the first
   * region, whose start key is empty).  The cell value is the row key itself.
   * Scanning stops at the first .META. row that has no region info cell.
   * The scanner and both tables are closed even when a read, a write or the
   * row-count assertion fails.
   * @param expected Number of rows that must have been written; asserted.
   * @return Count of rows written.
   * @throws IOException
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    int rows = 0;
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    try {
      HTable meta = new HTable(TEST_UTIL.getConfiguration(),
        HConstants.META_TABLE_NAME);
      try {
        Scan scan = new Scan();
        scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
        ResultScanner s = meta.getScanner(scan);
        try {
          for (Result r = null; (r = s.next()) != null;) {
            byte [] b =
              r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
            if (b == null || b.length <= 0) break;
            HRegionInfo hri = Writables.getHRegionInfo(b);
            // If start key, add 'aaa'.
            byte [] row = getStartKey(hri);
            Put p = new Put(row);
            p.setWriteToWAL(false);
            p.add(getTestFamily(), getTestQualifier(), row);
            t.put(p);
            rows++;
          }
        } finally {
          s.close();
        }
        // Assert inside the try blocks so a failed assertion still closes the tables.
        Assert.assertEquals(expected, rows);
      } finally {
        meta.close();
      }
    } finally {
      t.close();
    }
    return rows;
  }

  /*
   * Counts rows with a full scan.  Only referenced from the commented-out
   * test bodies at present.
   * @return Count of rows in TABLENAME
   * @throws IOException
   */
  private static int count() throws IOException {
    int rows = 0;
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    try {
      ResultScanner s = t.getScanner(new Scan());
      try {
        while (s.next() != null) {
          rows++;
        }
      } finally {
        s.close();
      }
      LOG.info("Counted=" + rows);
    } finally {
      t.close();
    }
    return rows;
  }

  /*
   * @param hri
   * @return Start key for hri (if start key is '', then return 'aaa').
   */
  private static byte [] getStartKey(final HRegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
        Bytes.toBytes("aaa"): hri.getStartKey();
  }

  /*
   * @return The first of FAMILIES; the family the setup rows are written to.
   */
  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }

  /*
   * @return Qualifier used for the setup rows; same bytes as the test family.
   */
  private static byte [] getTestQualifier() {
    return getTestFamily();
  }

  @org.junit.Rule
  public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
    new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
}