1 /** 2 * Copyright 2010 The Apache Software Foundation 3 * 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 package org.apache.hadoop.hbase.master; 21 22 import java.io.IOException; 23 24 import org.apache.commons.logging.Log; 25 import org.apache.commons.logging.LogFactory; 26 import org.apache.hadoop.hbase.*; 27 import org.apache.hadoop.hbase.client.HTable; 28 import org.apache.hadoop.hbase.client.Put; 29 import org.apache.hadoop.hbase.client.Result; 30 import org.apache.hadoop.hbase.client.ResultScanner; 31 import org.apache.hadoop.hbase.client.Scan; 32 import org.apache.hadoop.hbase.util.Bytes; 33 import org.apache.hadoop.hbase.util.Writables; 34 import org.junit.AfterClass; 35 import org.junit.Assert; 36 import org.junit.Before; 37 import org.junit.BeforeClass; 38 import org.junit.Ignore; 39 import org.junit.Test; 40 import org.junit.experimental.categories.Category; 41 42 /** 43 * Test transitions of state across the master. Sets up the cluster once and 44 * then runs a couple of tests. 45 */ 46 @Category(LargeTests.class) 47 public class TestMasterTransitions { 48 private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class); 49 private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 50 private static final String TABLENAME = "master_transitions"; 51 private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"), 52 Bytes.toBytes("b"), Bytes.toBytes("c")}; 53 54 /** 55 * Start up a mini cluster and put a small table of many empty regions into it. 56 * @throws Exception 57 */ 58 @BeforeClass public static void beforeAllTests() throws Exception { 59 TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true); 60 TEST_UTIL.startMiniCluster(2); 61 // Create a table of three families. This will assign a region. 62 TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES); 63 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); 64 int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily()); 65 TEST_UTIL.waitUntilAllRegionsAssigned(countOfRegions); 66 addToEachStartKey(countOfRegions); 67 t.close(); 68 } 69 70 @AfterClass public static void afterAllTests() throws Exception { 71 TEST_UTIL.shutdownMiniCluster(); 72 } 73 74 @Before public void setup() throws IOException { 75 TEST_UTIL.ensureSomeRegionServersAvailable(2); 76 } 77 78 /** 79 * Listener for regionserver events testing hbase-2428 (Infinite loop of 80 * region closes if META region is offline). In particular, listen 81 * for the close of the 'metaServer' and when it comes in, requeue it with a 82 * delay as though there were an issue processing the shutdown. As part of 83 * the requeuing, send over a close of a region on 'otherServer' so it comes 84 * into a master that has its meta region marked as offline. 85 */ 86 /* 87 static class HBase2428Listener implements RegionServerOperationListener { 88 // Map of what we've delayed so we don't do do repeated delays. 89 private final Set<RegionServerOperation> postponed = 90 new CopyOnWriteArraySet<RegionServerOperation>(); 91 private boolean done = false;; 92 private boolean metaShutdownReceived = false; 93 private final HServerAddress metaAddress; 94 private final MiniHBaseCluster cluster; 95 private final int otherServerIndex; 96 private final HRegionInfo hri; 97 private int closeCount = 0; 98 static final int SERVER_DURATION = 3 * 1000; 99 static final int CLOSE_DURATION = 1 * 1000; 100 101 HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress, 102 final HRegionInfo closingHRI, final int otherServerIndex) { 103 this.cluster = c; 104 this.metaAddress = metaAddress; 105 this.hri = closingHRI; 106 this.otherServerIndex = otherServerIndex; 107 } 108 109 @Override 110 public boolean process(final RegionServerOperation op) throws IOException { 111 // If a regionserver shutdown and its of the meta server, then we want to 112 // delay the processing of the shutdown and send off a close of a region on 113 // the 'otherServer. 114 boolean result = true; 115 if (op instanceof ProcessServerShutdown) { 116 ProcessServerShutdown pss = (ProcessServerShutdown)op; 117 if (pss.getDeadServerAddress().equals(this.metaAddress)) { 118 // Don't postpone more than once. 119 if (!this.postponed.contains(pss)) { 120 // Close some region. 121 this.cluster.addMessageToSendRegionServer(this.otherServerIndex, 122 new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri, 123 Bytes.toBytes("Forcing close in test"))); 124 this.postponed.add(pss); 125 // Put off the processing of the regionserver shutdown processing. 126 pss.setDelay(SERVER_DURATION); 127 this.metaShutdownReceived = true; 128 // Return false. This will add this op to the delayed queue. 129 result = false; 130 } 131 } 132 } else { 133 // Have the close run frequently. 134 if (isWantedCloseOperation(op) != null) { 135 op.setDelay(CLOSE_DURATION); 136 // Count how many times it comes through here. 137 this.closeCount++; 138 } 139 } 140 return result; 141 } 142 143 public void processed(final RegionServerOperation op) { 144 if (isWantedCloseOperation(op) != null) return; 145 this.done = true; 146 } 147 */ 148 /* 149 * @param op 150 * @return Null if not the wanted ProcessRegionClose, else <code>op</code> 151 * cast as a ProcessRegionClose. 152 */ 153 /* 154 private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) { 155 // Count every time we get a close operation. 156 if (op instanceof ProcessRegionClose) { 157 ProcessRegionClose c = (ProcessRegionClose)op; 158 if (c.regionInfo.equals(hri)) { 159 return c; 160 } 161 } 162 return null; 163 } 164 165 boolean isDone() { 166 return this.done; 167 } 168 169 boolean isMetaShutdownReceived() { 170 return metaShutdownReceived; 171 } 172 173 int getCloseCount() { 174 return this.closeCount; 175 } 176 177 @Override 178 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { 179 return true; 180 } 181 } 182 */ 183 /** 184 * In 2428, the meta region has just been set offline and then a close comes 185 * in. 186 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a> 187 */ 188 @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428() 189 throws Exception { 190 /* 191 LOG.info("Running testRegionCloseWhenNoMetaHBase2428"); 192 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 193 final HMaster master = cluster.getMaster(); 194 int metaIndex = cluster.getServerWithMeta(); 195 // Figure the index of the server that is not server the .META. 196 int otherServerIndex = -1; 197 for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) { 198 if (i == metaIndex) continue; 199 otherServerIndex = i; 200 break; 201 } 202 final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex); 203 final HRegionServer metaHRS = cluster.getRegionServer(metaIndex); 204 205 // Get a region out on the otherServer. 206 final HRegionInfo hri = 207 otherServer.getOnlineRegions().iterator().next().getRegionInfo(); 208 209 // Add our RegionServerOperationsListener 210 HBase2428Listener listener = new HBase2428Listener(cluster, 211 metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex); 212 master.getRegionServerOperationQueue(). 213 registerRegionServerOperationListener(listener); 214 try { 215 // Now close the server carrying meta. 216 cluster.abortRegionServer(metaIndex); 217 218 // First wait on receipt of meta server shutdown message. 219 while(!listener.metaShutdownReceived) Threads.sleep(100); 220 while(!listener.isDone()) Threads.sleep(10); 221 // We should not have retried the close more times than it took for the 222 // server shutdown message to exit the delay queue and get processed 223 // (Multiple by two to add in some slop in case of GC or something). 224 assertTrue(listener.getCloseCount() > 1); 225 assertTrue(listener.getCloseCount() < 226 ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2)); 227 228 // Assert the closed region came back online 229 assertRegionIsBackOnline(hri); 230 } finally { 231 master.getRegionServerOperationQueue(). 232 unregisterRegionServerOperationListener(listener); 233 } 234 */ 235 } 236 237 /** 238 * Test adding in a new server before old one on same host+port is dead. 239 * Make the test more onerous by having the server under test carry the meta. 240 * If confusion between old and new, purportedly meta never comes back. Test 241 * that meta gets redeployed. 242 */ 243 @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413() 244 throws IOException { 245 /* 246 LOG.info("Running testAddingServerBeforeOldIsDead2413"); 247 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 248 int count = count(); 249 int metaIndex = cluster.getServerWithMeta(); 250 MiniHBaseClusterRegionServer metaHRS = 251 (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex); 252 int port = metaHRS.getServerInfo().getServerAddress().getPort(); 253 Configuration c = TEST_UTIL.getConfiguration(); 254 String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0"); 255 try { 256 LOG.info("KILLED=" + metaHRS); 257 metaHRS.kill(); 258 c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port)); 259 // Try and start new regionserver. It might clash with the old 260 // regionserver port so keep trying to get past the BindException. 261 HRegionServer hrs = null; 262 while (true) { 263 try { 264 hrs = cluster.startRegionServer().getRegionServer(); 265 break; 266 } catch (IOException e) { 267 if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) { 268 InvocationTargetException ee = (InvocationTargetException)e.getCause(); 269 if (ee.getCause() != null && ee.getCause() instanceof BindException) { 270 LOG.info("BindException; retrying: " + e.toString()); 271 } 272 } 273 } 274 } 275 LOG.info("STARTED=" + hrs); 276 // Wait until he's been given at least 3 regions before we go on to try 277 // and count rows in table. 278 while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100); 279 LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() + 280 " regions"); 281 assertEquals(count, count()); 282 } finally { 283 c.set(HConstants.REGIONSERVER_PORT, oldPort); 284 } 285 */ 286 } 287 288 /** 289 * HBase2482 is about outstanding region openings. If any are outstanding 290 * when a regionserver goes down, then they'll never deploy. They'll be 291 * stuck in the regions-in-transition list for ever. This listener looks 292 * for a region opening HMsg and if its from the server passed on construction, 293 * then we kill it. It also looks out for a close message on the victim 294 * server because that signifies start of the fireworks. 295 */ 296 /* 297 static class HBase2482Listener implements RegionServerOperationListener { 298 private final HRegionServer victim; 299 private boolean abortSent = false; 300 // We closed regions on new server. 301 private volatile boolean closed = false; 302 // Copy of regions on new server 303 private final Collection<HRegion> copyOfOnlineRegions; 304 // This is the region that was in transition on the server we aborted. Test 305 // passes if this region comes back online successfully. 306 private HRegionInfo regionToFind; 307 308 HBase2482Listener(final HRegionServer victim) { 309 this.victim = victim; 310 // Copy regions currently open on this server so I can notice when 311 // there is a close. 312 this.copyOfOnlineRegions = 313 this.victim.getCopyOfOnlineRegionsSortedBySize().values(); 314 } 315 316 @Override 317 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { 318 if (!victim.getServerInfo().equals(serverInfo) || 319 this.abortSent || !this.closed) { 320 return true; 321 } 322 if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true; 323 // Save the region that is in transition so can test later it came back. 324 this.regionToFind = incomingMsg.getRegionInfo(); 325 String msg = "ABORTING " + this.victim + " because got a " + 326 HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " + 327 incomingMsg.getRegionInfo().getRegionNameAsString(); 328 this.victim.abort(msg); 329 this.abortSent = true; 330 return true; 331 } 332 333 @Override 334 public boolean process(RegionServerOperation op) throws IOException { 335 return true; 336 } 337 338 @Override 339 public void processed(RegionServerOperation op) { 340 if (this.closed || !(op instanceof ProcessRegionClose)) return; 341 ProcessRegionClose close = (ProcessRegionClose)op; 342 for (HRegion r: this.copyOfOnlineRegions) { 343 if (r.getRegionInfo().equals(close.regionInfo)) { 344 // We've closed one of the regions that was on the victim server. 345 // Now can start testing for when all regions are back online again 346 LOG.info("Found close of " + 347 r.getRegionInfo().getRegionNameAsString() + 348 "; setting close happened flag"); 349 this.closed = true; 350 break; 351 } 352 } 353 } 354 } 355 */ 356 /** 357 * In 2482, a RS with an opening region on it dies. The said region is then 358 * stuck in the master's regions-in-transition and never leaves it. This 359 * test works by bringing up a new regionserver, waiting for the load 360 * balancer to give it some regions. Then, we close all on the new server. 361 * After sending all the close messages, we send the new regionserver the 362 * special blocking message so it can not process any more messages. 363 * Meantime reopening of the just-closed regions is backed up on the new 364 * server. Soon as master gets an opening region from the new regionserver, 365 * we kill it. We then wait on all regions to come back on line. If bug 366 * is fixed, this should happen soon as the processing of the killed server is 367 * done. 368 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a> 369 */ 370 @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482() 371 throws Exception { 372 /* 373 LOG.info("Running testKillRSWithOpeningRegion2482"); 374 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 375 if (cluster.getLiveRegionServerThreads().size() < 2) { 376 // Need at least two servers. 377 cluster.startRegionServer(); 378 } 379 // Count how many regions are online. They need to be all back online for 380 // this test to succeed. 381 int countOfMetaRegions = countOfMetaRegions(); 382 // Add a listener on the server. 383 HMaster m = cluster.getMaster(); 384 // Start new regionserver. 385 MiniHBaseClusterRegionServer hrs = 386 (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer(); 387 LOG.info("Started new regionserver: " + hrs.toString()); 388 // Wait until has some regions before proceeding. Balancer will give it some. 389 int minimumRegions = 390 countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2); 391 while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100); 392 // Set the listener only after some regions have been opened on new server. 393 HBase2482Listener listener = new HBase2482Listener(hrs); 394 m.getRegionServerOperationQueue(). 395 registerRegionServerOperationListener(listener); 396 try { 397 // Go close all non-catalog regions on this new server 398 closeAllNonCatalogRegions(cluster, hrs); 399 // After all closes, add blocking message before the region opens start to 400 // come in. 401 cluster.addMessageToSendRegionServer(hrs, 402 new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER)); 403 // Wait till one of the above close messages has an effect before we start 404 // wait on all regions back online. 405 while (!listener.closed) Threads.sleep(100); 406 LOG.info("Past close"); 407 // Make sure the abort server message was sent. 408 while(!listener.abortSent) Threads.sleep(100); 409 LOG.info("Past abort send; waiting on all regions to redeploy"); 410 // Now wait for regions to come back online. 411 assertRegionIsBackOnline(listener.regionToFind); 412 } finally { 413 m.getRegionServerOperationQueue(). 414 unregisterRegionServerOperationListener(listener); 415 } 416 */ 417 } 418 419 /* 420 * @return Count of all non-catalog regions on the designated server 421 */ 422 /* 423 private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster, 424 final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs) 425 throws IOException { 426 int countOfRegions = 0; 427 for (HRegion r: hrs.getOnlineRegions()) { 428 if (r.getRegionInfo().isMetaRegion()) continue; 429 cluster.addMessageToSendRegionServer(hrs, 430 new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo())); 431 LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() + 432 " on " + hrs.toString()); 433 countOfRegions++; 434 } 435 return countOfRegions; 436 } 437 438 private void assertRegionIsBackOnline(final HRegionInfo hri) 439 throws IOException { 440 // Region should have an entry in its startkey because of addRowToEachRegion. 441 byte [] row = getStartKey(hri); 442 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); 443 Get g = new Get(row); 444 assertTrue((t.get(g)).size() > 0); 445 } 446 447 /* 448 * @return Count of regions in meta table. 449 * @throws IOException 450 */ 451 /* 452 private static int countOfMetaRegions() 453 throws IOException { 454 HTable meta = new HTable(TEST_UTIL.getConfiguration(), 455 HConstants.META_TABLE_NAME); 456 int rows = 0; 457 Scan scan = new Scan(); 458 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); 459 ResultScanner s = meta.getScanner(scan); 460 for (Result r = null; (r = s.next()) != null;) { 461 byte [] b = 462 r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); 463 if (b == null || b.length <= 0) break; 464 rows++; 465 } 466 s.close(); 467 return rows; 468 } 469 */ 470 /* 471 * Add to each of the regions in .META. a value. Key is the startrow of the 472 * region (except its 'aaa' for first region). Actual value is the row name. 473 * @param expected 474 * @return 475 * @throws IOException 476 */ 477 private static int addToEachStartKey(final int expected) throws IOException { 478 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); 479 HTable meta = new HTable(TEST_UTIL.getConfiguration(), 480 HConstants.META_TABLE_NAME); 481 int rows = 0; 482 Scan scan = new Scan(); 483 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); 484 ResultScanner s = meta.getScanner(scan); 485 for (Result r = null; (r = s.next()) != null;) { 486 byte [] b = 487 r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); 488 if (b == null || b.length <= 0) break; 489 HRegionInfo hri = Writables.getHRegionInfo(b); 490 // If start key, add 'aaa'. 491 byte [] row = getStartKey(hri); 492 Put p = new Put(row); 493 p.setWriteToWAL(false); 494 p.add(getTestFamily(), getTestQualifier(), row); 495 t.put(p); 496 rows++; 497 } 498 s.close(); 499 Assert.assertEquals(expected, rows); 500 t.close(); 501 meta.close(); 502 return rows; 503 } 504 505 /* 506 * @return Count of rows in TABLENAME 507 * @throws IOException 508 */ 509 private static int count() throws IOException { 510 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); 511 int rows = 0; 512 Scan scan = new Scan(); 513 ResultScanner s = t.getScanner(scan); 514 for (Result r = null; (r = s.next()) != null;) { 515 rows++; 516 } 517 s.close(); 518 LOG.info("Counted=" + rows); 519 t.close(); 520 return rows; 521 } 522 523 /* 524 * @param hri 525 * @return Start key for hri (If start key is '', then return 'aaa'. 526 */ 527 private static byte [] getStartKey(final HRegionInfo hri) { 528 return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())? 529 Bytes.toBytes("aaa"): hri.getStartKey(); 530 } 531 532 private static byte [] getTestFamily() { 533 return FAMILIES[0]; 534 } 535 536 private static byte [] getTestQualifier() { 537 return getTestFamily(); 538 } 539 540 @org.junit.Rule 541 public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu = 542 new org.apache.hadoop.hbase.ResourceCheckerJUnitRule(); 543 } 544