/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Writables;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

/**
 * Test transitions of state across the master.  Sets up the cluster once and
 * then runs a couple of tests.
 */
public class TestMasterTransitions {
  private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final String TABLENAME = "master_transitions";
  private static final byte [][] FAMILIES =
    new byte [][] {Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")};

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   * @throws Exception
   */
  @BeforeClass public static void beforeAllTests() throws Exception {
    TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families.  This will assign a region.
    TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
    TEST_UTIL.waitUntilAllRegionsAssigned(countOfRegions);
    addToEachStartKey(countOfRegions);
  }

  @AfterClass public static void afterAllTests() throws IOException {
    TEST_UTIL.shutdownMiniCluster();
  }

  @Before public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }
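
  /*
   * Hedged sketch, not part of the original suite: a sanity check of the
   * fixture built in beforeAllTests.  addToEachStartKey writes exactly one
   * row per region of the test table (the only user table in this mini
   * cluster), so a full scan should count one row per region listed in
   * .META..  It stays commented out because countOfMetaRegions is itself
   * commented out below; the test name is illustrative only.
   */
  // @Test public void testFixtureHasOneRowPerRegion() throws IOException {
  //   Assert.assertEquals(countOfMetaRegions(), count());
  // }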

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of
   * region closes if META region is offline).  In particular, listen for the
   * close of the 'metaServer' and when it comes in, requeue it with a delay
   * as though there were an issue processing the shutdown.  As part of the
   * requeuing, send over a close of a region on 'otherServer' so it comes
   * into a master that has its meta region marked as offline.
   */
  /*
  static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
    private final Set<RegionServerOperation> postponed =
      new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;
    private boolean metaShutdownReceived = false;
    private final HServerAddress metaAddress;
    private final MiniHBaseCluster cluster;
    private final int otherServerIndex;
    private final HRegionInfo hri;
    private int closeCount = 0;
    static final int SERVER_DURATION = 3 * 1000;
    static final int CLOSE_DURATION = 1 * 1000;

    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
        final HRegionInfo closingHRI, final int otherServerIndex) {
      this.cluster = c;
      this.metaAddress = metaAddress;
      this.hri = closingHRI;
      this.otherServerIndex = otherServerIndex;
    }

    @Override
    public boolean process(final RegionServerOperation op) throws IOException {
      // If a regionserver shutdown comes in and it is the meta server's, then
      // we want to delay the processing of the shutdown and send off a close
      // of a region on the 'otherServer'.
      boolean result = true;
      if (op instanceof ProcessServerShutdown) {
        ProcessServerShutdown pss = (ProcessServerShutdown)op;
        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
          // Don't postpone more than once.
          if (!this.postponed.contains(pss)) {
            // Close some region.
            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
              Bytes.toBytes("Forcing close in test")));
            this.postponed.add(pss);
            // Put off the processing of the regionserver shutdown.
            pss.setDelay(SERVER_DURATION);
            this.metaShutdownReceived = true;
            // Return false.  This will add this op to the delayed queue.
            result = false;
          }
        }
      } else {
        // Have the close run frequently.
        if (isWantedCloseOperation(op) != null) {
          op.setDelay(CLOSE_DURATION);
          // Count how many times it comes through here.
          this.closeCount++;
        }
      }
      return result;
    }

    public void processed(final RegionServerOperation op) {
      if (isWantedCloseOperation(op) != null) return;
      this.done = true;
    }
  */
  /*
   * @param op
   * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
   * cast as a ProcessRegionClose.
   */
  /*
    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
      // Count every time we get a close operation.
      if (op instanceof ProcessRegionClose) {
        ProcessRegionClose c = (ProcessRegionClose)op;
        if (c.regionInfo.equals(hri)) {
          return c;
        }
      }
      return null;
    }

    boolean isDone() {
      return this.done;
    }

    boolean isMetaShutdownReceived() {
      return metaShutdownReceived;
    }

    int getCloseCount() {
      return this.closeCount;
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      return true;
    }
  }
  */

  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    /*
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not serving the .META.
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final HRegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while (!listener.metaShutdownReceived) Threads.sleep(100);
      while (!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (multiply by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online.
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /**
   * Test adding in a new server before the old one on the same host+port is
   * dead.  Make the test more onerous by having the server under test carry
   * the meta.  If the old and new are confused, meta purportedly never comes
   * back.  Test that meta gets redeployed.
   */
  @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    /*
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start a new regionserver.  It might clash with the old
      // regionserver port so keep trying to get past the BindException (a
      // hardened version of this retry loop is sketched after this test).
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until it has been given at least 3 regions before we go on to
      // try and count rows in the table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
    */
  }
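
  /*
   * Hedged sketch, not used by any test: a hardened variant of the bind-retry
   * loop in the commented-out test above.  The original spins without pausing
   * and silently swallows IOExceptions that are not port clashes; this
   * version backs off between attempts and rethrows everything else.  It
   * stays commented out because it depends on the same commented-out test
   * context (cluster, Threads, and so on).
   */
  // HRegionServer hrs = null;
  // while (hrs == null) {
  //   try {
  //     hrs = cluster.startRegionServer().getRegionServer();
  //   } catch (IOException e) {
  //     Throwable cause = e.getCause();
  //     if (cause instanceof InvocationTargetException) cause = cause.getCause();
  //     if (!(cause instanceof BindException)) throw e;
  //     LOG.info("BindException; retrying: " + e.toString());
  //     Threads.sleep(1000);
  //   }
  // }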

  /**
   * HBase2482 is about outstanding region openings.  If any are outstanding
   * when a regionserver goes down, then they'll never deploy.  They'll be
   * stuck in the regions-in-transition list forever.  This listener looks for
   * a region-opening HMsg and, if it's from the server passed on
   * construction, kills that server.  It also looks out for a close message
   * on the victim server because that signifies the start of the fireworks.
   */
  /*
  static class HBase2482Listener implements RegionServerOperationListener {
    private final HRegionServer victim;
    private boolean abortSent = false;
    // We closed regions on the new server.
    private volatile boolean closed = false;
    // Copy of regions on the new server.
    private final Collection<HRegion> copyOfOnlineRegions;
    // This is the region that was in transition on the server we aborted.
    // Test passes if this region comes back online successfully.
    private HRegionInfo regionToFind;

    HBase2482Listener(final HRegionServer victim) {
      this.victim = victim;
      // Copy regions currently open on this server so I can notice when
      // there is a close.
      this.copyOfOnlineRegions =
        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      if (!victim.getServerInfo().equals(serverInfo) ||
          this.abortSent || !this.closed) {
        return true;
      }
      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
      // Save the region that is in transition so we can test later that it
      // came back.
      this.regionToFind = incomingMsg.getRegionInfo();
      String msg = "ABORTING " + this.victim + " because got a " +
        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
        incomingMsg.getRegionInfo().getRegionNameAsString();
      this.victim.abort(msg);
      this.abortSent = true;
      return true;
    }

    @Override
    public boolean process(RegionServerOperation op) throws IOException {
      return true;
    }

    @Override
    public void processed(RegionServerOperation op) {
      if (this.closed || !(op instanceof ProcessRegionClose)) return;
      ProcessRegionClose close = (ProcessRegionClose)op;
      for (HRegion r: this.copyOfOnlineRegions) {
        if (r.getRegionInfo().equals(close.regionInfo)) {
          // We've closed one of the regions that was on the victim server.
          // Now we can start testing for when all regions are back online
          // again.
          LOG.info("Found close of " +
            r.getRegionInfo().getRegionNameAsString() +
            "; setting close happened flag");
          this.closed = true;
          break;
        }
      }
    }
  }
  */

  /**
   * In 2482, a RS with an opening region on it dies.  The said region is then
   * stuck in the master's regions-in-transition and never leaves it.  This
   * test works by bringing up a new regionserver and waiting for the load
   * balancer to give it some regions.  Then, we close all regions on the new
   * server.  After sending all the close messages, we send the new
   * regionserver the special blocking message so it can not process any more
   * messages.  Meantime, reopening of the just-closed regions is backed up on
   * the new server.  As soon as the master gets an opening region from the
   * new regionserver, we kill it.  We then wait on all regions to come back
   * online.  If the bug is fixed, this should happen as soon as the
   * processing of the killed server is done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
  throws Exception {
    /*
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online.  They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start a new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until it has some regions before proceeding.  Balancer will give
    // it some.
    int minimumRegions =
      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on the new
    // server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server.
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add the blocking message before the region opens
      // start to come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we
      // start the wait on all regions coming back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while (!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /*
   * @return Count of all non-catalog regions on the designated server.
   */
  /*
  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
      final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
  throws IOException {
    int countOfRegions = 0;
    for (HRegion r: hrs.getOnlineRegions()) {
      if (r.getRegionInfo().isMetaRegion()) continue;
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
        " on " + hrs.toString());
      countOfRegions++;
    }
    return countOfRegions;
  }

  private void assertRegionIsBackOnline(final HRegionInfo hri)
  throws IOException {
    // Region should have an entry at its start key because of addToEachStartKey.
    byte [] row = getStartKey(hri);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Get g = new Get(row);
    assertTrue((t.get(g)).size() > 0);
  }
  */
  /*
   * @return Count of regions in meta table.
   * @throws IOException
   */
  /*
  private static int countOfMetaRegions()
  throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
      if (b == null || b.length <= 0) break;
      rows++;
    }
    s.close();
    return rows;
  }
  */
  /*
   * Add a value to each of the regions in .META..  The key is the start row
   * of the region (except it's 'aaa' for the first region).  The actual value
   * is the row name.
   * @param expected Count of regions we expect to see in .META..
   * @return Count of rows added.
   * @throws IOException
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
      if (b == null || b.length <= 0) break;
      HRegionInfo hri = Writables.getHRegionInfo(b);
      // If the start key is empty, add at 'aaa'.
      byte [] row = getStartKey(hri);
      Put p = new Put(row);
      p.add(getTestFamily(), getTestQualifier(), row);
      t.put(p);
      rows++;
    }
    s.close();
    Assert.assertEquals(expected, rows);
    return rows;
  }

  /*
   * @return Count of rows in TABLENAME.
   * @throws IOException
   */
  private static int count() throws IOException {
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    int rows = 0;
    Scan scan = new Scan();
    ResultScanner s = t.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      rows++;
    }
    s.close();
    LOG.info("Counted=" + rows);
    return rows;
  }
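
  /*
   * Hedged sketch, not called by the original tests: verifies that every
   * region still serves the row addToEachStartKey wrote at its start key.
   * It is the read-side counterpart of addToEachStartKey's scan-and-put, and
   * mirrors the commented-out assertRegionIsBackOnline above but walks all
   * regions via .META..  The method name is illustrative only, and the
   * fully-qualified Get avoids adding an import for illustrative code.
   */
  private static void assertEachStartKeyRowPresent() throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
      if (b == null || b.length <= 0) break;
      HRegionInfo hri = Writables.getHRegionInfo(b);
      // Fetch the row written at this region's start key and assert it exists.
      byte [] row = getStartKey(hri);
      Result got = t.get(new org.apache.hadoop.hbase.client.Get(row));
      Assert.assertTrue("No row at " + Bytes.toString(row), got.size() > 0);
    }
    s.close();
  }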

  /*
   * @param hri
   * @return Start key for hri (if the start key is '', then return 'aaa').
   */
  private static byte [] getStartKey(final HRegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
      Bytes.toBytes("aaa"): hri.getStartKey();
  }

  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }

  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
}