1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 package org.apache.hadoop.hbase.catalog; 19 20 import org.apache.commons.logging.Log; 21 import org.apache.commons.logging.LogFactory; 22 import org.apache.hadoop.classification.InterfaceAudience; 23 import org.apache.hadoop.conf.Configuration; 24 import org.apache.hadoop.hbase.Abortable; 25 import org.apache.hadoop.hbase.HRegionInfo; 26 import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException; 27 import org.apache.hadoop.hbase.ServerName; 28 import org.apache.hadoop.hbase.client.HConnection; 29 import org.apache.hadoop.hbase.client.HConnectionManager; 30 import org.apache.hadoop.hbase.client.HTable; 31 import org.apache.hadoop.hbase.client.RetriesExhaustedException; 32 import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException; 33 import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; 34 import org.apache.hadoop.hbase.protobuf.ProtobufUtil; 35 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService; 36 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException; 37 import org.apache.hadoop.hbase.util.Bytes; 38 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker; 39 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; 40 import org.apache.hadoop.ipc.RemoteException; 41 42 import java.io.EOFException; 43 import java.io.IOException; 44 import java.net.ConnectException; 45 import java.net.NoRouteToHostException; 46 import java.net.SocketException; 47 import java.net.SocketTimeoutException; 48 import java.net.UnknownHostException; 49 50 /** 51 * Tracks the availability of the catalog tables 52 * <code>hbase:meta</code>. 53 * 54 * This class is "read-only" in that the locations of the catalog tables cannot 55 * be explicitly set. Instead, ZooKeeper is used to learn of the availability 56 * and location of <code>hbase:meta</code>. 57 * 58 * <p>Call {@link #start()} to start up operation. Call {@link #stop()}} to 59 * interrupt waits and close up shop. 60 */ 61 @InterfaceAudience.Private 62 public class CatalogTracker { 63 // TODO JDC 11/30 We don't even have ROOT anymore, revisit 64 // TODO: This class needs a rethink. The original intent was that it would be 65 // the one-stop-shop for meta locations and that it would get this 66 // info from reading and watching zk state. The class was to be used by 67 // servers when they needed to know of meta movement but also by 68 // client-side (inside in HTable) so rather than figure meta 69 // locations on fault, the client would instead get notifications out of zk. 70 // 71 // But this original intent is frustrated by the fact that this class has to 72 // read an hbase table, the -ROOT- table, to figure out the hbase:meta region 73 // location which means we depend on an HConnection. HConnection will do 74 // retrying but also, it has its own mechanism for finding root and meta 75 // locations (and for 'verifying'; it tries the location and if it fails, does 76 // new lookup, etc.). So, at least for now, HConnection (or HTable) can't 77 // have a CT since CT needs a HConnection (Even then, do want HT to have a CT? 78 // For HT keep up a session with ZK? Rather, shouldn't we do like asynchbase 79 // where we'd open a connection to zk, read what we need then let the 80 // connection go?). The 'fix' is make it so both root and meta addresses 81 // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta). 82 // 83 // But even then, this class does 'verification' of the location and it does 84 // this by making a call over an HConnection (which will do its own root 85 // and meta lookups). Isn't this verification 'useless' since when we 86 // return, whatever is dependent on the result of this call then needs to 87 // use HConnection; what we have verified may change in meantime (HConnection 88 // uses the CT primitives, the root and meta trackers finding root locations). 89 // 90 // When meta is moved to zk, this class may make more sense. In the 91 // meantime, it does not cohere. It should just watch meta and root and not 92 // NOT do verification -- let that be out in HConnection since its going to 93 // be done there ultimately anyways. 94 // 95 // This class has spread throughout the codebase. It needs to be reigned in. 96 // This class should be used server-side only, even if we move meta location 97 // up into zk. Currently its used over in the client package. Its used in 98 // MetaReader and MetaEditor classes usually just to get the Configuration 99 // its using (It does this indirectly by asking its HConnection for its 100 // Configuration and even then this is just used to get an HConnection out on 101 // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for 102 // doing CT fixup. St.Ack 09/30/2011. 103 // 104 105 // TODO: Timeouts have never been as advertised in here and its worse now 106 // with retries; i.e. the HConnection retries and pause goes ahead whatever 107 // the passed timeout is. Fix. 108 private static final Log LOG = LogFactory.getLog(CatalogTracker.class); 109 private final HConnection connection; 110 private final ZooKeeperWatcher zookeeper; 111 private final MetaRegionTracker metaRegionTracker; 112 private boolean instantiatedzkw = false; 113 private Abortable abortable; 114 115 private boolean stopped = false; 116 117 static final byte [] META_REGION_NAME = 118 HRegionInfo.FIRST_META_REGIONINFO.getRegionName(); 119 120 /** 121 * Constructs a catalog tracker. Find current state of catalog tables. 122 * Begin active tracking by executing {@link #start()} post construction. Does 123 * not timeout. 124 * 125 * @param conf 126 * the {@link Configuration} from which a {@link HConnection} will be 127 * obtained; if problem, this connections 128 * {@link HConnection#abort(String, Throwable)} will be called. 129 * @throws IOException 130 */ 131 public CatalogTracker(final Configuration conf) throws IOException { 132 this(null, conf, null); 133 } 134 135 /** 136 * Constructs the catalog tracker. Find current state of catalog tables. 137 * Begin active tracking by executing {@link #start()} post construction. 138 * Does not timeout. 139 * @param zk If zk is null, we'll create an instance (and shut it down 140 * when {@link #stop()} is called) else we'll use what is passed. 141 * @param conf 142 * @param abortable If fatal exception we'll call abort on this. May be null. 143 * If it is we'll use the Connection associated with the passed 144 * {@link Configuration} as our Abortable. 145 * @throws IOException 146 */ 147 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf, 148 Abortable abortable) 149 throws IOException { 150 this(zk, conf, HConnectionManager.getConnection(conf), abortable); 151 } 152 153 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf, 154 HConnection connection, Abortable abortable) 155 throws IOException { 156 this.connection = connection; 157 if (abortable == null) { 158 // A connection is abortable. 159 this.abortable = this.connection; 160 } 161 Abortable throwableAborter = new Abortable() { 162 163 @Override 164 public void abort(String why, Throwable e) { 165 throw new RuntimeException(why, e); 166 } 167 168 @Override 169 public boolean isAborted() { 170 return true; 171 } 172 173 }; 174 if (zk == null) { 175 // Create our own. Set flag so we tear it down on stop. 176 this.zookeeper = 177 new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(), 178 abortable); 179 instantiatedzkw = true; 180 } else { 181 this.zookeeper = zk; 182 } 183 this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter); 184 } 185 186 /** 187 * Starts the catalog tracker. 188 * Determines current availability of catalog tables and ensures all further 189 * transitions of either region are tracked. 190 * @throws IOException 191 * @throws InterruptedException 192 */ 193 public void start() throws IOException, InterruptedException { 194 LOG.debug("Starting catalog tracker " + this); 195 try { 196 this.metaRegionTracker.start(); 197 } catch (RuntimeException e) { 198 Throwable t = e.getCause(); 199 this.abortable.abort(e.getMessage(), t); 200 throw new IOException("Attempt to start meta tracker failed.", t); 201 } 202 } 203 204 /** 205 * Stop working. 206 * Interrupts any ongoing waits. 207 */ 208 public void stop() { 209 if (!this.stopped) { 210 LOG.debug("Stopping catalog tracker " + this); 211 this.stopped = true; 212 this.metaRegionTracker.stop(); 213 try { 214 if (this.connection != null) { 215 this.connection.close(); 216 } 217 } catch (IOException e) { 218 // Although the {@link Closeable} interface throws an {@link 219 // IOException}, in reality, the implementation would never do that. 220 LOG.error("Attempt to close catalog tracker's connection failed.", e); 221 } 222 if (this.instantiatedzkw) { 223 this.zookeeper.close(); 224 } 225 } 226 } 227 228 /** 229 * Gets the current location for <code>hbase:meta</code> or null if location is 230 * not currently available. 231 * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null 232 * if none available 233 * @throws InterruptedException 234 */ 235 public ServerName getMetaLocation() throws InterruptedException { 236 return this.metaRegionTracker.getMetaRegionLocation(); 237 } 238 239 /** 240 * Checks whether meta regionserver znode has some non null data. 241 * @return true if data is not null, false otherwise. 242 */ 243 public boolean isMetaLocationAvailable() { 244 return this.metaRegionTracker.isLocationAvailable(); 245 } 246 /** 247 * Gets the current location for <code>hbase:meta</code> if available and waits 248 * for up to the specified timeout if not immediately available. Returns null 249 * if the timeout elapses before root is available. 250 * @param timeout maximum time to wait for root availability, in milliseconds 251 * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null 252 * if none available 253 * @throws InterruptedException if interrupted while waiting 254 * @throws NotAllMetaRegionsOnlineException if meta not available before 255 * timeout 256 */ 257 public ServerName waitForMeta(final long timeout) 258 throws InterruptedException, NotAllMetaRegionsOnlineException { 259 ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout); 260 if (sn == null) { 261 throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms"); 262 } 263 return sn; 264 } 265 266 /** 267 * Gets a connection to the server hosting meta, as reported by ZooKeeper, 268 * waiting up to the specified timeout for availability. 269 * @param timeout How long to wait on meta location 270 * @see #waitForMeta for additional information 271 * @return connection to server hosting meta 272 * @throws InterruptedException 273 * @throws NotAllMetaRegionsOnlineException if timed out waiting 274 * @throws IOException 275 * @deprecated Use #getMetaServerConnection(long) 276 */ 277 public AdminService.BlockingInterface waitForMetaServerConnection(long timeout) 278 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException { 279 return getMetaServerConnection(timeout); 280 } 281 282 /** 283 * Gets a connection to the server hosting meta, as reported by ZooKeeper, 284 * waiting up to the specified timeout for availability. 285 * <p>WARNING: Does not retry. Use an {@link HTable} instead. 286 * @param timeout How long to wait on meta location 287 * @see #waitForMeta for additional information 288 * @return connection to server hosting meta 289 * @throws InterruptedException 290 * @throws NotAllMetaRegionsOnlineException if timed out waiting 291 * @throws IOException 292 */ 293 AdminService.BlockingInterface getMetaServerConnection(long timeout) 294 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException { 295 return getCachedConnection(waitForMeta(timeout)); 296 } 297 298 /** 299 * Waits indefinitely for availability of <code>hbase:meta</code>. Used during 300 * cluster startup. Does not verify meta, just that something has been 301 * set up in zk. 302 * @see #waitForMeta(long) 303 * @throws InterruptedException if interrupted while waiting 304 */ 305 public void waitForMeta() throws InterruptedException { 306 while (!this.stopped) { 307 try { 308 if (waitForMeta(100) != null) break; 309 } catch (NotAllMetaRegionsOnlineException e) { 310 if (LOG.isTraceEnabled()) { 311 LOG.trace("hbase:meta still not available, sleeping and retrying." + 312 " Reason: " + e.getMessage()); 313 } 314 } 315 } 316 } 317 318 /** 319 * @param sn ServerName to get a connection against. 320 * @return The AdminProtocol we got when we connected to <code>sn</code> 321 * May have come from cache, may not be good, may have been setup by this 322 * invocation, or may be null. 323 * @throws IOException 324 */ 325 private AdminService.BlockingInterface getCachedConnection(ServerName sn) 326 throws IOException { 327 if (sn == null) { 328 return null; 329 } 330 AdminService.BlockingInterface service = null; 331 try { 332 service = connection.getAdmin(sn); 333 } catch (RetriesExhaustedException e) { 334 if (e.getCause() != null && e.getCause() instanceof ConnectException) { 335 // Catch this; presume it means the cached connection has gone bad. 336 } else { 337 throw e; 338 } 339 } catch (SocketTimeoutException e) { 340 LOG.debug("Timed out connecting to " + sn); 341 } catch (NoRouteToHostException e) { 342 LOG.debug("Connecting to " + sn, e); 343 } catch (SocketException e) { 344 LOG.debug("Exception connecting to " + sn); 345 } catch (UnknownHostException e) { 346 LOG.debug("Unknown host exception connecting to " + sn); 347 } catch (FailedServerException e) { 348 if (LOG.isDebugEnabled()) { 349 LOG.debug("Server " + sn + " is in failed server list."); 350 } 351 } catch (IOException ioe) { 352 Throwable cause = ioe.getCause(); 353 if (ioe instanceof ConnectException) { 354 // Catch. Connect refused. 355 } else if (cause != null && cause instanceof EOFException) { 356 // Catch. Other end disconnected us. 357 } else if (cause != null && cause.getMessage() != null && 358 cause.getMessage().toLowerCase().contains("connection reset")) { 359 // Catch. Connection reset. 360 } else { 361 throw ioe; 362 } 363 364 } 365 return service; 366 } 367 368 /** 369 * Verify we can connect to <code>hostingServer</code> and that its carrying 370 * <code>regionName</code>. 371 * @param hostingServer Interface to the server hosting <code>regionName</code> 372 * @param address The servername that goes with the <code>metaServer</code> 373 * Interface. Used logging. 374 * @param regionName The regionname we are interested in. 375 * @return True if we were able to verify the region located at other side of 376 * the Interface. 377 * @throws IOException 378 */ 379 // TODO: We should be able to get the ServerName from the AdminProtocol 380 // rather than have to pass it in. Its made awkward by the fact that the 381 // HRI is likely a proxy against remote server so the getServerName needs 382 // to be fixed to go to a local method or to a cache before we can do this. 383 private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer, 384 final ServerName address, final byte [] regionName) 385 throws IOException { 386 if (hostingServer == null) { 387 LOG.info("Passed hostingServer is null"); 388 return false; 389 } 390 Throwable t = null; 391 try { 392 // Try and get regioninfo from the hosting server. 393 return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null; 394 } catch (ConnectException e) { 395 t = e; 396 } catch (RetriesExhaustedException e) { 397 t = e; 398 } catch (RemoteException e) { 399 IOException ioe = e.unwrapRemoteException(); 400 t = ioe; 401 } catch (IOException e) { 402 Throwable cause = e.getCause(); 403 if (cause != null && cause instanceof EOFException) { 404 t = cause; 405 } else if (cause != null && cause.getMessage() != null 406 && cause.getMessage().contains("Connection reset")) { 407 t = cause; 408 } else { 409 t = e; 410 } 411 } 412 LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) + 413 " at address=" + address + ", exception=" + t); 414 return false; 415 } 416 417 /** 418 * Verify <code>hbase:meta</code> is deployed and accessible. 419 * @param timeout How long to wait on zk for meta address (passed through to 420 * the internal call to {@link #waitForMetaServerConnection(long)}. 421 * @return True if the <code>hbase:meta</code> location is healthy. 422 * @throws IOException 423 * @throws InterruptedException 424 */ 425 public boolean verifyMetaRegionLocation(final long timeout) 426 throws InterruptedException, IOException { 427 AdminService.BlockingInterface service = null; 428 try { 429 service = waitForMetaServerConnection(timeout); 430 } catch (NotAllMetaRegionsOnlineException e) { 431 // Pass 432 } catch (ServerNotRunningYetException e) { 433 // Pass -- remote server is not up so can't be carrying root 434 } catch (UnknownHostException e) { 435 // Pass -- server name doesn't resolve so it can't be assigned anything. 436 } catch (RegionServerStoppedException e) { 437 // Pass -- server name sends us to a server that is dying or already dead. 438 } 439 return (service == null)? false: 440 verifyRegionLocation(service, 441 this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME); 442 } 443 444 public HConnection getConnection() { 445 return this.connection; 446 } 447 }