/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.catalog;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.ipc.RemoteException;

import java.io.EOFException;
import java.io.IOException;
import java.net.ConnectException;
import java.net.NoRouteToHostException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;

/**
 * Tracks the availability of the catalog table
 * <code>hbase:meta</code>.
 *
 * This class is "read-only" in that the locations of the catalog tables cannot
 * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
 * and location of <code>hbase:meta</code>.
 *
 * <p>Call {@link #start()} to start up operation.  Call {@link #stop()} to
 * interrupt waits and close up shop.
 */
@InterfaceAudience.Private
public class CatalogTracker {
  // TODO JDC 11/30 We don't even have ROOT anymore, revisit
  // TODO: This class needs a rethink.  The original intent was that it would be
  // the one-stop-shop for meta locations and that it would get this
  // info from reading and watching zk state.  The class was to be used by
  // servers when they needed to know of meta movement but also by
  // client-side (inside in HTable) so rather than figure meta
  // locations on fault, the client would instead get notifications out of zk.
  //
  // But this original intent is frustrated by the fact that this class has to
  // read an hbase table, the -ROOT- table, to figure out the hbase:meta region
  // location which means we depend on an HConnection.  HConnection will do
  // retrying but also, it has its own mechanism for finding root and meta
  // locations (and for 'verifying'; it tries the location and if it fails, does
  // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
  // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
  // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
  // where we'd open a connection to zk, read what we need then let the
  // connection go?).  The 'fix' is make it so both root and meta addresses
  // are wholly up in zk -- not in zk (root) -- and in an hbase table (meta).
  //
  // But even then, this class does 'verification' of the location and it does
  // this by making a call over an HConnection (which will do its own root
  // and meta lookups).  Isn't this verification 'useless' since when we
  // return, whatever is dependent on the result of this call then needs to
  // use HConnection; what we have verified may change in meantime (HConnection
  // uses the CT primitives, the root and meta trackers finding root locations).
  //
  // When meta is moved to zk, this class may make more sense.  In the
  // meantime, it does not cohere.  It should just watch meta and root and
  // NOT do verification -- let that be out in HConnection since it's going to
  // be done there ultimately anyways.
  //
  // This class has spread throughout the codebase.  It needs to be reined in.
  // This class should be used server-side only, even if we move meta location
  // up into zk.  Currently it's used over in the client package.  It's used in
  // MetaReader and MetaEditor classes usually just to get the Configuration
  // it's using (It does this indirectly by asking its HConnection for its
  // Configuration and even then this is just used to get an HConnection out on
  // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
  // doing CT fixup. St.Ack 09/30/2011.
  //

  // TODO: Timeouts have never been as advertised in here and it's worse now
  // with retries; i.e. the HConnection retries and pause goes ahead whatever
  // the passed timeout is.  Fix.
  private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
  private final HConnection connection;
  private final ZooKeeperWatcher zookeeper;
  private final MetaRegionTracker metaRegionTracker;
  /** True when this tracker created {@link #zookeeper} itself and must close it. */
  private final boolean instantiatedzkw;
  /** Never the raw constructor argument: falls back to the connection when that was null. */
  private final Abortable abortable;

  // Written by stop() and polled by the waitForMeta() loop, possibly from
  // different threads, hence volatile.
  private volatile boolean stopped = false;

  static final byte [] META_REGION_NAME =
    HRegionInfo.FIRST_META_REGIONINFO.getRegionName();

  /**
   * Constructs a catalog tracker. Find current state of catalog tables.
   * Begin active tracking by executing {@link #start()} post construction. Does
   * not timeout.
   *
   * @param conf
   *          the {@link Configuration} from which a {@link HConnection} will be
   *          obtained; if problem, this connection's
   *          {@link HConnection#abort(String, Throwable)} will be called.
   * @throws IOException
   */
  public CatalogTracker(final Configuration conf) throws IOException {
    this(null, conf, null);
  }

  /**
   * Constructs the catalog tracker.  Find current state of catalog tables.
   * Begin active tracking by executing {@link #start()} post construction.
   * Does not timeout.
   * @param zk If zk is null, we'll create an instance (and shut it down
   * when {@link #stop()} is called) else we'll use what is passed.
   * @param conf
   * @param abortable If fatal exception we'll call abort on this.  May be null.
   * If it is we'll use the Connection associated with the passed
   * {@link Configuration} as our Abortable.
   * @throws IOException
   */
  public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
      Abortable abortable)
  throws IOException {
    this(zk, conf, HConnectionManager.getConnection(conf), abortable);
  }

  /**
   * Constructs the catalog tracker on top of an existing connection.
   * Begin active tracking by executing {@link #start()} post construction.
   * @param zk If zk is null, we'll create an instance (and shut it down
   * when {@link #stop()} is called) else we'll use what is passed.
   * @param conf Configuration handed to the ZooKeeperWatcher if we create one.
   * @param connection Connection used to reach the server hosting meta.  Note
   * that {@link #stop()} closes it.
   * @param abortable If fatal exception we'll call abort on this.  May be null,
   * in which case <code>connection</code> is used as our Abortable.
   * @throws IOException
   */
  public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
      HConnection connection, Abortable abortable)
  throws IOException {
    this.connection = connection;
    // A connection is abortable, so it is the fallback.  A caller-supplied
    // abortable must be retained too; leaving the field unset in that case
    // made start() fail with a NullPointerException on its error path.
    this.abortable = (abortable == null) ? connection : abortable;
    Abortable throwableAborter = new Abortable() {

      @Override
      public void abort(String why, Throwable e) {
        // Surface the failure to start(), which converts it to an IOException.
        throw new RuntimeException(why, e);
      }

      @Override
      public boolean isAborted() {
        return true;
      }

    };
    if (zk == null) {
      // Create our own.  Set flag so we tear it down on stop.  Hand it the
      // resolved abortable so it is not left with a null one.  Concatenation
      // (rather than toString()) tolerates a null connection.
      this.zookeeper =
        new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection,
          this.abortable);
      this.instantiatedzkw = true;
    } else {
      this.zookeeper = zk;
      this.instantiatedzkw = false;
    }
    this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
  }

  /**
   * Starts the catalog tracker.
   * Starts the underlying meta region tracker so further transitions of the
   * meta region are tracked.
   * @throws IOException if the meta region tracker fails to start; our
   * abortable is notified first.
   * @throws InterruptedException
   */
  public void start() throws IOException, InterruptedException {
    LOG.debug("Starting catalog tracker " + this);
    try {
      this.metaRegionTracker.start();
    } catch (RuntimeException e) {
      // Prefer the wrapped cause; fall back to the exception itself so the
      // failure is never lost when there is no cause.
      Throwable t = (e.getCause() != null) ? e.getCause() : e;
      if (this.abortable != null) {
        this.abortable.abort(e.getMessage(), t);
      }
      throw new IOException("Attempt to start meta tracker failed.", t);
    }
  }

  /**
   * Stop working.
   * Interrupts any ongoing waits.  Closes the connection and, if we created
   * it, the ZooKeeperWatcher.  Calls after the first are no-ops.
   */
  public void stop() {
    if (!this.stopped) {
      LOG.debug("Stopping catalog tracker " + this);
      this.stopped = true;
      this.metaRegionTracker.stop();
      try {
        if (this.connection != null) {
          this.connection.close();
        }
      } catch (IOException e) {
        // Although the {@link Closeable} interface throws an {@link
        // IOException}, in reality, the implementation would never do that.
        LOG.error("Attempt to close catalog tracker's connection failed.", e);
      }
      if (this.instantiatedzkw) {
        this.zookeeper.close();
      }
    }
  }

  /**
   * Gets the current location for <code>hbase:meta</code> or null if location is
   * not currently available.
   * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
   * if none available
   * @throws InterruptedException
   */
  public ServerName getMetaLocation() throws InterruptedException {
    return this.metaRegionTracker.getMetaRegionLocation();
  }

  /**
   * Checks whether meta regionserver znode has some non null data.
   * @return true if data is not null, false otherwise.
   */
  public boolean isMetaLocationAvailable() {
    return this.metaRegionTracker.isLocationAvailable();
  }

  /**
   * Gets the current location for <code>hbase:meta</code> if available and waits
   * for up to the specified timeout if not immediately available.  Never
   * returns null; throws if the timeout elapses before meta is available.
   * @param timeout maximum time to wait for meta availability, in milliseconds
   * @return {@link ServerName} for server hosting <code>hbase:meta</code>
   * @throws InterruptedException if interrupted while waiting
   * @throws NotAllMetaRegionsOnlineException if meta not available before
   * timeout
   */
  public ServerName waitForMeta(final long timeout)
  throws InterruptedException, NotAllMetaRegionsOnlineException {
    ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
    if (sn == null) {
      throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
    }
    return sn;
  }

  /**
   * Gets a connection to the server hosting meta, as reported by ZooKeeper,
   * waiting up to the specified timeout for availability.
   * @param timeout How long to wait on meta location
   * @see #waitForMeta for additional information
   * @return connection to server hosting meta
   * @throws InterruptedException
   * @throws NotAllMetaRegionsOnlineException if timed out waiting
   * @throws IOException
   * @deprecated Use {@link #getMetaServerConnection(long)}
   */
  @Deprecated
  public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
  throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
    return getMetaServerConnection(timeout);
  }

  /**
   * Gets a connection to the server hosting meta, as reported by ZooKeeper,
   * waiting up to the specified timeout for availability.
   * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
   * @param timeout How long to wait on meta location
   * @see #waitForMeta for additional information
   * @return connection to server hosting meta; may be null if the connection
   * attempt failed in one of the ways {@link #getCachedConnection} tolerates
   * @throws InterruptedException
   * @throws NotAllMetaRegionsOnlineException if timed out waiting
   * @throws IOException
   */
  AdminService.BlockingInterface getMetaServerConnection(long timeout)
  throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
    return getCachedConnection(waitForMeta(timeout));
  }

  /**
   * Waits indefinitely for availability of <code>hbase:meta</code>.  Used during
   * cluster startup.  Does not verify meta, just that something has been
   * set up in zk.  Returns early once {@link #stop()} has been called.
   * @see #waitForMeta(long)
   * @throws InterruptedException if interrupted while waiting
   */
  public void waitForMeta() throws InterruptedException {
    while (!this.stopped) {
      try {
        // Short waits so the stopped flag is rechecked regularly.
        if (waitForMeta(100) != null) {
          break;
        }
      } catch (NotAllMetaRegionsOnlineException e) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("hbase:meta still not available, sleeping and retrying." +
            " Reason: " + e.getMessage());
        }
      }
    }
  }

  /**
   * @param sn ServerName to get a connection against.
   * @return The AdminProtocol we got when we connected to <code>sn</code>
   * May have come from cache, may not be good, may have been setup by this
   * invocation, or may be null.
   * @throws IOException for failures other than the connectivity problems
   * handled below, which yield a null return instead
   */
  private AdminService.BlockingInterface getCachedConnection(ServerName sn)
  throws IOException {
    if (sn == null) {
      return null;
    }
    AdminService.BlockingInterface service = null;
    try {
      service = connection.getAdmin(sn);
    } catch (RetriesExhaustedException e) {
      if (e.getCause() instanceof ConnectException) {
        // Catch this; presume it means the cached connection has gone bad.
        LOG.debug("Retries exhausted connecting to " + sn, e);
      } else {
        throw e;
      }
    } catch (SocketTimeoutException e) {
      LOG.debug("Timed out connecting to " + sn);
    } catch (NoRouteToHostException e) {
      LOG.debug("Connecting to " + sn, e);
    } catch (SocketException e) {
      // Also covers ConnectException (connect refused), which is a
      // SocketException subclass and so never reaches the handler below.
      LOG.debug("Exception connecting to " + sn);
    } catch (UnknownHostException e) {
      LOG.debug("Unknown host exception connecting to " + sn);
    } catch (IOException ioe) {
      Throwable cause = ioe.getCause();
      if (cause instanceof EOFException) {
        // Catch.  Other end disconnected us.
        LOG.debug("Disconnected by " + sn, ioe);
      } else if (cause != null && cause.getMessage() != null &&
          cause.getMessage().toLowerCase().contains("connection reset")) {
        // Catch.  Connection reset.
        LOG.debug("Connection reset by " + sn, ioe);
      } else {
        throw ioe;
      }
    }
    return service;
  }

  /**
   * Verify we can connect to <code>hostingServer</code> and that it's carrying
   * <code>regionName</code>.
   * @param hostingServer Interface to the server hosting <code>regionName</code>
   * @param address The servername that goes with the <code>hostingServer</code>
   * Interface.  Used for logging only.
   * @param regionName The regionname we are interested in.
   * @return True if we were able to verify the region located at other side of
   * the Interface; false if <code>hostingServer</code> is null or the lookup
   * failed.
   * @throws IOException
   */
  // TODO: We should be able to get the ServerName from the AdminProtocol
  // rather than have to pass it in.  It's made awkward by the fact that the
  // HRI is likely a proxy against remote server so the getServerName needs
  // to be fixed to go to a local method or to a cache before we can do this.
  private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
      final ServerName address, final byte [] regionName)
  throws IOException {
    if (hostingServer == null) {
      LOG.info("Passed hostingServer is null");
      return false;
    }
    Throwable t;
    try {
      // Try and get regioninfo from the hosting server.
      return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
    } catch (ConnectException e) {
      t = e;
    } catch (RetriesExhaustedException e) {
      t = e;
    } catch (RemoteException e) {
      t = e.unwrapRemoteException();
    } catch (IOException e) {
      // Report the underlying cause for disconnects and resets, else the
      // exception itself.
      Throwable cause = e.getCause();
      if (cause instanceof EOFException) {
        t = cause;
      } else if (cause != null && cause.getMessage() != null
          && cause.getMessage().contains("Connection reset")) {
        t = cause;
      } else {
        t = e;
      }
    }
    LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
      " at address=" + address + ", exception=" + t);
    return false;
  }

  /**
   * Verify <code>hbase:meta</code> is deployed and accessible.
   * @param timeout How long to wait on zk for meta address (passed through to
   * the internal call to {@link #getMetaServerConnection(long)}.
   * @return True if the <code>hbase:meta</code> location is healthy.
   * @throws IOException
   * @throws InterruptedException
   */
  public boolean verifyMetaRegionLocation(final long timeout)
  throws InterruptedException, IOException {
    AdminService.BlockingInterface service = null;
    try {
      service = getMetaServerConnection(timeout);
    } catch (NotAllMetaRegionsOnlineException e) {
      // Pass
    } catch (ServerNotRunningYetException e) {
      // Pass -- remote server is not up so can't be carrying meta
    } catch (UnknownHostException e) {
      // Pass -- server name doesn't resolve so it can't be assigned anything.
    } catch (RegionServerStoppedException e) {
      // Pass -- server name sends us to a server that is dying or already dead.
    }
    return service != null &&
      verifyRegionLocation(service,
          this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
  }

  /**
   * @return the connection this tracker was constructed with; closed by
   * {@link #stop()}.
   */
  public HConnection getConnection() {
    return this.connection;
  }
}