1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 package org.apache.hadoop.hbase.catalog; 19 20 import org.apache.commons.logging.Log; 21 import org.apache.commons.logging.LogFactory; 22 import org.apache.hadoop.classification.InterfaceAudience; 23 import org.apache.hadoop.conf.Configuration; 24 import org.apache.hadoop.hbase.Abortable; 25 import org.apache.hadoop.hbase.HRegionInfo; 26 import org.apache.hadoop.hbase.ServerName; 27 import org.apache.hadoop.hbase.client.AdminProtocol; 28 import org.apache.hadoop.hbase.client.HConnection; 29 import org.apache.hadoop.hbase.client.HConnectionManager; 30 import org.apache.hadoop.hbase.client.HTable; 31 import org.apache.hadoop.hbase.client.RetriesExhaustedException; 32 import org.apache.hadoop.hbase.exceptions.NotAllMetaRegionsOnlineException; 33 import org.apache.hadoop.hbase.exceptions.ServerNotRunningYetException; 34 import org.apache.hadoop.hbase.protobuf.ProtobufUtil; 35 import org.apache.hadoop.hbase.util.Bytes; 36 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker; 37 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; 38 import org.apache.hadoop.ipc.RemoteException; 39 40 import java.io.EOFException; 41 import java.io.IOException; 42 import java.net.ConnectException; 43 import java.net.NoRouteToHostException; 44 import java.net.SocketException; 45 import java.net.SocketTimeoutException; 46 import java.net.UnknownHostException; 47 48 /** 49 * Tracks the availability of the catalog tables 50 * <code>.META.</code>. 51 * 52 * This class is "read-only" in that the locations of the catalog tables cannot 53 * be explicitly set. Instead, ZooKeeper is used to learn of the availability 54 * and location of <code>.META.</code>. 55 * 56 * <p>Call {@link #start()} to start up operation. Call {@link #stop()}} to 57 * interrupt waits and close up shop. 58 */ 59 @InterfaceAudience.Private 60 public class CatalogTracker { 61 // TODO JDC 11/30 We don't even have ROOT anymore, revisit 62 // TODO: This class needs a rethink. The original intent was that it would be 63 // the one-stop-shop for meta locations and that it would get this 64 // info from reading and watching zk state. The class was to be used by 65 // servers when they needed to know of meta movement but also by 66 // client-side (inside in HTable) so rather than figure meta 67 // locations on fault, the client would instead get notifications out of zk. 68 // 69 // But this original intent is frustrated by the fact that this class has to 70 // read an hbase table, the -ROOT- table, to figure out the .META. region 71 // location which means we depend on an HConnection. HConnection will do 72 // retrying but also, it has its own mechanism for finding root and meta 73 // locations (and for 'verifying'; it tries the location and if it fails, does 74 // new lookup, etc.). So, at least for now, HConnection (or HTable) can't 75 // have a CT since CT needs a HConnection (Even then, do want HT to have a CT? 76 // For HT keep up a session with ZK? Rather, shouldn't we do like asynchbase 77 // where we'd open a connection to zk, read what we need then let the 78 // connection go?). The 'fix' is make it so both root and meta addresses 79 // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta). 80 // 81 // But even then, this class does 'verification' of the location and it does 82 // this by making a call over an HConnection (which will do its own root 83 // and meta lookups). Isn't this verification 'useless' since when we 84 // return, whatever is dependent on the result of this call then needs to 85 // use HConnection; what we have verified may change in meantime (HConnection 86 // uses the CT primitives, the root and meta trackers finding root locations). 87 // 88 // When meta is moved to zk, this class may make more sense. In the 89 // meantime, it does not cohere. It should just watch meta and root and not 90 // NOT do verification -- let that be out in HConnection since its going to 91 // be done there ultimately anyways. 92 // 93 // This class has spread throughout the codebase. It needs to be reigned in. 94 // This class should be used server-side only, even if we move meta location 95 // up into zk. Currently its used over in the client package. Its used in 96 // MetaReader and MetaEditor classes usually just to get the Configuration 97 // its using (It does this indirectly by asking its HConnection for its 98 // Configuration and even then this is just used to get an HConnection out on 99 // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for 100 // doing CT fixup. St.Ack 09/30/2011. 101 // 102 103 // TODO: Timeouts have never been as advertised in here and its worse now 104 // with retries; i.e. the HConnection retries and pause goes ahead whatever 105 // the passed timeout is. Fix. 106 private static final Log LOG = LogFactory.getLog(CatalogTracker.class); 107 private final HConnection connection; 108 private final ZooKeeperWatcher zookeeper; 109 private final MetaRegionTracker metaRegionTracker; 110 private boolean instantiatedzkw = false; 111 private Abortable abortable; 112 113 private boolean stopped = false; 114 115 static final byte [] META_REGION_NAME = 116 HRegionInfo.FIRST_META_REGIONINFO.getRegionName(); 117 118 /** 119 * Constructs a catalog tracker. Find current state of catalog tables. 120 * Begin active tracking by executing {@link #start()} post construction. Does 121 * not timeout. 122 * 123 * @param conf 124 * the {@link Configuration} from which a {@link HConnection} will be 125 * obtained; if problem, this connections 126 * {@link HConnection#abort(String, Throwable)} will be called. 127 * @throws IOException 128 */ 129 public CatalogTracker(final Configuration conf) throws IOException { 130 this(null, conf, null); 131 } 132 133 /** 134 * Constructs the catalog tracker. Find current state of catalog tables. 135 * Begin active tracking by executing {@link #start()} post construction. 136 * Does not timeout. 137 * @param zk If zk is null, we'll create an instance (and shut it down 138 * when {@link #stop()} is called) else we'll use what is passed. 139 * @param conf 140 * @param abortable If fatal exception we'll call abort on this. May be null. 141 * If it is we'll use the Connection associated with the passed 142 * {@link Configuration} as our Abortable. 143 * @throws IOException 144 */ 145 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf, 146 Abortable abortable) 147 throws IOException { 148 this(zk, conf, HConnectionManager.getConnection(conf), abortable); 149 } 150 151 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf, 152 HConnection connection, Abortable abortable) 153 throws IOException { 154 this.connection = connection; 155 if (abortable == null) { 156 // A connection is abortable. 157 this.abortable = this.connection; 158 } 159 Abortable throwableAborter = new Abortable() { 160 161 @Override 162 public void abort(String why, Throwable e) { 163 throw new RuntimeException(why, e); 164 } 165 166 @Override 167 public boolean isAborted() { 168 return true; 169 } 170 171 }; 172 if (zk == null) { 173 // Create our own. Set flag so we tear it down on stop. 174 this.zookeeper = 175 new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(), 176 abortable); 177 instantiatedzkw = true; 178 } else { 179 this.zookeeper = zk; 180 } 181 this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter); 182 } 183 184 /** 185 * Starts the catalog tracker. 186 * Determines current availability of catalog tables and ensures all further 187 * transitions of either region are tracked. 188 * @throws IOException 189 * @throws InterruptedException 190 */ 191 public void start() throws IOException, InterruptedException { 192 LOG.debug("Starting catalog tracker " + this); 193 try { 194 this.metaRegionTracker.start(); 195 } catch (RuntimeException e) { 196 Throwable t = e.getCause(); 197 this.abortable.abort(e.getMessage(), t); 198 throw new IOException("Attempt to start meta tracker failed.", t); 199 } 200 } 201 202 /** 203 * Stop working. 204 * Interrupts any ongoing waits. 205 */ 206 public void stop() { 207 if (!this.stopped) { 208 LOG.debug("Stopping catalog tracker " + this); 209 this.stopped = true; 210 this.metaRegionTracker.stop(); 211 try { 212 if (this.connection != null) { 213 this.connection.close(); 214 } 215 } catch (IOException e) { 216 // Although the {@link Closeable} interface throws an {@link 217 // IOException}, in reality, the implementation would never do that. 218 LOG.error("Attempt to close catalog tracker's connection failed.", e); 219 } 220 if (this.instantiatedzkw) { 221 this.zookeeper.close(); 222 } 223 } 224 } 225 226 /** 227 * Gets the current location for <code>.META.</code> or null if location is 228 * not currently available. 229 * @return {@link ServerName} for server hosting <code>.META.</code> or null 230 * if none available 231 * @throws InterruptedException 232 */ 233 public ServerName getMetaLocation() throws InterruptedException { 234 return this.metaRegionTracker.getMetaRegionLocation(); 235 } 236 237 /** 238 * Gets the current location for <code>.META.</code> if available and waits 239 * for up to the specified timeout if not immediately available. Returns null 240 * if the timeout elapses before root is available. 241 * @param timeout maximum time to wait for root availability, in milliseconds 242 * @return {@link ServerName} for server hosting <code>.META.</code> or null 243 * if none available 244 * @throws InterruptedException if interrupted while waiting 245 * @throws NotAllMetaRegionsOnlineException if meta not available before 246 * timeout 247 */ 248 public ServerName waitForMeta(final long timeout) 249 throws InterruptedException, NotAllMetaRegionsOnlineException { 250 ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout); 251 if (sn == null) { 252 throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms"); 253 } 254 return sn; 255 } 256 257 /** 258 * Gets a connection to the server hosting meta, as reported by ZooKeeper, 259 * waiting up to the specified timeout for availability. 260 * @param timeout How long to wait on meta location 261 * @see #waitForMeta for additional information 262 * @return connection to server hosting meta 263 * @throws InterruptedException 264 * @throws NotAllMetaRegionsOnlineException if timed out waiting 265 * @throws IOException 266 * @deprecated Use #getMetaServerConnection(long) 267 */ 268 public AdminProtocol waitForMetaServerConnection(long timeout) 269 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException { 270 return getMetaServerConnection(timeout); 271 } 272 273 /** 274 * Gets a connection to the server hosting meta, as reported by ZooKeeper, 275 * waiting up to the specified timeout for availability. 276 * <p>WARNING: Does not retry. Use an {@link HTable} instead. 277 * @param timeout How long to wait on meta location 278 * @see #waitForMeta for additional information 279 * @return connection to server hosting meta 280 * @throws InterruptedException 281 * @throws NotAllMetaRegionsOnlineException if timed out waiting 282 * @throws IOException 283 */ 284 AdminProtocol getMetaServerConnection(long timeout) 285 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException { 286 return getCachedConnection(waitForMeta(timeout)); 287 } 288 289 /** 290 * Waits indefinitely for availability of <code>.META.</code>. Used during 291 * cluster startup. Does not verify meta, just that something has been 292 * set up in zk. 293 * @see #waitForMeta(long) 294 * @throws InterruptedException if interrupted while waiting 295 */ 296 public void waitForMeta() throws InterruptedException { 297 while (!this.stopped) { 298 try { 299 if (waitForMeta(100) != null) break; 300 } catch (NotAllMetaRegionsOnlineException e) { 301 if (LOG.isTraceEnabled()) { 302 LOG.info(".META. still not available, sleeping and retrying." + 303 " Reason: " + e.getMessage()); 304 } 305 } 306 } 307 } 308 309 /** 310 * @param sn ServerName to get a connection against. 311 * @return The AdminProtocol we got when we connected to <code>sn</code> 312 * May have come from cache, may not be good, may have been setup by this 313 * invocation, or may be null. 314 * @throws IOException 315 */ 316 private AdminProtocol getCachedConnection(ServerName sn) 317 throws IOException { 318 if (sn == null) { 319 return null; 320 } 321 AdminProtocol protocol = null; 322 try { 323 protocol = connection.getAdmin(sn); 324 } catch (RetriesExhaustedException e) { 325 if (e.getCause() != null && e.getCause() instanceof ConnectException) { 326 // Catch this; presume it means the cached connection has gone bad. 327 } else { 328 throw e; 329 } 330 } catch (SocketTimeoutException e) { 331 LOG.debug("Timed out connecting to " + sn); 332 } catch (NoRouteToHostException e) { 333 LOG.debug("Connecting to " + sn, e); 334 } catch (SocketException e) { 335 LOG.debug("Exception connecting to " + sn); 336 } catch (UnknownHostException e) { 337 LOG.debug("Unknown host exception connecting to " + sn); 338 } catch (IOException ioe) { 339 Throwable cause = ioe.getCause(); 340 if (ioe instanceof ConnectException) { 341 // Catch. Connect refused. 342 } else if (cause != null && cause instanceof EOFException) { 343 // Catch. Other end disconnected us. 344 } else if (cause != null && cause.getMessage() != null && 345 cause.getMessage().toLowerCase().contains("connection reset")) { 346 // Catch. Connection reset. 347 } else { 348 throw ioe; 349 } 350 351 } 352 return protocol; 353 } 354 355 /** 356 * Verify we can connect to <code>hostingServer</code> and that its carrying 357 * <code>regionName</code>. 358 * @param hostingServer Interface to the server hosting <code>regionName</code> 359 * @param address The servername that goes with the <code>metaServer</code> 360 * Interface. Used logging. 361 * @param regionName The regionname we are interested in. 362 * @return True if we were able to verify the region located at other side of 363 * the Interface. 364 * @throws IOException 365 */ 366 // TODO: We should be able to get the ServerName from the AdminProtocol 367 // rather than have to pass it in. Its made awkward by the fact that the 368 // HRI is likely a proxy against remote server so the getServerName needs 369 // to be fixed to go to a local method or to a cache before we can do this. 370 private boolean verifyRegionLocation(AdminProtocol hostingServer, 371 final ServerName address, final byte [] regionName) 372 throws IOException { 373 if (hostingServer == null) { 374 LOG.info("Passed hostingServer is null"); 375 return false; 376 } 377 Throwable t = null; 378 try { 379 // Try and get regioninfo from the hosting server. 380 return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null; 381 } catch (ConnectException e) { 382 t = e; 383 } catch (RetriesExhaustedException e) { 384 t = e; 385 } catch (RemoteException e) { 386 IOException ioe = e.unwrapRemoteException(); 387 t = ioe; 388 } catch (IOException e) { 389 Throwable cause = e.getCause(); 390 if (cause != null && cause instanceof EOFException) { 391 t = cause; 392 } else if (cause != null && cause.getMessage() != null 393 && cause.getMessage().contains("Connection reset")) { 394 t = cause; 395 } else { 396 t = e; 397 } 398 } 399 LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) + 400 " at address=" + address + ", exception=" + t); 401 return false; 402 } 403 404 /** 405 * Verify <code>.META.</code> is deployed and accessible. 406 * @param timeout How long to wait on zk for meta address (passed through to 407 * the internal call to {@link #waitForMetaServerConnection(long)}. 408 * @return True if the <code>.META.</code> location is healthy. 409 * @throws IOException 410 * @throws InterruptedException 411 */ 412 public boolean verifyMetaRegionLocation(final long timeout) 413 throws InterruptedException, IOException { 414 AdminProtocol connection = null; 415 try { 416 connection = waitForMetaServerConnection(timeout); 417 } catch (NotAllMetaRegionsOnlineException e) { 418 // Pass 419 } catch (ServerNotRunningYetException e) { 420 // Pass -- remote server is not up so can't be carrying root 421 } catch (UnknownHostException e) { 422 // Pass -- server name doesn't resolve so it can't be assigned anything. 423 } 424 return (connection == null)? false: 425 verifyRegionLocation(connection, 426 this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME); 427 } 428 429 public HConnection getConnection() { 430 return this.connection; 431 } 432 }