View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.catalog;
19  
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  import org.apache.hadoop.classification.InterfaceAudience;
23  import org.apache.hadoop.conf.Configuration;
24  import org.apache.hadoop.hbase.Abortable;
25  import org.apache.hadoop.hbase.HRegionInfo;
26  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
27  import org.apache.hadoop.hbase.ServerName;
28  import org.apache.hadoop.hbase.client.HConnection;
29  import org.apache.hadoop.hbase.client.HConnectionManager;
30  import org.apache.hadoop.hbase.client.HTable;
31  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
32  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
33  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
34  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
35  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
36  import org.apache.hadoop.hbase.util.Bytes;
37  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
38  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
39  import org.apache.hadoop.ipc.RemoteException;
40  
41  import java.io.EOFException;
42  import java.io.IOException;
43  import java.net.ConnectException;
44  import java.net.NoRouteToHostException;
45  import java.net.SocketException;
46  import java.net.SocketTimeoutException;
47  import java.net.UnknownHostException;
48  
49  /**
50   * Tracks the availability of the catalog tables
51   * <code>hbase:meta</code>.
52   *
53   * This class is "read-only" in that the locations of the catalog tables cannot
54   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
55   * and location of <code>hbase:meta</code>.
56   *
57   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()}} to
58   * interrupt waits and close up shop.
59   */
60  @InterfaceAudience.Private
61  public class CatalogTracker {
62    // TODO JDC 11/30 We don't even have ROOT anymore, revisit
63    // TODO: This class needs a rethink.  The original intent was that it would be
64    // the one-stop-shop for meta locations and that it would get this
65    // info from reading and watching zk state.  The class was to be used by
66    // servers when they needed to know of meta movement but also by
67    // client-side (inside in HTable) so rather than figure meta
68    // locations on fault, the client would instead get notifications out of zk.
69    //
70    // But this original intent is frustrated by the fact that this class has to
71    // read an hbase table, the -ROOT- table, to figure out the hbase:meta region
72    // location which means we depend on an HConnection.  HConnection will do
73    // retrying but also, it has its own mechanism for finding root and meta
74    // locations (and for 'verifying'; it tries the location and if it fails, does
75    // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
76    // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
77    // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
78    // where we'd open a connection to zk, read what we need then let the
79    // connection go?).  The 'fix' is make it so both root and meta addresses
80    // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta).
81    //
82    // But even then, this class does 'verification' of the location and it does
83    // this by making a call over an HConnection (which will do its own root
84    // and meta lookups).  Isn't this verification 'useless' since when we
85    // return, whatever is dependent on the result of this call then needs to
86    // use HConnection; what we have verified may change in meantime (HConnection
87    // uses the CT primitives, the root and meta trackers finding root locations).
88    //
89    // When meta is moved to zk, this class may make more sense.  In the
90    // meantime, it does not cohere.  It should just watch meta and root and not
91    // NOT do verification -- let that be out in HConnection since its going to
92    // be done there ultimately anyways.
93    //
94    // This class has spread throughout the codebase.  It needs to be reigned in.
95    // This class should be used server-side only, even if we move meta location
96    // up into zk.  Currently its used over in the client package. Its used in
97    // MetaReader and MetaEditor classes usually just to get the Configuration
98    // its using (It does this indirectly by asking its HConnection for its
99    // Configuration and even then this is just used to get an HConnection out on
100   // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
101   // doing CT fixup. St.Ack 09/30/2011.
102   //
103 
104   // TODO: Timeouts have never been as advertised in here and its worse now
105   // with retries; i.e. the HConnection retries and pause goes ahead whatever
106   // the passed timeout is.  Fix.
107   private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
108   private final HConnection connection;
109   private final ZooKeeperWatcher zookeeper;
110   private final MetaRegionTracker metaRegionTracker;
111   private boolean instantiatedzkw = false;
112   private Abortable abortable;
113 
114   private boolean stopped = false;
115 
116   static final byte [] META_REGION_NAME =
117     HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
118 
119   /**
120    * Constructs a catalog tracker. Find current state of catalog tables.
121    * Begin active tracking by executing {@link #start()} post construction. Does
122    * not timeout.
123    *
124    * @param conf
125    *          the {@link Configuration} from which a {@link HConnection} will be
126    *          obtained; if problem, this connections
127    *          {@link HConnection#abort(String, Throwable)} will be called.
128    * @throws IOException
129    */
130   public CatalogTracker(final Configuration conf) throws IOException {
131     this(null, conf, null);
132   }
133 
134   /**
135    * Constructs the catalog tracker.  Find current state of catalog tables.
136    * Begin active tracking by executing {@link #start()} post construction.
137    * Does not timeout.
138    * @param zk If zk is null, we'll create an instance (and shut it down
139    * when {@link #stop()} is called) else we'll use what is passed.
140    * @param conf
141    * @param abortable If fatal exception we'll call abort on this.  May be null.
142    * If it is we'll use the Connection associated with the passed
143    * {@link Configuration} as our Abortable.
144    * @throws IOException
145    */
146   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
147       Abortable abortable)
148   throws IOException {
149     this(zk, conf, HConnectionManager.getConnection(conf), abortable);
150   }
151 
152   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
153       HConnection connection, Abortable abortable)
154   throws IOException {
155     this.connection = connection;
156     if (abortable == null) {
157       // A connection is abortable.
158       this.abortable = this.connection;
159     }
160     Abortable throwableAborter = new Abortable() {
161 
162       @Override
163       public void abort(String why, Throwable e) {
164         throw new RuntimeException(why, e);
165       }
166 
167       @Override
168       public boolean isAborted() {
169         return true;
170       }
171 
172     };
173     if (zk == null) {
174       // Create our own.  Set flag so we tear it down on stop.
175       this.zookeeper =
176         new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
177           abortable);
178       instantiatedzkw = true;
179     } else {
180       this.zookeeper = zk;
181     }
182     this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
183   }
184 
185   /**
186    * Starts the catalog tracker.
187    * Determines current availability of catalog tables and ensures all further
188    * transitions of either region are tracked.
189    * @throws IOException
190    * @throws InterruptedException
191    */
192   public void start() throws IOException, InterruptedException {
193     LOG.debug("Starting catalog tracker " + this);
194     try {
195       this.metaRegionTracker.start();
196     } catch (RuntimeException e) {
197       Throwable t = e.getCause();
198       this.abortable.abort(e.getMessage(), t);
199       throw new IOException("Attempt to start meta tracker failed.", t);
200     }
201   }
202 
203   /**
204    * Stop working.
205    * Interrupts any ongoing waits.
206    */
207   public void stop() {
208     if (!this.stopped) {
209       LOG.debug("Stopping catalog tracker " + this);
210       this.stopped = true;
211       this.metaRegionTracker.stop();
212       try {
213         if (this.connection != null) {
214           this.connection.close();
215         }
216       } catch (IOException e) {
217         // Although the {@link Closeable} interface throws an {@link
218         // IOException}, in reality, the implementation would never do that.
219         LOG.error("Attempt to close catalog tracker's connection failed.", e);
220       }
221       if (this.instantiatedzkw) {
222         this.zookeeper.close();
223       }
224     }
225   }
226 
227   /**
228    * Gets the current location for <code>hbase:meta</code> or null if location is
229    * not currently available.
230    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
231    * if none available
232    * @throws InterruptedException
233    */
234   public ServerName getMetaLocation() throws InterruptedException {
235     return this.metaRegionTracker.getMetaRegionLocation();
236   }
237 
238   /**
239    * Checks whether meta regionserver znode has some non null data.
240    * @return true if data is not null, false otherwise.
241    */
242   public boolean isMetaLocationAvailable() {
243     return this.metaRegionTracker.isLocationAvailable();
244   }
245   /**
246    * Gets the current location for <code>hbase:meta</code> if available and waits
247    * for up to the specified timeout if not immediately available.  Returns null
248    * if the timeout elapses before root is available.
249    * @param timeout maximum time to wait for root availability, in milliseconds
250    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
251    * if none available
252    * @throws InterruptedException if interrupted while waiting
253    * @throws NotAllMetaRegionsOnlineException if meta not available before
254    * timeout
255    */
256   public ServerName waitForMeta(final long timeout)
257   throws InterruptedException, NotAllMetaRegionsOnlineException {
258     ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
259     if (sn == null) {
260       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
261     }
262     return sn;
263   }
264 
265   /**
266    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
267    * waiting up to the specified timeout for availability.
268    * @param timeout How long to wait on meta location
269    * @see #waitForMeta for additional information
270    * @return connection to server hosting meta
271    * @throws InterruptedException
272    * @throws NotAllMetaRegionsOnlineException if timed out waiting
273    * @throws IOException
274    * @deprecated Use #getMetaServerConnection(long)
275    */
276   public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
277   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
278     return getMetaServerConnection(timeout);
279   }
280 
281   /**
282    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
283    * waiting up to the specified timeout for availability.
284    * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
285    * @param timeout How long to wait on meta location
286    * @see #waitForMeta for additional information
287    * @return connection to server hosting meta
288    * @throws InterruptedException
289    * @throws NotAllMetaRegionsOnlineException if timed out waiting
290    * @throws IOException
291    */
292   AdminService.BlockingInterface getMetaServerConnection(long timeout)
293   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
294     return getCachedConnection(waitForMeta(timeout));
295   }
296 
297   /**
298    * Waits indefinitely for availability of <code>hbase:meta</code>.  Used during
299    * cluster startup.  Does not verify meta, just that something has been
300    * set up in zk.
301    * @see #waitForMeta(long)
302    * @throws InterruptedException if interrupted while waiting
303    */
304   public void waitForMeta() throws InterruptedException {
305     while (!this.stopped) {
306       try {
307         if (waitForMeta(100) != null) break;
308       } catch (NotAllMetaRegionsOnlineException e) {
309         if (LOG.isTraceEnabled()) {
310           LOG.trace("hbase:meta still not available, sleeping and retrying." +
311           " Reason: " + e.getMessage());
312         }
313       }
314     }
315   }
316 
317   /**
318    * @param sn ServerName to get a connection against.
319    * @return The AdminProtocol we got when we connected to <code>sn</code>
320    * May have come from cache, may not be good, may have been setup by this
321    * invocation, or may be null.
322    * @throws IOException
323    */
324   private AdminService.BlockingInterface getCachedConnection(ServerName sn)
325   throws IOException {
326     if (sn == null) {
327       return null;
328     }
329     AdminService.BlockingInterface service = null;
330     try {
331       service = connection.getAdmin(sn);
332     } catch (RetriesExhaustedException e) {
333       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
334         // Catch this; presume it means the cached connection has gone bad.
335       } else {
336         throw e;
337       }
338     } catch (SocketTimeoutException e) {
339       LOG.debug("Timed out connecting to " + sn);
340     } catch (NoRouteToHostException e) {
341       LOG.debug("Connecting to " + sn, e);
342     } catch (SocketException e) {
343       LOG.debug("Exception connecting to " + sn);
344     } catch (UnknownHostException e) {
345       LOG.debug("Unknown host exception connecting to  " + sn);
346     } catch (IOException ioe) {
347       Throwable cause = ioe.getCause();
348       if (ioe instanceof ConnectException) {
349         // Catch. Connect refused.
350       } else if (cause != null && cause instanceof EOFException) {
351         // Catch. Other end disconnected us.
352       } else if (cause != null && cause.getMessage() != null &&
353         cause.getMessage().toLowerCase().contains("connection reset")) {
354         // Catch. Connection reset.
355       } else {
356         throw ioe;
357       }
358 
359     }
360     return service;
361   }
362 
363   /**
364    * Verify we can connect to <code>hostingServer</code> and that its carrying
365    * <code>regionName</code>.
366    * @param hostingServer Interface to the server hosting <code>regionName</code>
367    * @param address The servername that goes with the <code>metaServer</code>
368    * Interface.  Used logging.
369    * @param regionName The regionname we are interested in.
370    * @return True if we were able to verify the region located at other side of
371    * the Interface.
372    * @throws IOException
373    */
374   // TODO: We should be able to get the ServerName from the AdminProtocol
375   // rather than have to pass it in.  Its made awkward by the fact that the
376   // HRI is likely a proxy against remote server so the getServerName needs
377   // to be fixed to go to a local method or to a cache before we can do this.
378   private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
379       final ServerName address, final byte [] regionName)
380   throws IOException {
381     if (hostingServer == null) {
382       LOG.info("Passed hostingServer is null");
383       return false;
384     }
385     Throwable t = null;
386     try {
387       // Try and get regioninfo from the hosting server.
388       return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
389     } catch (ConnectException e) {
390       t = e;
391     } catch (RetriesExhaustedException e) {
392       t = e;
393     } catch (RemoteException e) {
394       IOException ioe = e.unwrapRemoteException();
395       t = ioe;
396     } catch (IOException e) {
397       Throwable cause = e.getCause();
398       if (cause != null && cause instanceof EOFException) {
399         t = cause;
400       } else if (cause != null && cause.getMessage() != null
401           && cause.getMessage().contains("Connection reset")) {
402         t = cause;
403       } else {
404         t = e;
405       }
406     }
407     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
408       " at address=" + address + ", exception=" + t);
409     return false;
410   }
411 
412   /**
413    * Verify <code>hbase:meta</code> is deployed and accessible.
414    * @param timeout How long to wait on zk for meta address (passed through to
415    * the internal call to {@link #waitForMetaServerConnection(long)}.
416    * @return True if the <code>hbase:meta</code> location is healthy.
417    * @throws IOException
418    * @throws InterruptedException
419    */
420   public boolean verifyMetaRegionLocation(final long timeout)
421   throws InterruptedException, IOException {
422     AdminService.BlockingInterface service = null;
423     try {
424       service = waitForMetaServerConnection(timeout);
425     } catch (NotAllMetaRegionsOnlineException e) {
426       // Pass
427     } catch (ServerNotRunningYetException e) {
428       // Pass -- remote server is not up so can't be carrying root
429     } catch (UnknownHostException e) {
430       // Pass -- server name doesn't resolve so it can't be assigned anything.
431     } catch (RegionServerStoppedException e) {
432       // Pass -- server name sends us to a server that is dying or already dead.
433     }
434     return (service == null)? false:
435       verifyRegionLocation(service,
436           this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
437   }
438 
439   public HConnection getConnection() {
440     return this.connection;
441   }
442 }