View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.catalog;
19  
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  import org.apache.hadoop.classification.InterfaceAudience;
23  import org.apache.hadoop.conf.Configuration;
24  import org.apache.hadoop.hbase.Abortable;
25  import org.apache.hadoop.hbase.HRegionInfo;
26  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
27  import org.apache.hadoop.hbase.ServerName;
28  import org.apache.hadoop.hbase.client.HConnection;
29  import org.apache.hadoop.hbase.client.HConnectionManager;
30  import org.apache.hadoop.hbase.client.HTable;
31  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
32  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
33  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
34  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
35  import org.apache.hadoop.hbase.util.Bytes;
36  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
37  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
38  import org.apache.hadoop.ipc.RemoteException;
39  
40  import java.io.EOFException;
41  import java.io.IOException;
42  import java.net.ConnectException;
43  import java.net.NoRouteToHostException;
44  import java.net.SocketException;
45  import java.net.SocketTimeoutException;
46  import java.net.UnknownHostException;
47  
48  /**
49   * Tracks the availability of the catalog tables
50   * <code>.META.</code>.
51   *
52   * This class is "read-only" in that the locations of the catalog tables cannot
53   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
54   * and location of <code>.META.</code>.
55   *
56   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()} to
57   * interrupt waits and close up shop.
58   */
59  @InterfaceAudience.Private
60  public class CatalogTracker {
61    // TODO JDC 11/30 We don't even have ROOT anymore, revisit
62    // TODO: This class needs a rethink.  The original intent was that it would be
63    // the one-stop-shop for meta locations and that it would get this
64    // info from reading and watching zk state.  The class was to be used by
65    // servers when they needed to know of meta movement but also by
66    // client-side (inside in HTable) so rather than figure meta
67    // locations on fault, the client would instead get notifications out of zk.
68    //
69    // But this original intent is frustrated by the fact that this class has to
70    // read an hbase table, the -ROOT- table, to figure out the .META. region
71    // location which means we depend on an HConnection.  HConnection will do
72    // retrying but also, it has its own mechanism for finding root and meta
73    // locations (and for 'verifying'; it tries the location and if it fails, does
74    // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
75    // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
76    // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
77    // where we'd open a connection to zk, read what we need then let the
78    // connection go?).  The 'fix' is make it so both root and meta addresses
79    // are wholly up in zk -- not in zk (root) -- and in an hbase table (meta).
80    //
81    // But even then, this class does 'verification' of the location and it does
82    // this by making a call over an HConnection (which will do its own root
83    // and meta lookups).  Isn't this verification 'useless' since when we
84    // return, whatever is dependent on the result of this call then needs to
85    // use HConnection; what we have verified may change in meantime (HConnection
86    // uses the CT primitives, the root and meta trackers finding root locations).
87    //
88    // When meta is moved to zk, this class may make more sense.  In the
89    // meantime, it does not cohere.  It should just watch meta and root and
90    // NOT do verification -- let that be out in HConnection since it's going to
91    // be done there ultimately anyways.
92    //
93    // This class has spread throughout the codebase.  It needs to be reined in.
94    // This class should be used server-side only, even if we move meta location
95    // up into zk.  Currently it's used over in the client package. It's used in
96    // MetaReader and MetaEditor classes usually just to get the Configuration
97    // its using (It does this indirectly by asking its HConnection for its
98    // Configuration and even then this is just used to get an HConnection out on
99    // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
100   // doing CT fixup. St.Ack 09/30/2011.
101   //
102 
103   // TODO: Timeouts have never been as advertised in here and its worse now
104   // with retries; i.e. the HConnection retries and pause goes ahead whatever
105   // the passed timeout is.  Fix.
106   private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
107   private final HConnection connection;
108   private final ZooKeeperWatcher zookeeper;
109   private final MetaRegionTracker metaRegionTracker;
110   private boolean instantiatedzkw = false;
111   private Abortable abortable;
112 
113   private boolean stopped = false;
114 
115   static final byte [] META_REGION_NAME =
116     HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
117 
118   /**
119    * Constructs a catalog tracker. Find current state of catalog tables.
120    * Begin active tracking by executing {@link #start()} post construction. Does
121    * not timeout.
122    *
123    * @param conf
124    *          the {@link Configuration} from which a {@link HConnection} will be
125    *          obtained; if problem, this connections
126    *          {@link HConnection#abort(String, Throwable)} will be called.
127    * @throws IOException
128    */
129   public CatalogTracker(final Configuration conf) throws IOException {
130     this(null, conf, null);
131   }
132 
133   /**
134    * Constructs the catalog tracker.  Find current state of catalog tables.
135    * Begin active tracking by executing {@link #start()} post construction.
136    * Does not timeout.
137    * @param zk If zk is null, we'll create an instance (and shut it down
138    * when {@link #stop()} is called) else we'll use what is passed.
139    * @param conf
140    * @param abortable If fatal exception we'll call abort on this.  May be null.
141    * If it is we'll use the Connection associated with the passed
142    * {@link Configuration} as our Abortable.
143    * @throws IOException
144    */
145   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
146       Abortable abortable)
147   throws IOException {
148     this(zk, conf, HConnectionManager.getConnection(conf), abortable);
149   }
150 
151   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
152       HConnection connection, Abortable abortable)
153   throws IOException {
154     this.connection = connection;
155     if (abortable == null) {
156       // A connection is abortable.
157       this.abortable = this.connection;
158     }
159     Abortable throwableAborter = new Abortable() {
160 
161       @Override
162       public void abort(String why, Throwable e) {
163         throw new RuntimeException(why, e);
164       }
165 
166       @Override
167       public boolean isAborted() {
168         return true;
169       }
170 
171     };
172     if (zk == null) {
173       // Create our own.  Set flag so we tear it down on stop.
174       this.zookeeper =
175         new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
176           abortable);
177       instantiatedzkw = true;
178     } else {
179       this.zookeeper = zk;
180     }
181     this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
182   }
183 
184   /**
185    * Starts the catalog tracker.
186    * Determines current availability of catalog tables and ensures all further
187    * transitions of either region are tracked.
188    * @throws IOException
189    * @throws InterruptedException
190    */
191   public void start() throws IOException, InterruptedException {
192     LOG.debug("Starting catalog tracker " + this);
193     try {
194       this.metaRegionTracker.start();
195     } catch (RuntimeException e) {
196       Throwable t = e.getCause();
197       this.abortable.abort(e.getMessage(), t);
198       throw new IOException("Attempt to start meta tracker failed.", t);
199     }
200   }
201 
202   /**
203    * Stop working.
204    * Interrupts any ongoing waits.
205    */
206   public void stop() {
207     if (!this.stopped) {
208       LOG.debug("Stopping catalog tracker " + this);
209       this.stopped = true;
210       this.metaRegionTracker.stop();
211       try {
212         if (this.connection != null) {
213           this.connection.close();
214         }
215       } catch (IOException e) {
216         // Although the {@link Closeable} interface throws an {@link
217         // IOException}, in reality, the implementation would never do that.
218         LOG.error("Attempt to close catalog tracker's connection failed.", e);
219       }
220       if (this.instantiatedzkw) {
221         this.zookeeper.close();
222       }
223     }
224   }
225 
226   /**
227    * Gets the current location for <code>.META.</code> or null if location is
228    * not currently available.
229    * @return {@link ServerName} for server hosting <code>.META.</code> or null
230    * if none available
231    * @throws InterruptedException
232    */
233   public ServerName getMetaLocation() throws InterruptedException {
234     return this.metaRegionTracker.getMetaRegionLocation();
235   }
236 
237   /**
238    * Checks whether meta regionserver znode has some non null data.
239    * @return true if data is not null, false otherwise.
240    */
241   public boolean isMetaLocationAvailable() {
242     return this.metaRegionTracker.isLocationAvailable();
243   }
244   /**
245    * Gets the current location for <code>.META.</code> if available and waits
246    * for up to the specified timeout if not immediately available.  Returns null
247    * if the timeout elapses before root is available.
248    * @param timeout maximum time to wait for root availability, in milliseconds
249    * @return {@link ServerName} for server hosting <code>.META.</code> or null
250    * if none available
251    * @throws InterruptedException if interrupted while waiting
252    * @throws NotAllMetaRegionsOnlineException if meta not available before
253    * timeout
254    */
255   public ServerName waitForMeta(final long timeout)
256   throws InterruptedException, NotAllMetaRegionsOnlineException {
257     ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
258     if (sn == null) {
259       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
260     }
261     return sn;
262   }
263 
264   /**
265    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
266    * waiting up to the specified timeout for availability.
267    * @param timeout How long to wait on meta location
268    * @see #waitForMeta for additional information
269    * @return connection to server hosting meta
270    * @throws InterruptedException
271    * @throws NotAllMetaRegionsOnlineException if timed out waiting
272    * @throws IOException
273    * @deprecated Use #getMetaServerConnection(long)
274    */
275   public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
276   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
277     return getMetaServerConnection(timeout);
278   }
279 
280   /**
281    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
282    * waiting up to the specified timeout for availability.
283    * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
284    * @param timeout How long to wait on meta location
285    * @see #waitForMeta for additional information
286    * @return connection to server hosting meta
287    * @throws InterruptedException
288    * @throws NotAllMetaRegionsOnlineException if timed out waiting
289    * @throws IOException
290    */
291   AdminService.BlockingInterface getMetaServerConnection(long timeout)
292   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
293     return getCachedConnection(waitForMeta(timeout));
294   }
295 
296   /**
297    * Waits indefinitely for availability of <code>.META.</code>.  Used during
298    * cluster startup.  Does not verify meta, just that something has been
299    * set up in zk.
300    * @see #waitForMeta(long)
301    * @throws InterruptedException if interrupted while waiting
302    */
303   public void waitForMeta() throws InterruptedException {
304     while (!this.stopped) {
305       try {
306         if (waitForMeta(100) != null) break;
307       } catch (NotAllMetaRegionsOnlineException e) {
308         if (LOG.isTraceEnabled()) {
309           LOG.info(".META. still not available, sleeping and retrying." +
310           " Reason: " + e.getMessage());
311         }
312       }
313     }
314   }
315 
316   /**
317    * @param sn ServerName to get a connection against.
318    * @return The AdminProtocol we got when we connected to <code>sn</code>
319    * May have come from cache, may not be good, may have been setup by this
320    * invocation, or may be null.
321    * @throws IOException
322    */
323   private AdminService.BlockingInterface getCachedConnection(ServerName sn)
324   throws IOException {
325     if (sn == null) {
326       return null;
327     }
328     AdminService.BlockingInterface service = null;
329     try {
330       service = connection.getAdmin(sn);
331     } catch (RetriesExhaustedException e) {
332       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
333         // Catch this; presume it means the cached connection has gone bad.
334       } else {
335         throw e;
336       }
337     } catch (SocketTimeoutException e) {
338       LOG.debug("Timed out connecting to " + sn);
339     } catch (NoRouteToHostException e) {
340       LOG.debug("Connecting to " + sn, e);
341     } catch (SocketException e) {
342       LOG.debug("Exception connecting to " + sn);
343     } catch (UnknownHostException e) {
344       LOG.debug("Unknown host exception connecting to  " + sn);
345     } catch (IOException ioe) {
346       Throwable cause = ioe.getCause();
347       if (ioe instanceof ConnectException) {
348         // Catch. Connect refused.
349       } else if (cause != null && cause instanceof EOFException) {
350         // Catch. Other end disconnected us.
351       } else if (cause != null && cause.getMessage() != null &&
352         cause.getMessage().toLowerCase().contains("connection reset")) {
353         // Catch. Connection reset.
354       } else {
355         throw ioe;
356       }
357 
358     }
359     return service;
360   }
361 
362   /**
363    * Verify we can connect to <code>hostingServer</code> and that its carrying
364    * <code>regionName</code>.
365    * @param hostingServer Interface to the server hosting <code>regionName</code>
366    * @param address The servername that goes with the <code>metaServer</code>
367    * Interface.  Used logging.
368    * @param regionName The regionname we are interested in.
369    * @return True if we were able to verify the region located at other side of
370    * the Interface.
371    * @throws IOException
372    */
373   // TODO: We should be able to get the ServerName from the AdminProtocol
374   // rather than have to pass it in.  Its made awkward by the fact that the
375   // HRI is likely a proxy against remote server so the getServerName needs
376   // to be fixed to go to a local method or to a cache before we can do this.
377   private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
378       final ServerName address, final byte [] regionName)
379   throws IOException {
380     if (hostingServer == null) {
381       LOG.info("Passed hostingServer is null");
382       return false;
383     }
384     Throwable t = null;
385     try {
386       // Try and get regioninfo from the hosting server.
387       return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
388     } catch (ConnectException e) {
389       t = e;
390     } catch (RetriesExhaustedException e) {
391       t = e;
392     } catch (RemoteException e) {
393       IOException ioe = e.unwrapRemoteException();
394       t = ioe;
395     } catch (IOException e) {
396       Throwable cause = e.getCause();
397       if (cause != null && cause instanceof EOFException) {
398         t = cause;
399       } else if (cause != null && cause.getMessage() != null
400           && cause.getMessage().contains("Connection reset")) {
401         t = cause;
402       } else {
403         t = e;
404       }
405     }
406     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
407       " at address=" + address + ", exception=" + t);
408     return false;
409   }
410 
411   /**
412    * Verify <code>.META.</code> is deployed and accessible.
413    * @param timeout How long to wait on zk for meta address (passed through to
414    * the internal call to {@link #waitForMetaServerConnection(long)}.
415    * @return True if the <code>.META.</code> location is healthy.
416    * @throws IOException
417    * @throws InterruptedException
418    */
419   public boolean verifyMetaRegionLocation(final long timeout)
420   throws InterruptedException, IOException {
421     AdminService.BlockingInterface service = null;
422     try {
423       service = waitForMetaServerConnection(timeout);
424     } catch (NotAllMetaRegionsOnlineException e) {
425       // Pass
426     } catch (ServerNotRunningYetException e) {
427       // Pass -- remote server is not up so can't be carrying root
428     } catch (UnknownHostException e) {
429       // Pass -- server name doesn't resolve so it can't be assigned anything.
430     }
431     return (service == null)? false:
432       verifyRegionLocation(service,
433           this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
434   }
435 
436   public HConnection getConnection() {
437     return this.connection;
438   }
439 }