View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.catalog;
19  
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  import org.apache.hadoop.classification.InterfaceAudience;
23  import org.apache.hadoop.conf.Configuration;
24  import org.apache.hadoop.hbase.Abortable;
25  import org.apache.hadoop.hbase.HRegionInfo;
26  import org.apache.hadoop.hbase.ServerName;
27  import org.apache.hadoop.hbase.client.AdminProtocol;
28  import org.apache.hadoop.hbase.client.HConnection;
29  import org.apache.hadoop.hbase.client.HConnectionManager;
30  import org.apache.hadoop.hbase.client.HTable;
31  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
32  import org.apache.hadoop.hbase.exceptions.NotAllMetaRegionsOnlineException;
33  import org.apache.hadoop.hbase.exceptions.ServerNotRunningYetException;
34  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
35  import org.apache.hadoop.hbase.util.Bytes;
36  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
37  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
38  import org.apache.hadoop.ipc.RemoteException;
39  
40  import java.io.EOFException;
41  import java.io.IOException;
42  import java.net.ConnectException;
43  import java.net.NoRouteToHostException;
44  import java.net.SocketException;
45  import java.net.SocketTimeoutException;
46  import java.net.UnknownHostException;
47  
48  /**
49   * Tracks the availability of the catalog tables
50   * <code>.META.</code>.
51   *
52   * This class is "read-only" in that the locations of the catalog tables cannot
53   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
54   * and location of <code>.META.</code>.
55   *
56   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()}} to
57   * interrupt waits and close up shop.
58   */
59  @InterfaceAudience.Private
60  public class CatalogTracker {
61    // TODO JDC 11/30 We don't even have ROOT anymore, revisit
62    // TODO: This class needs a rethink.  The original intent was that it would be
63    // the one-stop-shop for meta locations and that it would get this
64    // info from reading and watching zk state.  The class was to be used by
65    // servers when they needed to know of meta movement but also by
66    // client-side (inside in HTable) so rather than figure meta
67    // locations on fault, the client would instead get notifications out of zk.
68    //
69    // But this original intent is frustrated by the fact that this class has to
70    // read an hbase table, the -ROOT- table, to figure out the .META. region
71    // location which means we depend on an HConnection.  HConnection will do
72    // retrying but also, it has its own mechanism for finding root and meta
73    // locations (and for 'verifying'; it tries the location and if it fails, does
74    // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
75    // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
76    // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
77    // where we'd open a connection to zk, read what we need then let the
78    // connection go?).  The 'fix' is make it so both root and meta addresses
79    // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta).
80    //
81    // But even then, this class does 'verification' of the location and it does
82    // this by making a call over an HConnection (which will do its own root
83    // and meta lookups).  Isn't this verification 'useless' since when we
84    // return, whatever is dependent on the result of this call then needs to
85    // use HConnection; what we have verified may change in meantime (HConnection
86    // uses the CT primitives, the root and meta trackers finding root locations).
87    //
88    // When meta is moved to zk, this class may make more sense.  In the
89    // meantime, it does not cohere.  It should just watch meta and root and not
90    // NOT do verification -- let that be out in HConnection since its going to
91    // be done there ultimately anyways.
92    //
93    // This class has spread throughout the codebase.  It needs to be reigned in.
94    // This class should be used server-side only, even if we move meta location
95    // up into zk.  Currently its used over in the client package. Its used in
96    // MetaReader and MetaEditor classes usually just to get the Configuration
97    // its using (It does this indirectly by asking its HConnection for its
98    // Configuration and even then this is just used to get an HConnection out on
99    // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
100   // doing CT fixup. St.Ack 09/30/2011.
101   //
102 
103   // TODO: Timeouts have never been as advertised in here and its worse now
104   // with retries; i.e. the HConnection retries and pause goes ahead whatever
105   // the passed timeout is.  Fix.
106   private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
107   private final HConnection connection;
108   private final ZooKeeperWatcher zookeeper;
109   private final MetaRegionTracker metaRegionTracker;
110   private boolean instantiatedzkw = false;
111   private Abortable abortable;
112 
113   private boolean stopped = false;
114 
115   static final byte [] META_REGION_NAME =
116     HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
117 
118   /**
119    * Constructs a catalog tracker. Find current state of catalog tables.
120    * Begin active tracking by executing {@link #start()} post construction. Does
121    * not timeout.
122    *
123    * @param conf
124    *          the {@link Configuration} from which a {@link HConnection} will be
125    *          obtained; if problem, this connections
126    *          {@link HConnection#abort(String, Throwable)} will be called.
127    * @throws IOException
128    */
129   public CatalogTracker(final Configuration conf) throws IOException {
130     this(null, conf, null);
131   }
132 
133   /**
134    * Constructs the catalog tracker.  Find current state of catalog tables.
135    * Begin active tracking by executing {@link #start()} post construction.
136    * Does not timeout.
137    * @param zk If zk is null, we'll create an instance (and shut it down
138    * when {@link #stop()} is called) else we'll use what is passed.
139    * @param conf
140    * @param abortable If fatal exception we'll call abort on this.  May be null.
141    * If it is we'll use the Connection associated with the passed
142    * {@link Configuration} as our Abortable.
143    * @throws IOException
144    */
145   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
146       Abortable abortable)
147   throws IOException {
148     this(zk, conf, HConnectionManager.getConnection(conf), abortable);
149   }
150 
151   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
152       HConnection connection, Abortable abortable)
153   throws IOException {
154     this.connection = connection;
155     if (abortable == null) {
156       // A connection is abortable.
157       this.abortable = this.connection;
158     }
159     Abortable throwableAborter = new Abortable() {
160 
161       @Override
162       public void abort(String why, Throwable e) {
163         throw new RuntimeException(why, e);
164       }
165 
166       @Override
167       public boolean isAborted() {
168         return true;
169       }
170 
171     };
172     if (zk == null) {
173       // Create our own.  Set flag so we tear it down on stop.
174       this.zookeeper =
175         new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
176           abortable);
177       instantiatedzkw = true;
178     } else {
179       this.zookeeper = zk;
180     }
181     this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
182   }
183 
184   /**
185    * Starts the catalog tracker.
186    * Determines current availability of catalog tables and ensures all further
187    * transitions of either region are tracked.
188    * @throws IOException
189    * @throws InterruptedException
190    */
191   public void start() throws IOException, InterruptedException {
192     LOG.debug("Starting catalog tracker " + this);
193     try {
194       this.metaRegionTracker.start();
195     } catch (RuntimeException e) {
196       Throwable t = e.getCause();
197       this.abortable.abort(e.getMessage(), t);
198       throw new IOException("Attempt to start meta tracker failed.", t);
199     }
200   }
201 
202   /**
203    * Stop working.
204    * Interrupts any ongoing waits.
205    */
206   public void stop() {
207     if (!this.stopped) {
208       LOG.debug("Stopping catalog tracker " + this);
209       this.stopped = true;
210       this.metaRegionTracker.stop();
211       try {
212         if (this.connection != null) {
213           this.connection.close();
214         }
215       } catch (IOException e) {
216         // Although the {@link Closeable} interface throws an {@link
217         // IOException}, in reality, the implementation would never do that.
218         LOG.error("Attempt to close catalog tracker's connection failed.", e);
219       }
220       if (this.instantiatedzkw) {
221         this.zookeeper.close();
222       }
223     }
224   }
225 
226   /**
227    * Gets the current location for <code>.META.</code> or null if location is
228    * not currently available.
229    * @return {@link ServerName} for server hosting <code>.META.</code> or null
230    * if none available
231    * @throws InterruptedException
232    */
233   public ServerName getMetaLocation() throws InterruptedException {
234     return this.metaRegionTracker.getMetaRegionLocation();
235   }
236 
237   /**
238    * Gets the current location for <code>.META.</code> if available and waits
239    * for up to the specified timeout if not immediately available.  Returns null
240    * if the timeout elapses before root is available.
241    * @param timeout maximum time to wait for root availability, in milliseconds
242    * @return {@link ServerName} for server hosting <code>.META.</code> or null
243    * if none available
244    * @throws InterruptedException if interrupted while waiting
245    * @throws NotAllMetaRegionsOnlineException if meta not available before
246    * timeout
247    */
248   public ServerName waitForMeta(final long timeout)
249   throws InterruptedException, NotAllMetaRegionsOnlineException {
250     ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
251     if (sn == null) {
252       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
253     }
254     return sn;
255   }
256 
257   /**
258    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
259    * waiting up to the specified timeout for availability.
260    * @param timeout How long to wait on meta location
261    * @see #waitForMeta for additional information
262    * @return connection to server hosting meta
263    * @throws InterruptedException
264    * @throws NotAllMetaRegionsOnlineException if timed out waiting
265    * @throws IOException
266    * @deprecated Use #getMetaServerConnection(long)
267    */
268   public AdminProtocol waitForMetaServerConnection(long timeout)
269   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
270     return getMetaServerConnection(timeout);
271   }
272 
273   /**
274    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
275    * waiting up to the specified timeout for availability.
276    * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
277    * @param timeout How long to wait on meta location
278    * @see #waitForMeta for additional information
279    * @return connection to server hosting meta
280    * @throws InterruptedException
281    * @throws NotAllMetaRegionsOnlineException if timed out waiting
282    * @throws IOException
283    */
284   AdminProtocol getMetaServerConnection(long timeout)
285   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
286     return getCachedConnection(waitForMeta(timeout));
287   }
288 
289   /**
290    * Waits indefinitely for availability of <code>.META.</code>.  Used during
291    * cluster startup.  Does not verify meta, just that something has been
292    * set up in zk.
293    * @see #waitForMeta(long)
294    * @throws InterruptedException if interrupted while waiting
295    */
296   public void waitForMeta() throws InterruptedException {
297     while (!this.stopped) {
298       try {
299         if (waitForMeta(100) != null) break;
300       } catch (NotAllMetaRegionsOnlineException e) {
301         if (LOG.isTraceEnabled()) {
302           LOG.info(".META. still not available, sleeping and retrying." +
303           " Reason: " + e.getMessage());
304         }
305       }
306     }
307   }
308 
309   /**
310    * @param sn ServerName to get a connection against.
311    * @return The AdminProtocol we got when we connected to <code>sn</code>
312    * May have come from cache, may not be good, may have been setup by this
313    * invocation, or may be null.
314    * @throws IOException
315    */
316   private AdminProtocol getCachedConnection(ServerName sn)
317   throws IOException {
318     if (sn == null) {
319       return null;
320     }
321     AdminProtocol protocol = null;
322     try {
323       protocol = connection.getAdmin(sn);
324     } catch (RetriesExhaustedException e) {
325       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
326         // Catch this; presume it means the cached connection has gone bad.
327       } else {
328         throw e;
329       }
330     } catch (SocketTimeoutException e) {
331       LOG.debug("Timed out connecting to " + sn);
332     } catch (NoRouteToHostException e) {
333       LOG.debug("Connecting to " + sn, e);
334     } catch (SocketException e) {
335       LOG.debug("Exception connecting to " + sn);
336     } catch (UnknownHostException e) {
337       LOG.debug("Unknown host exception connecting to  " + sn);
338     } catch (IOException ioe) {
339       Throwable cause = ioe.getCause();
340       if (ioe instanceof ConnectException) {
341         // Catch. Connect refused.
342       } else if (cause != null && cause instanceof EOFException) {
343         // Catch. Other end disconnected us.
344       } else if (cause != null && cause.getMessage() != null &&
345         cause.getMessage().toLowerCase().contains("connection reset")) {
346         // Catch. Connection reset.
347       } else {
348         throw ioe;
349       }
350 
351     }
352     return protocol;
353   }
354 
355   /**
356    * Verify we can connect to <code>hostingServer</code> and that its carrying
357    * <code>regionName</code>.
358    * @param hostingServer Interface to the server hosting <code>regionName</code>
359    * @param address The servername that goes with the <code>metaServer</code>
360    * Interface.  Used logging.
361    * @param regionName The regionname we are interested in.
362    * @return True if we were able to verify the region located at other side of
363    * the Interface.
364    * @throws IOException
365    */
366   // TODO: We should be able to get the ServerName from the AdminProtocol
367   // rather than have to pass it in.  Its made awkward by the fact that the
368   // HRI is likely a proxy against remote server so the getServerName needs
369   // to be fixed to go to a local method or to a cache before we can do this.
370   private boolean verifyRegionLocation(AdminProtocol hostingServer,
371       final ServerName address, final byte [] regionName)
372   throws IOException {
373     if (hostingServer == null) {
374       LOG.info("Passed hostingServer is null");
375       return false;
376     }
377     Throwable t = null;
378     try {
379       // Try and get regioninfo from the hosting server.
380       return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
381     } catch (ConnectException e) {
382       t = e;
383     } catch (RetriesExhaustedException e) {
384       t = e;
385     } catch (RemoteException e) {
386       IOException ioe = e.unwrapRemoteException();
387       t = ioe;
388     } catch (IOException e) {
389       Throwable cause = e.getCause();
390       if (cause != null && cause instanceof EOFException) {
391         t = cause;
392       } else if (cause != null && cause.getMessage() != null
393           && cause.getMessage().contains("Connection reset")) {
394         t = cause;
395       } else {
396         t = e;
397       }
398     }
399     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
400       " at address=" + address + ", exception=" + t);
401     return false;
402   }
403 
404   /**
405    * Verify <code>.META.</code> is deployed and accessible.
406    * @param timeout How long to wait on zk for meta address (passed through to
407    * the internal call to {@link #waitForMetaServerConnection(long)}.
408    * @return True if the <code>.META.</code> location is healthy.
409    * @throws IOException
410    * @throws InterruptedException
411    */
412   public boolean verifyMetaRegionLocation(final long timeout)
413   throws InterruptedException, IOException {
414     AdminProtocol connection = null;
415     try {
416       connection = waitForMetaServerConnection(timeout);
417     } catch (NotAllMetaRegionsOnlineException e) {
418       // Pass
419     } catch (ServerNotRunningYetException e) {
420       // Pass -- remote server is not up so can't be carrying root
421     } catch (UnknownHostException e) {
422       // Pass -- server name doesn't resolve so it can't be assigned anything.
423     }
424     return (connection == null)? false:
425       verifyRegionLocation(connection,
426           this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
427   }
428 
429   public HConnection getConnection() {
430     return this.connection;
431   }
432 }