View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.catalog;
21  
22  import java.io.EOFException;
23  import java.io.IOException;
24  import java.net.ConnectException;
25  import java.net.NoRouteToHostException;
26  import java.net.SocketException;
27  import java.net.SocketTimeoutException;
28  import java.util.concurrent.atomic.AtomicBoolean;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.hadoop.hbase.Abortable;
33  import org.apache.hadoop.hbase.HRegionInfo;
34  import org.apache.hadoop.hbase.HServerAddress;
35  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
36  import org.apache.hadoop.hbase.NotServingRegionException;
37  import org.apache.hadoop.hbase.client.HConnection;
38  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
39  import org.apache.hadoop.hbase.ipc.HRegionInterface;
40  import org.apache.hadoop.hbase.util.Bytes;
41  import org.apache.hadoop.hbase.zookeeper.MetaNodeTracker;
42  import org.apache.hadoop.hbase.zookeeper.RootRegionTracker;
43  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
44  import org.apache.hadoop.ipc.RemoteException;
45  
46  /**
47   * Tracks the availability of the catalog tables <code>-ROOT-</code> and
48   * <code>.META.</code>.
49   * 
50   * This class is "read-only" in that the locations of the catalog tables cannot
51   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
52   * and location of <code>-ROOT-</code>.  <code>-ROOT-</code> is used to learn of
53   * the location of <code>.META.</code>  If not available in <code>-ROOT-</code>,
54   * ZooKeeper is used to monitor for a new location of <code>.META.</code>.
55   *
56   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()} to
57   * interrupt waits and close up shop.
58   */
59  public class CatalogTracker {
60    private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
61    private final HConnection connection;
62    private final ZooKeeperWatcher zookeeper;
63    private final RootRegionTracker rootRegionTracker;
64    private final MetaNodeTracker metaNodeTracker;
65    private final AtomicBoolean metaAvailable = new AtomicBoolean(false);
66    /**
67     * Do not clear this address once set.  Let it be cleared by
68     * {@link #setMetaLocation(HServerAddress)} only.  Its needed when we do
69     * server shutdown processing -- we need to know who had .META. last.  If you
70     * want to know if the address is good, rely on {@link #metaAvailable} value.
71     */
72    private HServerAddress metaLocation;
73    private final int defaultTimeout;
74    private boolean stopped = false;
75  
76    public static final byte [] ROOT_REGION =
77      HRegionInfo.ROOT_REGIONINFO.getRegionName();
78    public static final byte [] META_REGION =
79      HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
80  
/**
 * Builds a catalog tracker from a connection alone.  The ZooKeeper watcher is
 * taken from the connection, and the connection doubles as the abortable.
 * Tracking does not begin until {@link #start()} is invoked.  Uses a default
 * timeout of zero (no timeout).
 * @param connection Server connection; also passed as the abortable.
 * @throws IOException
 */
public CatalogTracker(final HConnection connection)
throws IOException {
  this(connection.getZooKeeperWatcher(), connection, connection);
}
92  
/**
 * Builds a catalog tracker with a default timeout of zero (no timeout).
 * Tracking does not begin until {@link #start()} is invoked.
 * @param zk ZooKeeper watcher handed to the underlying trackers
 * @param connection server connection
 * @param abortable handed to the underlying trackers
 * @throws IOException
 */
public CatalogTracker(
    final ZooKeeperWatcher zk,
    final HConnection connection,
    final Abortable abortable) throws IOException {
  this(zk, connection, abortable, 0);
}
107 
/**
 * Builds a catalog tracker.  Stores the collaborators and creates the root
 * region tracker and the meta node tracker; neither is started here, call
 * {@link #start()} for that.
 * @param zk ZooKeeper watcher handed to both trackers
 * @param connection server connection
 * @param abortable handed to both trackers
 * @param defaultTimeout Timeout used by the <code>...Default()</code> wait
 * methods.  Pass zero for no timeout ({@link Object#wait(long)} when passed a
 * <code>0</code> waits for ever).
 * @throws IOException
 */
public CatalogTracker(
    final ZooKeeperWatcher zk,
    final HConnection connection,
    final Abortable abortable,
    final int defaultTimeout) throws IOException {
  // Plain state first, so it is in place before 'this' is handed out below.
  this.connection = connection;
  this.defaultTimeout = defaultTimeout;
  this.zookeeper = zk;
  this.rootRegionTracker = new RootRegionTracker(zk, abortable);
  this.metaNodeTracker = new MetaNodeTracker(zk, this, abortable);
}
127 
128   /**
129    * Starts the catalog tracker.
130    * Determines current availability of catalog tables and ensures all further
131    * transitions of either region are tracked.
132    * @throws IOException
133    * @throws InterruptedException 
134    */
135   public void start() throws IOException, InterruptedException {
136     this.rootRegionTracker.start();
137     this.metaNodeTracker.start();
138     LOG.debug("Starting catalog tracker " + this);
139   }
140 
141   /**
142    * Stop working.
143    * Interrupts any ongoing waits.
144    */
145   public void stop() {
146     LOG.debug("Stopping catalog tracker " + this);
147     this.stopped = true;
148     this.rootRegionTracker.stop();
149     this.metaNodeTracker.stop();
150     // Call this and it will interrupt any ongoing waits on meta.
151     synchronized (this.metaAvailable) {
152       this.metaAvailable.notifyAll();
153     }
154   }
155 
156   /**
157    * Gets the current location for <code>-ROOT-</code> or null if location is
158    * not currently available.
159    * @return location of root, null if not available
160    * @throws InterruptedException 
161    */
162   public HServerAddress getRootLocation() throws InterruptedException {
163     return this.rootRegionTracker.getRootRegionLocation();
164   }
165 
166   /**
167    * @return Location of meta or null if not yet available.
168    */
169   public HServerAddress getMetaLocation() {
170     return this.metaLocation;
171   }
172 
173   /**
174    * Waits indefinitely for availability of <code>-ROOT-</code>.  Used during
175    * cluster startup.
176    * @throws InterruptedException if interrupted while waiting
177    */
178   public void waitForRoot()
179   throws InterruptedException {
180     this.rootRegionTracker.blockUntilAvailable();
181   }
182 
183   /**
184    * Gets the current location for <code>-ROOT-</code> if available and waits
185    * for up to the specified timeout if not immediately available.  Returns null
186    * if the timeout elapses before root is available.
187    * @param timeout maximum time to wait for root availability, in milliseconds
188    * @return location of root
189    * @throws InterruptedException if interrupted while waiting
190    * @throws NotAllMetaRegionsOnlineException if root not available before
191    *                                          timeout
192    */
193   HServerAddress waitForRoot(final long timeout)
194   throws InterruptedException, NotAllMetaRegionsOnlineException {
195     HServerAddress address = rootRegionTracker.waitRootRegionLocation(timeout);
196     if (address == null) {
197       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
198     }
199     return address;
200   }
201 
202   /**
203    * Gets a connection to the server hosting root, as reported by ZooKeeper,
204    * waiting up to the specified timeout for availability.
205    * @see #waitForRoot(long) for additional information
206    * @return connection to server hosting root
207    * @throws InterruptedException
208    * @throws NotAllMetaRegionsOnlineException if timed out waiting
209    * @throws IOException
210    */
211   public HRegionInterface waitForRootServerConnection(long timeout)
212   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
213     return getCachedConnection(waitForRoot(timeout));
214   }
215 
216   /**
217    * Gets a connection to the server hosting root, as reported by ZooKeeper,
218    * waiting for the default timeout specified on instantiation.
219    * @see #waitForRoot(long) for additional information
220    * @return connection to server hosting root
221    * @throws NotAllMetaRegionsOnlineException if timed out waiting
222    * @throws IOException
223    */
224   public HRegionInterface waitForRootServerConnectionDefault()
225   throws NotAllMetaRegionsOnlineException, IOException {
226     try {
227       return getCachedConnection(waitForRoot(defaultTimeout));
228     } catch (InterruptedException e) {
229       throw new NotAllMetaRegionsOnlineException("Interrupted");
230     }
231   }
232 
233   /**
234    * Gets a connection to the server hosting root, as reported by ZooKeeper,
235    * if available.  Returns null if no location is immediately available.
236    * @return connection to server hosting root, null if not available
237    * @throws IOException
238    * @throws InterruptedException 
239    */
240   private HRegionInterface getRootServerConnection()
241   throws IOException, InterruptedException {
242     HServerAddress address = this.rootRegionTracker.getRootRegionLocation();
243     if (address == null) {
244       return null;
245     }
246     return getCachedConnection(address);
247   }
248 
249   /**
250    * Gets a connection to the server currently hosting <code>.META.</code> or
251    * null if location is not currently available.
252    * <p>
253    * If a location is known, a connection to the cached location is returned.
254    * If refresh is true, the cached connection is verified first before
255    * returning.  If the connection is not valid, it is reset and rechecked.
256    * <p>
257    * If no location for meta is currently known, method checks ROOT for a new
258    * location, verifies META is currently there, and returns a cached connection
259    * to the server hosting META.
260    *
261    * @return connection to server hosting meta, null if location not available
262    * @throws IOException
263    * @throws InterruptedException 
264    */
265   private HRegionInterface getMetaServerConnection(boolean refresh)
266   throws IOException, InterruptedException {
267     synchronized (metaAvailable) {
268       if (metaAvailable.get()) {
269         HRegionInterface current = getCachedConnection(metaLocation);
270         if (!refresh) {
271           return current;
272         }
273         if (verifyRegionLocation(current, this.metaLocation, META_REGION)) {
274           return current;
275         }
276         resetMetaLocation();
277       }
278       HRegionInterface rootConnection = getRootServerConnection();
279       if (rootConnection == null) {
280         return null;
281       }
282       HServerAddress newLocation = MetaReader.readMetaLocation(rootConnection);
283       if (newLocation == null) {
284         return null;
285       }
286 
287       HRegionInterface newConnection = getCachedConnection(newLocation);
288       if (verifyRegionLocation(newConnection, this.metaLocation, META_REGION)) {
289         setMetaLocation(newLocation);
290         return newConnection;
291       }
292       return null;
293     }
294   }
295 
296   /**
297    * Waits indefinitely for availability of <code>.META.</code>.  Used during
298    * cluster startup.
299    * @throws InterruptedException if interrupted while waiting
300    */
301   public void waitForMeta() throws InterruptedException {
302     synchronized (metaAvailable) {
303       while (!stopped && !metaAvailable.get()) {
304         metaAvailable.wait();
305       }
306     }
307   }
308 
309   /**
310    * Gets the current location for <code>.META.</code> if available and waits
311    * for up to the specified timeout if not immediately available.  Throws an
312    * exception if timed out waiting.  This method differs from {@link #waitForMeta()}
313    * in that it will go ahead and verify the location gotten from ZooKeeper by
314    * trying to use returned connection.
315    * @param timeout maximum time to wait for meta availability, in milliseconds
316    * @return location of meta
317    * @throws InterruptedException if interrupted while waiting
318    * @throws IOException unexpected exception connecting to meta server
319    * @throws NotAllMetaRegionsOnlineException if meta not available before
320    *                                          timeout
321    */
322   public HServerAddress waitForMeta(long timeout)
323   throws InterruptedException, IOException, NotAllMetaRegionsOnlineException {
324     long stop = System.currentTimeMillis() + timeout;
325     synchronized (metaAvailable) {
326       while(!stopped && !metaAvailable.get() &&
327           (timeout == 0 || System.currentTimeMillis() < stop)) {
328         if (getMetaServerConnection(true) != null) {
329           return metaLocation;
330         }
331         metaAvailable.wait(timeout == 0 ? 50 : timeout);
332       }
333       if (getMetaServerConnection(true) == null) {
334         throw new NotAllMetaRegionsOnlineException(
335             "Timed out (" + timeout + "ms)");
336       }
337       return metaLocation;
338     }
339   }
340 
341   /**
342    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
343    * waiting up to the specified timeout for availability.
344    * @see #waitForMeta(long) for additional information
345    * @return connection to server hosting meta
346    * @throws InterruptedException
347    * @throws NotAllMetaRegionsOnlineException if timed out waiting
348    * @throws IOException
349    */
350   public HRegionInterface waitForMetaServerConnection(long timeout)
351   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
352     return getCachedConnection(waitForMeta(timeout));
353   }
354 
355   /**
356    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
357    * waiting up to the specified timeout for availability.
358    * @see #waitForMeta(long) for additional information
359    * @return connection to server hosting meta
360    * @throws NotAllMetaRegionsOnlineException if timed out or interrupted
361    * @throws IOException
362    */
363   public HRegionInterface waitForMetaServerConnectionDefault()
364   throws NotAllMetaRegionsOnlineException, IOException {
365     try {
366       return getCachedConnection(waitForMeta(defaultTimeout));
367     } catch (InterruptedException e) {
368       throw new NotAllMetaRegionsOnlineException("Interrupted");
369     }
370   }
371 
372   private void resetMetaLocation() {
373     LOG.info("Current cached META location is not valid, resetting");
374     this.metaAvailable.set(false);
375   }
376 
377   private void setMetaLocation(HServerAddress metaLocation) {
378     metaAvailable.set(true);
379     this.metaLocation = metaLocation;
380     // no synchronization because these are private and already under lock
381     metaAvailable.notifyAll();
382   }
383 
384   private HRegionInterface getCachedConnection(HServerAddress address)
385   throws IOException {
386     HRegionInterface protocol = null;
387     try {
388       protocol = connection.getHRegionConnection(address, false);
389     } catch (RetriesExhaustedException e) {
390       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
391         // Catch this; presume it means the cached connection has gone bad.
392       } else {
393         throw e;
394       }
395     } catch (SocketTimeoutException e) {
396       LOG.debug("Timed out connecting to " + address);
397     } catch (NoRouteToHostException e) {
398       LOG.debug("Connecting to " + address, e);
399     } catch (SocketException e) {
400       LOG.debug("Exception connecting to " + address);
401     } catch (IOException ioe) {
402       Throwable cause = ioe.getCause();
403       if (cause != null && cause instanceof EOFException) {
404         // Catch. Other end disconnected us.
405       } else if (cause != null && cause.getMessage() != null &&
406         cause.getMessage().toLowerCase().contains("connection reset")) {
407         // Catch. Connection reset.
408       } else {
409         throw ioe;
410       }
411       
412     }
413     return protocol;
414   }
415 
416   private boolean verifyRegionLocation(HRegionInterface metaServer,
417       final HServerAddress address,
418       byte [] regionName)
419   throws IOException {
420     if (metaServer == null) {
421       LOG.info("Passed metaserver is null");
422       return false;
423     }
424     Throwable t = null;
425     try {
426       return metaServer.getRegionInfo(regionName) != null;
427     } catch (ConnectException e) {
428       t = e;
429     } catch (RemoteException e) {
430       IOException ioe = e.unwrapRemoteException();
431       if (ioe instanceof NotServingRegionException) {
432         t = ioe;
433       } else {
434         throw e;
435       }
436     } catch (IOException e) {
437       Throwable cause = e.getCause();
438       if (cause != null && cause instanceof EOFException) {
439         t = cause;
440       } else if (cause != null && cause.getMessage() != null
441           && cause.getMessage().contains("Connection reset")) {
442         t = cause;
443       } else {
444         throw e;
445       }
446     }
447     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
448       " at address=" + address + "; " + t);
449     return false;
450   }
451 
452   /**
453    * Verify <code>-ROOT-</code> is deployed and accessible.
454    * @param timeout How long to wait on zk for root address (passed through to
455    * the internal call to {@link #waitForRootServerConnection(long)}.
456    * @return True if the <code>-ROOT-</code> location is healthy.
457    * @throws IOException
458    * @throws InterruptedException 
459    */
460   public boolean verifyRootRegionLocation(final long timeout)
461   throws InterruptedException, IOException {
462     HRegionInterface connection = null;
463     try {
464       connection = waitForRootServerConnection(timeout);
465     } catch (NotAllMetaRegionsOnlineException e) {
466       // Pass
467     } catch (org.apache.hadoop.hbase.ipc.ServerNotRunningException e) {
468       // Pass -- remote server is not up so can't be carrying root
469     } catch (IOException e) {
470       // Unexpected exception
471       throw e;
472     }
473     return (connection == null)? false:
474       verifyRegionLocation(connection,this.rootRegionTracker.getRootRegionLocation(),
475         HRegionInfo.ROOT_REGIONINFO.getRegionName());
476   }
477 
478   /**
479    * Verify <code>.META.</code> is deployed and accessible.
480    * @param timeout How long to wait on zk for <code>.META.</code> address
481    * (passed through to the internal call to {@link #waitForMetaServerConnection(long)}.
482    * @return True if the <code>.META.</code> location is healthy.
483    * @throws IOException Some unexpected IOE.
484    * @throws InterruptedException
485    */
486   public boolean verifyMetaRegionLocation(final long timeout)
487   throws InterruptedException, IOException {
488     return getMetaServerConnection(true) != null;
489   }
490 
491   MetaNodeTracker getMetaNodeTracker() {
492     return this.metaNodeTracker;
493   }
494 
495   public HConnection getConnection() {
496     return this.connection;
497   }
498 }