View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.catalog;
21  
22  import java.io.EOFException;
23  import java.io.IOException;
24  import java.net.ConnectException;
25  import java.net.SocketTimeoutException;
26  import java.net.SocketException;
27  import java.util.concurrent.atomic.AtomicBoolean;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.hbase.Abortable;
32  import org.apache.hadoop.hbase.HRegionInfo;
33  import org.apache.hadoop.hbase.HServerAddress;
34  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
35  import org.apache.hadoop.hbase.NotServingRegionException;
36  import org.apache.hadoop.hbase.client.HConnection;
37  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
38  import org.apache.hadoop.hbase.ipc.HRegionInterface;
39  import org.apache.hadoop.hbase.util.Bytes;
40  import org.apache.hadoop.hbase.zookeeper.MetaNodeTracker;
41  import org.apache.hadoop.hbase.zookeeper.RootRegionTracker;
42  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
43  import org.apache.hadoop.ipc.RemoteException;
44  
45  /**
46   * Tracks the availability of the catalog tables <code>-ROOT-</code> and
47   * <code>.META.</code>.
48   * 
49   * This class is "read-only" in that the locations of the catalog tables cannot
50   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
51   * and location of <code>-ROOT-</code>.  <code>-ROOT-</code> is used to learn of
52   * the location of <code>.META.</code>  If not available in <code>-ROOT-</code>,
53   * ZooKeeper is used to monitor for a new location of <code>.META.</code>.
54   *
 * <p>Call {@link #start()} to start up operation.  Call {@link #stop()} to
 * interrupt waits and close up shop.
57   */
58  public class CatalogTracker {
59    private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
60    private final HConnection connection;
61    private final ZooKeeperWatcher zookeeper;
62    private final RootRegionTracker rootRegionTracker;
63    private final MetaNodeTracker metaNodeTracker;
64    private final AtomicBoolean metaAvailable = new AtomicBoolean(false);
65    /**
66     * Do not clear this address once set.  Let it be cleared by
67     * {@link #setMetaLocation(HServerAddress)} only.  Its needed when we do
68     * server shutdown processing -- we need to know who had .META. last.  If you
69     * want to know if the address is good, rely on {@link #metaAvailable} value.
70     */
71    private HServerAddress metaLocation;
72    private final int defaultTimeout;
73    private boolean stopped = false;
74  
75    public static final byte [] ROOT_REGION =
76      HRegionInfo.ROOT_REGIONINFO.getRegionName();
77    public static final byte [] META_REGION =
78      HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
79  
80    /**
81     * Constructs a catalog tracker.  Find current state of catalog tables and
82     * begin active tracking by executing {@link #start()} post construction.
83     * Does not timeout.
84     * @param connection Server connection; if problem, this connections
85     * {@link HConnection#abort(String, Throwable)} will be called.
86     * @throws IOException 
87     */
88    public CatalogTracker(final HConnection connection) throws IOException {
89      this(connection.getZooKeeperWatcher(), connection, connection);
90    }
91  
92    /**
93     * Constructs the catalog tracker.  Find current state of catalog tables and
94     * begin active tracking by executing {@link #start()} post construction.
95     * Does not timeout.
96     * @param zk
97     * @param connection server connection
98     * @param abortable if fatal exception
99     * @throws IOException 
100    */
101   public CatalogTracker(final ZooKeeperWatcher zk, final HConnection connection,
102       final Abortable abortable)
103   throws IOException {
104     this(zk, connection, abortable, 0);
105   }
106 
107   /**
108    * Constructs the catalog tracker.  Find current state of catalog tables and
109    * begin active tracking by executing {@link #start()} post construction.
110    * @param zk
111    * @param connection server connection
112    * @param abortable if fatal exception
113    * @param defaultTimeout Timeout to use.  Pass zero for no timeout
114    * ({@link Object#wait(long)} when passed a <code>0</code> waits for ever).
115    * @throws IOException 
116    */
117   public CatalogTracker(final ZooKeeperWatcher zk, final HConnection connection,
118       final Abortable abortable, final int defaultTimeout)
119   throws IOException {
120     this.zookeeper = zk;
121     this.connection = connection;
122     this.rootRegionTracker = new RootRegionTracker(zookeeper, abortable);
123     this.metaNodeTracker = new MetaNodeTracker(zookeeper, this, abortable);
124     this.defaultTimeout = defaultTimeout;
125   }
126 
127   /**
128    * Starts the catalog tracker.
129    * Determines current availability of catalog tables and ensures all further
130    * transitions of either region are tracked.
131    * @throws IOException
132    * @throws InterruptedException 
133    */
134   public void start() throws IOException, InterruptedException {
135     this.rootRegionTracker.start();
136     this.metaNodeTracker.start();
137     LOG.debug("Starting catalog tracker " + this);
138   }
139 
140   /**
141    * Stop working.
142    * Interrupts any ongoing waits.
143    */
144   public void stop() {
145     LOG.debug("Stopping catalog tracker " + this);
146     this.stopped = true;
147     this.rootRegionTracker.stop();
148     this.metaNodeTracker.stop();
149     // Call this and it will interrupt any ongoing waits on meta.
150     synchronized (this.metaAvailable) {
151       this.metaAvailable.notifyAll();
152     }
153   }
154 
155   /**
156    * Gets the current location for <code>-ROOT-</code> or null if location is
157    * not currently available.
158    * @return location of root, null if not available
159    * @throws InterruptedException 
160    */
161   public HServerAddress getRootLocation() throws InterruptedException {
162     return this.rootRegionTracker.getRootRegionLocation();
163   }
164 
165   /**
166    * @return Location of meta or null if not yet available.
167    */
168   public HServerAddress getMetaLocation() {
169     return this.metaLocation;
170   }
171 
172   /**
173    * Waits indefinitely for availability of <code>-ROOT-</code>.  Used during
174    * cluster startup.
175    * @throws InterruptedException if interrupted while waiting
176    */
177   public void waitForRoot()
178   throws InterruptedException {
179     this.rootRegionTracker.blockUntilAvailable();
180   }
181 
182   /**
183    * Gets the current location for <code>-ROOT-</code> if available and waits
184    * for up to the specified timeout if not immediately available.  Returns null
185    * if the timeout elapses before root is available.
186    * @param timeout maximum time to wait for root availability, in milliseconds
187    * @return location of root
188    * @throws InterruptedException if interrupted while waiting
189    * @throws NotAllMetaRegionsOnlineException if root not available before
190    *                                          timeout
191    */
192   HServerAddress waitForRoot(final long timeout)
193   throws InterruptedException, NotAllMetaRegionsOnlineException {
194     HServerAddress address = rootRegionTracker.waitRootRegionLocation(timeout);
195     if (address == null) {
196       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
197     }
198     return address;
199   }
200 
201   /**
202    * Gets a connection to the server hosting root, as reported by ZooKeeper,
203    * waiting up to the specified timeout for availability.
204    * @see #waitForRoot(long) for additional information
205    * @return connection to server hosting root
206    * @throws InterruptedException
207    * @throws NotAllMetaRegionsOnlineException if timed out waiting
208    * @throws IOException
209    */
210   public HRegionInterface waitForRootServerConnection(long timeout)
211   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
212     return getCachedConnection(waitForRoot(timeout));
213   }
214 
215   /**
216    * Gets a connection to the server hosting root, as reported by ZooKeeper,
217    * waiting for the default timeout specified on instantiation.
218    * @see #waitForRoot(long) for additional information
219    * @return connection to server hosting root
220    * @throws NotAllMetaRegionsOnlineException if timed out waiting
221    * @throws IOException
222    */
223   public HRegionInterface waitForRootServerConnectionDefault()
224   throws NotAllMetaRegionsOnlineException, IOException {
225     try {
226       return getCachedConnection(waitForRoot(defaultTimeout));
227     } catch (InterruptedException e) {
228       throw new NotAllMetaRegionsOnlineException("Interrupted");
229     }
230   }
231 
232   /**
233    * Gets a connection to the server hosting root, as reported by ZooKeeper,
234    * if available.  Returns null if no location is immediately available.
235    * @return connection to server hosting root, null if not available
236    * @throws IOException
237    * @throws InterruptedException 
238    */
239   private HRegionInterface getRootServerConnection()
240   throws IOException, InterruptedException {
241     HServerAddress address = this.rootRegionTracker.getRootRegionLocation();
242     if (address == null) {
243       return null;
244     }
245     return getCachedConnection(address);
246   }
247 
248   /**
249    * Gets a connection to the server currently hosting <code>.META.</code> or
250    * null if location is not currently available.
251    * <p>
252    * If a location is known, a connection to the cached location is returned.
253    * If refresh is true, the cached connection is verified first before
254    * returning.  If the connection is not valid, it is reset and rechecked.
255    * <p>
256    * If no location for meta is currently known, method checks ROOT for a new
257    * location, verifies META is currently there, and returns a cached connection
258    * to the server hosting META.
259    *
260    * @return connection to server hosting meta, null if location not available
261    * @throws IOException
262    * @throws InterruptedException 
263    */
264   private HRegionInterface getMetaServerConnection(boolean refresh)
265   throws IOException, InterruptedException {
266     synchronized (metaAvailable) {
267       if (metaAvailable.get()) {
268         HRegionInterface current = getCachedConnection(metaLocation);
269         if (!refresh) {
270           return current;
271         }
272         if (verifyRegionLocation(current, this.metaLocation, META_REGION)) {
273           return current;
274         }
275         resetMetaLocation();
276       }
277       HRegionInterface rootConnection = getRootServerConnection();
278       if (rootConnection == null) {
279         return null;
280       }
281       HServerAddress newLocation = MetaReader.readMetaLocation(rootConnection);
282       if (newLocation == null) {
283         return null;
284       }
285       HRegionInterface newConnection = getCachedConnection(newLocation);
286       if (verifyRegionLocation(newConnection, this.metaLocation, META_REGION)) {
287         setMetaLocation(newLocation);
288         return newConnection;
289       }
290       return null;
291     }
292   }
293 
294   /**
295    * Waits indefinitely for availability of <code>.META.</code>.  Used during
296    * cluster startup.
297    * @throws InterruptedException if interrupted while waiting
298    */
299   public void waitForMeta() throws InterruptedException {
300     synchronized (metaAvailable) {
301       while (!stopped && !metaAvailable.get()) {
302         metaAvailable.wait();
303       }
304     }
305   }
306 
307   /**
308    * Gets the current location for <code>.META.</code> if available and waits
309    * for up to the specified timeout if not immediately available.  Throws an
310    * exception if timed out waiting.  This method differs from {@link #waitForMeta()}
311    * in that it will go ahead and verify the location gotten from ZooKeeper by
312    * trying to use returned connection.
313    * @param timeout maximum time to wait for meta availability, in milliseconds
314    * @return location of meta
315    * @throws InterruptedException if interrupted while waiting
316    * @throws IOException unexpected exception connecting to meta server
317    * @throws NotAllMetaRegionsOnlineException if meta not available before
318    *                                          timeout
319    */
320   public HServerAddress waitForMeta(long timeout)
321   throws InterruptedException, IOException, NotAllMetaRegionsOnlineException {
322     long stop = System.currentTimeMillis() + timeout;
323     synchronized (metaAvailable) {
324       while(!stopped && !metaAvailable.get() &&
325           (timeout == 0 || System.currentTimeMillis() < stop)) {
326         if (getMetaServerConnection(true) != null) {
327           return metaLocation;
328         }
329         metaAvailable.wait(timeout == 0 ? 50 : timeout);
330       }
331       if (getMetaServerConnection(true) == null) {
332         throw new NotAllMetaRegionsOnlineException(
333             "Timed out (" + timeout + "ms)");
334       }
335       return metaLocation;
336     }
337   }
338 
339   /**
340    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
341    * waiting up to the specified timeout for availability.
342    * @see #waitForMeta(long) for additional information
343    * @return connection to server hosting meta
344    * @throws InterruptedException
345    * @throws NotAllMetaRegionsOnlineException if timed out waiting
346    * @throws IOException
347    */
348   public HRegionInterface waitForMetaServerConnection(long timeout)
349   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
350     return getCachedConnection(waitForMeta(timeout));
351   }
352 
353   /**
354    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
355    * waiting up to the specified timeout for availability.
356    * @see #waitForMeta(long) for additional information
357    * @return connection to server hosting meta
358    * @throws NotAllMetaRegionsOnlineException if timed out or interrupted
359    * @throws IOException
360    */
361   public HRegionInterface waitForMetaServerConnectionDefault()
362   throws NotAllMetaRegionsOnlineException, IOException {
363     try {
364       return getCachedConnection(waitForMeta(defaultTimeout));
365     } catch (InterruptedException e) {
366       throw new NotAllMetaRegionsOnlineException("Interrupted");
367     }
368   }
369 
370   private void resetMetaLocation() {
371     LOG.info("Current cached META location is not valid, resetting");
372     this.metaAvailable.set(false);
373   }
374 
375   private void setMetaLocation(HServerAddress metaLocation) {
376     metaAvailable.set(true);
377     this.metaLocation = metaLocation;
378     // no synchronization because these are private and already under lock
379     metaAvailable.notifyAll();
380   }
381 
382   private HRegionInterface getCachedConnection(HServerAddress address)
383   throws IOException {
384     HRegionInterface protocol = null;
385     try {
386       protocol = connection.getHRegionConnection(address, false);
387     } catch (RetriesExhaustedException e) {
388       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
389         // Catch this; presume it means the cached connection has gone bad.
390       } else {
391         throw e;
392       }
393     } catch (SocketTimeoutException e) {
394       // Return 'protocol' == null.
395       LOG.debug("Timed out connecting to " + address);
396     } catch (SocketException e) {
397       // Return 'protocol' == null.
398       LOG.debug("Exception connecting to " + address);
399     } catch (IOException ioe) {
400       Throwable cause = ioe.getCause();
401       if (cause != null && cause instanceof EOFException) {
402         // Catch. Other end disconnected us.
403       } else if (cause != null && cause.getMessage() != null &&
404         cause.getMessage().toLowerCase().contains("connection reset")) {
405         // Catch. Connection reset.
406       } else {
407         throw ioe;
408       }
409       
410     }
411     return protocol;
412   }
413 
414   private boolean verifyRegionLocation(HRegionInterface metaServer,
415       final HServerAddress address,
416       byte [] regionName)
417   throws IOException {
418     if (metaServer == null) {
419       LOG.info("Passed metaserver is null");
420       return false;
421     }
422     Throwable t = null;
423     try {
424       return metaServer.getRegionInfo(regionName) != null;
425     } catch (ConnectException e) {
426       t = e;
427     } catch (RemoteException e) {
428       IOException ioe = e.unwrapRemoteException();
429       if (ioe instanceof NotServingRegionException) {
430         t = ioe;
431       } else {
432         throw e;
433       }
434     } catch (IOException e) {
435       Throwable cause = e.getCause();
436       if (cause != null && cause instanceof EOFException) {
437         t = cause;
438       } else if (cause != null && cause.getMessage() != null
439           && cause.getMessage().contains("Connection reset")) {
440         t = cause;
441       } else {
442         throw e;
443       }
444     }
445     LOG.info("Failed verification of " + Bytes.toString(regionName) +
446       " at address=" + address + "; " + t);
447     return false;
448   }
449 
450   /**
451    * Verify <code>-ROOT-</code> is deployed and accessible.
452    * @param timeout How long to wait on zk for root address (passed through to
453    * the internal call to {@link #waitForRootServerConnection(long)}.
454    * @return True if the <code>-ROOT-</code> location is healthy.
455    * @throws IOException
456    * @throws InterruptedException 
457    */
458   public boolean verifyRootRegionLocation(final long timeout)
459   throws InterruptedException, IOException {
460     HRegionInterface connection = null;
461     try {
462       connection = waitForRootServerConnection(timeout);
463     } catch (NotAllMetaRegionsOnlineException e) {
464       // Pass
465     } catch (org.apache.hadoop.hbase.ipc.ServerNotRunningException e) {
466       // Pass -- remote server is not up so can't be carrying root
467     } catch (IOException e) {
468       // Unexpected exception
469       throw e;
470     }
471     return (connection == null)? false:
472       verifyRegionLocation(connection,this.rootRegionTracker.getRootRegionLocation(),
473         HRegionInfo.ROOT_REGIONINFO.getRegionName());
474   }
475 
476   /**
477    * Verify <code>.META.</code> is deployed and accessible.
478    * @param timeout How long to wait on zk for <code>.META.</code> address
479    * (passed through to the internal call to {@link #waitForMetaServerConnection(long)}.
480    * @return True if the <code>.META.</code> location is healthy.
481    * @throws IOException Some unexpected IOE.
482    * @throws InterruptedException
483    */
484   public boolean verifyMetaRegionLocation(final long timeout)
485   throws InterruptedException, IOException {
486     return getMetaServerConnection(true) != null;
487   }
488 
489   MetaNodeTracker getMetaNodeTracker() {
490     return this.metaNodeTracker;
491   }
492 
493   public HConnection getConnection() {
494     return this.connection;
495   }
496 }