View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.catalog;
21  
22  import java.io.EOFException;
23  import java.io.IOException;
24  import java.net.ConnectException;
25  import java.net.SocketTimeoutException;
26  import java.util.concurrent.atomic.AtomicBoolean;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.Abortable;
31  import org.apache.hadoop.hbase.HRegionInfo;
32  import org.apache.hadoop.hbase.HServerAddress;
33  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
34  import org.apache.hadoop.hbase.NotServingRegionException;
35  import org.apache.hadoop.hbase.client.HConnection;
36  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
37  import org.apache.hadoop.hbase.ipc.HRegionInterface;
38  import org.apache.hadoop.hbase.util.Bytes;
39  import org.apache.hadoop.hbase.zookeeper.MetaNodeTracker;
40  import org.apache.hadoop.hbase.zookeeper.RootRegionTracker;
41  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
42  import org.apache.hadoop.ipc.RemoteException;
43  
44  /**
45   * Tracks the availability of the catalog tables <code>-ROOT-</code> and
46   * <code>.META.</code>.
47   * 
48   * This class is "read-only" in that the locations of the catalog tables cannot
49   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
50   * and location of <code>-ROOT-</code>.  <code>-ROOT-</code> is used to learn of
51   * the location of <code>.META.</code>  If not available in <code>-ROOT-</code>,
52   * ZooKeeper is used to monitor for a new location of <code>.META.</code>.
53   *
54   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()}} to
55   * interrupt waits and close up shop.
56   */
57  public class CatalogTracker {
58    private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
59    private final HConnection connection;
60    private final ZooKeeperWatcher zookeeper;
61    private final RootRegionTracker rootRegionTracker;
62    private final MetaNodeTracker metaNodeTracker;
63    private final AtomicBoolean metaAvailable = new AtomicBoolean(false);
64    /**
65     * Do not clear this address once set.  Let it be cleared by
66     * {@link #setMetaLocation(HServerAddress)} only.  Its needed when we do
67     * server shutdown processing -- we need to know who had .META. last.  If you
68     * want to know if the address is good, rely on {@link #metaAvailable} value.
69     */
70    private HServerAddress metaLocation;
71    private final int defaultTimeout;
72    private boolean stopped = false;
73  
74    public static final byte [] ROOT_REGION =
75      HRegionInfo.ROOT_REGIONINFO.getRegionName();
76    public static final byte [] META_REGION =
77      HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
78  
79    /**
80     * Constructs a catalog tracker.  Find current state of catalog tables and
81     * begin active tracking by executing {@link #start()} post construction.
82     * Does not timeout.
83     * @param connection Server connection; if problem, this connections
84     * {@link HConnection#abort(String, Throwable)} will be called.
85     * @throws IOException 
86     */
87    public CatalogTracker(final HConnection connection) throws IOException {
88      this(connection.getZooKeeperWatcher(), connection, connection);
89    }
90  
91    /**
92     * Constructs the catalog tracker.  Find current state of catalog tables and
93     * begin active tracking by executing {@link #start()} post construction.
94     * Does not timeout.
95     * @param zk
96     * @param connection server connection
97     * @param abortable if fatal exception
98     * @throws IOException 
99     */
100   public CatalogTracker(final ZooKeeperWatcher zk, final HConnection connection,
101       final Abortable abortable)
102   throws IOException {
103     this(zk, connection, abortable, 0);
104   }
105 
106   /**
107    * Constructs the catalog tracker.  Find current state of catalog tables and
108    * begin active tracking by executing {@link #start()} post construction.
109    * @param zk
110    * @param connection server connection
111    * @param abortable if fatal exception
112    * @param defaultTimeout Timeout to use.  Pass zero for no timeout
113    * ({@link Object#wait(long)} when passed a <code>0</code> waits for ever).
114    * @throws IOException 
115    */
116   public CatalogTracker(final ZooKeeperWatcher zk, final HConnection connection,
117       final Abortable abortable, final int defaultTimeout)
118   throws IOException {
119     this.zookeeper = zk;
120     this.connection = connection;
121     this.rootRegionTracker = new RootRegionTracker(zookeeper, abortable);
122     this.metaNodeTracker = new MetaNodeTracker(zookeeper, this, abortable);
123     this.defaultTimeout = defaultTimeout;
124   }
125 
126   /**
127    * Starts the catalog tracker.
128    * Determines current availability of catalog tables and ensures all further
129    * transitions of either region are tracked.
130    * @throws IOException
131    * @throws InterruptedException 
132    */
133   public void start() throws IOException, InterruptedException {
134     this.rootRegionTracker.start();
135     this.metaNodeTracker.start();
136     LOG.debug("Starting catalog tracker " + this);
137   }
138 
139   /**
140    * Stop working.
141    * Interrupts any ongoing waits.
142    */
143   public void stop() {
144     LOG.debug("Stopping catalog tracker " + this);
145     this.stopped = true;
146     this.rootRegionTracker.stop();
147     this.metaNodeTracker.stop();
148     // Call this and it will interrupt any ongoing waits on meta.
149     synchronized (this.metaAvailable) {
150       this.metaAvailable.notifyAll();
151     }
152   }
153 
154   /**
155    * Gets the current location for <code>-ROOT-</code> or null if location is
156    * not currently available.
157    * @return location of root, null if not available
158    * @throws InterruptedException 
159    */
160   public HServerAddress getRootLocation() throws InterruptedException {
161     return this.rootRegionTracker.getRootRegionLocation();
162   }
163 
164   /**
165    * @return Location of meta or null if not yet available.
166    */
167   public HServerAddress getMetaLocation() {
168     return this.metaLocation;
169   }
170 
171   /**
172    * Waits indefinitely for availability of <code>-ROOT-</code>.  Used during
173    * cluster startup.
174    * @throws InterruptedException if interrupted while waiting
175    */
176   public void waitForRoot()
177   throws InterruptedException {
178     this.rootRegionTracker.blockUntilAvailable();
179   }
180 
181   /**
182    * Gets the current location for <code>-ROOT-</code> if available and waits
183    * for up to the specified timeout if not immediately available.  Returns null
184    * if the timeout elapses before root is available.
185    * @param timeout maximum time to wait for root availability, in milliseconds
186    * @return location of root
187    * @throws InterruptedException if interrupted while waiting
188    * @throws NotAllMetaRegionsOnlineException if root not available before
189    *                                          timeout
190    */
191   HServerAddress waitForRoot(final long timeout)
192   throws InterruptedException, NotAllMetaRegionsOnlineException {
193     HServerAddress address = rootRegionTracker.waitRootRegionLocation(timeout);
194     if (address == null) {
195       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
196     }
197     return address;
198   }
199 
200   /**
201    * Gets a connection to the server hosting root, as reported by ZooKeeper,
202    * waiting up to the specified timeout for availability.
203    * @see #waitForRoot(long) for additional information
204    * @return connection to server hosting root
205    * @throws InterruptedException
206    * @throws NotAllMetaRegionsOnlineException if timed out waiting
207    * @throws IOException
208    */
209   public HRegionInterface waitForRootServerConnection(long timeout)
210   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
211     return getCachedConnection(waitForRoot(timeout));
212   }
213 
214   /**
215    * Gets a connection to the server hosting root, as reported by ZooKeeper,
216    * waiting for the default timeout specified on instantiation.
217    * @see #waitForRoot(long) for additional information
218    * @return connection to server hosting root
219    * @throws NotAllMetaRegionsOnlineException if timed out waiting
220    * @throws IOException
221    */
222   public HRegionInterface waitForRootServerConnectionDefault()
223   throws NotAllMetaRegionsOnlineException, IOException {
224     try {
225       return getCachedConnection(waitForRoot(defaultTimeout));
226     } catch (InterruptedException e) {
227       throw new NotAllMetaRegionsOnlineException("Interrupted");
228     }
229   }
230 
231   /**
232    * Gets a connection to the server hosting root, as reported by ZooKeeper,
233    * if available.  Returns null if no location is immediately available.
234    * @return connection to server hosting root, null if not available
235    * @throws IOException
236    * @throws InterruptedException 
237    */
238   private HRegionInterface getRootServerConnection()
239   throws IOException, InterruptedException {
240     HServerAddress address = this.rootRegionTracker.getRootRegionLocation();
241     if (address == null) {
242       return null;
243     }
244     return getCachedConnection(address);
245   }
246 
247   /**
248    * Gets a connection to the server currently hosting <code>.META.</code> or
249    * null if location is not currently available.
250    * <p>
251    * If a location is known, a connection to the cached location is returned.
252    * If refresh is true, the cached connection is verified first before
253    * returning.  If the connection is not valid, it is reset and rechecked.
254    * <p>
255    * If no location for meta is currently known, method checks ROOT for a new
256    * location, verifies META is currently there, and returns a cached connection
257    * to the server hosting META.
258    *
259    * @return connection to server hosting meta, null if location not available
260    * @throws IOException
261    * @throws InterruptedException 
262    */
263   private HRegionInterface getMetaServerConnection(boolean refresh)
264   throws IOException, InterruptedException {
265     synchronized (metaAvailable) {
266       if (metaAvailable.get()) {
267         HRegionInterface current = getCachedConnection(metaLocation);
268         if (!refresh) {
269           return current;
270         }
271         if (verifyRegionLocation(current, this.metaLocation, META_REGION)) {
272           return current;
273         }
274         resetMetaLocation();
275       }
276       HRegionInterface rootConnection = getRootServerConnection();
277       if (rootConnection == null) {
278         return null;
279       }
280       HServerAddress newLocation = MetaReader.readMetaLocation(rootConnection);
281       if (newLocation == null) {
282         return null;
283       }
284       HRegionInterface newConnection = getCachedConnection(newLocation);
285       if (verifyRegionLocation(newConnection, this.metaLocation, META_REGION)) {
286         setMetaLocation(newLocation);
287         return newConnection;
288       }
289       return null;
290     }
291   }
292 
293   /**
294    * Waits indefinitely for availability of <code>.META.</code>.  Used during
295    * cluster startup.
296    * @throws InterruptedException if interrupted while waiting
297    */
298   public void waitForMeta() throws InterruptedException {
299     synchronized (metaAvailable) {
300       while (!stopped && !metaAvailable.get()) {
301         metaAvailable.wait();
302       }
303     }
304   }
305 
306   /**
307    * Gets the current location for <code>.META.</code> if available and waits
308    * for up to the specified timeout if not immediately available.  Throws an
309    * exception if timed out waiting.  This method differs from {@link #waitForMeta()}
310    * in that it will go ahead and verify the location gotten from ZooKeeper by
311    * trying to use returned connection.
312    * @param timeout maximum time to wait for meta availability, in milliseconds
313    * @return location of meta
314    * @throws InterruptedException if interrupted while waiting
315    * @throws IOException unexpected exception connecting to meta server
316    * @throws NotAllMetaRegionsOnlineException if meta not available before
317    *                                          timeout
318    */
319   public HServerAddress waitForMeta(long timeout)
320   throws InterruptedException, IOException, NotAllMetaRegionsOnlineException {
321     long stop = System.currentTimeMillis() + timeout;
322     synchronized (metaAvailable) {
323       if (getMetaServerConnection(true) != null) {
324         return metaLocation;
325       }
326       while(!stopped && !metaAvailable.get() &&
327           (timeout == 0 || System.currentTimeMillis() < stop)) {
328         metaAvailable.wait(timeout);
329       }
330       if (getMetaServerConnection(true) == null) {
331         throw new NotAllMetaRegionsOnlineException(
332             "Timed out (" + timeout + "ms)");
333       }
334       return metaLocation;
335     }
336   }
337 
338   /**
339    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
340    * waiting up to the specified timeout for availability.
341    * @see #waitForMeta(long) for additional information
342    * @return connection to server hosting meta
343    * @throws InterruptedException
344    * @throws NotAllMetaRegionsOnlineException if timed out waiting
345    * @throws IOException
346    */
347   public HRegionInterface waitForMetaServerConnection(long timeout)
348   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
349     return getCachedConnection(waitForMeta(timeout));
350   }
351 
352   /**
353    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
354    * waiting up to the specified timeout for availability.
355    * @see #waitForMeta(long) for additional information
356    * @return connection to server hosting meta
357    * @throws NotAllMetaRegionsOnlineException if timed out or interrupted
358    * @throws IOException
359    */
360   public HRegionInterface waitForMetaServerConnectionDefault()
361   throws NotAllMetaRegionsOnlineException, IOException {
362     try {
363       return getCachedConnection(waitForMeta(defaultTimeout));
364     } catch (InterruptedException e) {
365       throw new NotAllMetaRegionsOnlineException("Interrupted");
366     }
367   }
368 
369   private void resetMetaLocation() {
370     LOG.info("Current cached META location is not valid, resetting");
371     this.metaAvailable.set(false);
372   }
373 
374   private void setMetaLocation(HServerAddress metaLocation) {
375     metaAvailable.set(true);
376     this.metaLocation = metaLocation;
377     // no synchronization because these are private and already under lock
378     metaAvailable.notifyAll();
379   }
380 
381   private HRegionInterface getCachedConnection(HServerAddress address)
382   throws IOException {
383     HRegionInterface protocol = null;
384     try {
385       protocol = connection.getHRegionConnection(address, false);
386     } catch (RetriesExhaustedException e) {
387       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
388         // Catch this; presume it means the cached connection has gone bad.
389       } else {
390         throw e;
391       }
392     } catch (SocketTimeoutException e) {
393       // We were passed the wrong address.  Return 'protocol' == null.
394       LOG.debug("Timed out connecting to " + address);
395     } catch (IOException ioe) {
396       Throwable cause = ioe.getCause();
397       if (cause != null && cause instanceof EOFException) {
398         // Catch. Other end disconnected us.
399       } else if (cause != null && cause.getMessage() != null &&
400         cause.getMessage().toLowerCase().contains("connection reset")) {
401         // Catch. Connection reset.
402       } else {
403         throw ioe;
404       }
405       
406     }
407     return protocol;
408   }
409 
410   private boolean verifyRegionLocation(HRegionInterface metaServer,
411       final HServerAddress address,
412       byte [] regionName)
413   throws IOException {
414     if (metaServer == null) {
415       LOG.info("Passed metaserver is null");
416       return false;
417     }
418     Throwable t = null;
419     try {
420       return metaServer.getRegionInfo(regionName) != null;
421     } catch (ConnectException e) {
422       t = e;
423     } catch (RemoteException e) {
424       IOException ioe = e.unwrapRemoteException();
425       if (ioe instanceof NotServingRegionException) {
426         t = ioe;
427       } else {
428         throw e;
429       }
430     } catch (IOException e) {
431       Throwable cause = e.getCause();
432       if (cause != null && cause instanceof EOFException) {
433         t = cause;
434       } else if (cause != null && cause.getMessage() != null
435           && cause.getMessage().contains("Connection reset")) {
436         t = cause;
437       } else {
438         throw e;
439       }
440     }
441     LOG.info("Failed verification of " + Bytes.toString(regionName) +
442       " at address=" + address + "; " + t);
443     return false;
444   }
445 
446   /**
447    * Verify <code>-ROOT-</code> is deployed and accessible.
448    * @param timeout How long to wait on zk for root address (passed through to
449    * the internal call to {@link #waitForRootServerConnection(long)}.
450    * @return True if the <code>-ROOT-</code> location is healthy.
451    * @throws IOException
452    * @throws InterruptedException 
453    */
454   public boolean verifyRootRegionLocation(final long timeout)
455   throws InterruptedException, IOException {
456     HRegionInterface connection = null;
457     try {
458       connection = waitForRootServerConnection(timeout);
459     } catch (NotAllMetaRegionsOnlineException e) {
460       // Pass
461     } catch (org.apache.hadoop.hbase.ipc.ServerNotRunningException e) {
462       // Pass -- remote server is not up so can't be carrying root
463     } catch (IOException e) {
464       // Unexpected exception
465       throw e;
466     }
467     return (connection == null)? false:
468       verifyRegionLocation(connection,this.rootRegionTracker.getRootRegionLocation(),
469         HRegionInfo.ROOT_REGIONINFO.getRegionName());
470   }
471 
472   /**
473    * Verify <code>.META.</code> is deployed and accessible.
474    * @param timeout How long to wait on zk for <code>.META.</code> address
475    * (passed through to the internal call to {@link #waitForMetaServerConnection(long)}.
476    * @return True if the <code>.META.</code> location is healthy.
477    * @throws IOException Some unexpected IOE.
478    * @throws InterruptedException
479    */
480   public boolean verifyMetaRegionLocation(final long timeout)
481   throws InterruptedException, IOException {
482     return getMetaServerConnection(true) != null;
483   }
484 
485   MetaNodeTracker getMetaNodeTracker() {
486     return this.metaNodeTracker;
487   }
488 
489   public HConnection getConnection() {
490     return this.connection;
491   }
492 }