View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver.handler;
21  
22  import java.io.IOException;
23  import java.util.concurrent.atomic.AtomicBoolean;
24  
25  import org.apache.commons.logging.Log;
26  import org.apache.commons.logging.LogFactory;
27  import org.apache.hadoop.hbase.HRegionInfo;
28  import org.apache.hadoop.hbase.Server;
29  import org.apache.hadoop.hbase.executor.EventHandler;
30  import org.apache.hadoop.hbase.regionserver.HRegion;
31  import org.apache.hadoop.hbase.regionserver.RegionServerServices;
32  import org.apache.hadoop.hbase.util.CancelableProgressable;
33  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
34  import org.apache.zookeeper.KeeperException;
35  
36  /**
37   * Handles opening of a region on a region server.
38   * <p>
39   * This is executed after receiving an OPEN RPC from the master or client.
40   */
41  public class OpenRegionHandler extends EventHandler {
42    private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);
43  
44    private final RegionServerServices rsServices;
45  
46    private final HRegionInfo regionInfo;
47  
48    // We get version of our znode at start of open process and monitor it across
49    // the total open. We'll fail the open if someone hijacks our znode; we can
50    // tell this has happened if version is not as expected.
51    private volatile int version = -1;
52  
53    public OpenRegionHandler(final Server server,
54        final RegionServerServices rsServices, HRegionInfo regionInfo) {
55      this(server, rsServices, regionInfo, EventType.M_RS_OPEN_REGION);
56    }
57  
58    protected OpenRegionHandler(final Server server,
59        final RegionServerServices rsServices, final HRegionInfo regionInfo,
60        EventType eventType) {
61      super(server, eventType);
62      this.rsServices = rsServices;
63      this.regionInfo = regionInfo;
64    }
65  
66    public HRegionInfo getRegionInfo() {
67      return regionInfo;
68    }
69  
70    @Override
71    public void process() throws IOException {
72      final String name = regionInfo.getRegionNameAsString();
73      LOG.debug("Processing open of " + name);
74      if (this.server.isStopped() || this.rsServices.isStopping()) {
75        LOG.info("Server stopping or stopped, skipping open of " + name);
76        return;
77      }
78      final String encodedName = regionInfo.getEncodedName();
79  
80      // Check that this region is not already online
81      HRegion region = this.rsServices.getFromOnlineRegions(encodedName);
82      if (region != null) {
83        LOG.warn("Attempted open of " + name +
84          " but already online on this server");
85        return;
86      }
87  
88      // If fails, just return.  Someone stole the region from under us.
89      // Calling transitionZookeeperOfflineToOpening initalizes this.version.
90      if (!transitionZookeeperOfflineToOpening(encodedName)) {
91        LOG.warn("Region was hijacked? It no longer exists, encodedName=" +
92          encodedName);
93        return;
94      }
95  
96      // Open region.  After a successful open, failures in subsequent processing
97      // needs to do a close as part of cleanup.
98      region = openRegion();
99      if (region == null) return;
100     boolean failed = true;
101     if (tickleOpening("post_region_open")) {
102       if (updateMeta(region)) failed = false;
103     }
104 
105     if (failed || this.server.isStopped() || this.rsServices.isStopping()) {
106       cleanupFailedOpen(region);
107       return;
108     }
109 
110     if (!transitionToOpened(region)) {
111       cleanupFailedOpen(region);
112       return;
113     }
114 
115     // Done!  Successful region open
116     LOG.debug("Opened " + name);
117   }
118 
119   /**
120    * Update ZK, ROOT or META.  This can take a while if for example the
121    * .META. is not available -- if server hosting .META. crashed and we are
122    * waiting on it to come back -- so run in a thread and keep updating znode
123    * state meantime so master doesn't timeout our region-in-transition.
124    * Caller must cleanup region if this fails.
125    */
126   private boolean updateMeta(final HRegion r) {
127     if (this.server.isStopped() || this.rsServices.isStopping()) {
128       return false;
129     }
130     // Object we do wait/notify on.  Make it boolean.  If set, we're done.
131     // Else, wait.
132     final AtomicBoolean signaller = new AtomicBoolean(false);
133     PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
134       this.server, this.rsServices, signaller);
135     t.start();
136     int assignmentTimeout = this.server.getConfiguration().
137       getInt("hbase.master.assignment.timeoutmonitor.period", 10000);
138     // Total timeout for meta edit.  If we fail adding the edit then close out
139     // the region and let it be assigned elsewhere.
140     long timeout = assignmentTimeout * 10;
141     long now = System.currentTimeMillis();
142     long endTime = now + timeout;
143     // Let our period at which we update OPENING state to be be 1/3rd of the
144     // regions-in-transition timeout period.
145     long period = Math.max(1, assignmentTimeout/ 3);
146     long lastUpdate = now;
147     while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
148         !this.rsServices.isStopping() && (endTime > now)) {
149       long elapsed = now - lastUpdate;
150       if (elapsed > period) {
151         // Only tickle OPENING if postOpenDeployTasks is taking some time.
152         lastUpdate = now;
153         tickleOpening("post_open_deploy");
154       }
155       synchronized (signaller) {
156         try {
157           signaller.wait(period);
158         } catch (InterruptedException e) {
159           // Go to the loop check.
160         }
161       }
162       now = System.currentTimeMillis();
163     }
164     // Is thread still alive?  We may have left above loop because server is
165     // stopping or we timed out the edit.  Is so, interrupt it.
166     if (t.isAlive()) {
167       if (!signaller.get()) {
168         // Thread still running; interrupt
169         LOG.debug("Interrupting thread " + t);
170         t.interrupt();
171       }
172       try {
173         t.join();
174       } catch (InterruptedException ie) {
175         LOG.warn("Interrupted joining " +
176           r.getRegionInfo().getRegionNameAsString(), ie);
177         Thread.currentThread().interrupt();
178       }
179     }
180     // Was there an exception opening the region?  This should trigger on
181     // InterruptedException too.  If so, we failed.
182     return !t.interrupted() && t.getException() == null;
183   }
184 
185   /**
186    * Thread to run region post open tasks.  Call {@link #getException()} after
187    * the thread finishes to check for exceptions running
188    * {@link RegionServerServices#postOpenDeployTasks(HRegion, org.apache.hadoop.hbase.catalog.CatalogTracker, boolean)}.
189    */
190   static class PostOpenDeployTasksThread extends Thread {
191     private Exception exception = null;
192     private final Server server;
193     private final RegionServerServices services;
194     private final HRegion region;
195     private final AtomicBoolean signaller;
196 
197     PostOpenDeployTasksThread(final HRegion region, final Server server,
198         final RegionServerServices services, final AtomicBoolean signaller) {
199       super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
200       this.setDaemon(true);
201       this.server = server;
202       this.services = services;
203       this.region = region;
204       this.signaller = signaller;
205     }
206 
207     public void run() {
208       try {
209         this.services.postOpenDeployTasks(this.region,
210           this.server.getCatalogTracker(), false);
211       } catch (Exception e) {
212         LOG.warn("Exception running postOpenDeployTasks; region=" +
213           this.region.getRegionInfo().getEncodedName(), e);
214         this.exception = e;
215       }
216       // We're done.  Set flag then wake up anyone waiting on thread to complete.
217       this.signaller.set(true);
218       synchronized (this.signaller) {
219         this.signaller.notify();
220       }
221     }
222 
223     /**
224      * @return Null or the run exception; call this method after thread is done.
225      */
226     Exception getException() {
227       return this.exception;
228     }
229   }
230 
231   /**
232    * @param r Region we're working on.
233    * @return Transition znode to OPENED state.
234    * @throws IOException 
235    */
236   private boolean transitionToOpened(final HRegion r) throws IOException {
237     boolean result = false;
238     HRegionInfo hri = r.getRegionInfo();
239     final String name = hri.getRegionNameAsString();
240     // Finally, Transition ZK node to OPENED
241     try {
242       if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), hri,
243           this.server.getServerName(), this.version) == -1) {
244         LOG.warn("Completed the OPEN of region " + name +
245           " but when transitioning from " +
246           " OPENING to OPENED got a version mismatch, someone else clashed " +
247           "so now unassigning -- closing region");
248       } else {
249         result = true;
250       }
251     } catch (KeeperException e) {
252       LOG.error("Failed transitioning node " + name +
253         " from OPENING to OPENED -- closing region", e);
254     }
255     return result;
256   }
257 
258   /**
259    * @return Instance of HRegion if successful open else null.
260    */
261   HRegion openRegion() {
262     HRegion region = null;
263     try {
264       // Instantiate the region.  This also periodically tickles our zk OPENING
265       // state so master doesn't timeout this region in transition.
266       region = HRegion.openHRegion(this.regionInfo, this.rsServices.getWAL(),
267         this.server.getConfiguration(), this.rsServices.getFlushRequester(),
268         new CancelableProgressable() {
269           public boolean progress() {
270             // We may lose the znode ownership during the open.  Currently its
271             // too hard interrupting ongoing region open.  Just let it complete
272             // and check we still have the znode after region open.
273             return tickleOpening("open_region_progress");
274           }
275         });
276     } catch (IOException e) {
277       // We failed open.  Let our znode expire in regions-in-transition and
278       // Master will assign elsewhere.  Presumes nothing to close.
279       LOG.error("Failed open of region=" +
280         this.regionInfo.getRegionNameAsString(), e);
281     }
282     return region;
283   }
284 
285   private void cleanupFailedOpen(final HRegion region) throws IOException {
286     if (region != null) region.close();
287     this.rsServices.removeFromOnlineRegions(regionInfo.getEncodedName());
288   }
289 
290   /**
291    * Transition ZK node from OFFLINE to OPENING.
292    * @param encodedName Name of the znode file (Region encodedName is the znode
293    * name).
294    * @return True if successful transition.
295    */
296   boolean transitionZookeeperOfflineToOpening(final String encodedName) {
297     // TODO: should also handle transition from CLOSED?
298     try {
299       // Initialize the znode version.
300       this.version =
301         ZKAssign.transitionNodeOpening(server.getZooKeeper(),
302           regionInfo, server.getServerName());
303     } catch (KeeperException e) {
304       LOG.error("Error transition from OFFLINE to OPENING for region=" +
305         encodedName, e);
306     }
307     boolean b = isGoodVersion();
308     if (!b) {
309       LOG.warn("Failed transition from OFFLINE to OPENING for region=" +
310         encodedName);
311     }
312     return b;
313   }
314 
315   /**
316    * Update our OPENING state in zookeeper.
317    * Do this so master doesn't timeout this region-in-transition.
318    * @param context Some context to add to logs if failure
319    * @return True if successful transition.
320    */
321   boolean tickleOpening(final String context) {
322     // If previous checks failed... do not try again.
323     if (!isGoodVersion()) return false;
324     String encodedName = this.regionInfo.getEncodedName();
325     try {
326       this.version =
327         ZKAssign.retransitionNodeOpening(server.getZooKeeper(),
328           this.regionInfo, this.server.getServerName(), this.version);
329     } catch (KeeperException e) {
330       server.abort("Exception refreshing OPENING; region=" + encodedName +
331         ", context=" + context, e);
332       this.version = -1;
333     }
334     boolean b = isGoodVersion();
335     if (!b) {
336       LOG.warn("Failed refreshing OPENING; region=" + encodedName +
337         ", context=" + context);
338     }
339     return b;
340   }
341 
342   private boolean isGoodVersion() {
343     return this.version != -1;
344   }
345 }