View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver.handler;
21  
22  import java.io.IOException;
23  import java.util.concurrent.atomic.AtomicBoolean;
24  
25  import org.apache.commons.logging.Log;
26  import org.apache.commons.logging.LogFactory;
27  import org.apache.hadoop.hbase.HRegionInfo;
28  import org.apache.hadoop.hbase.Server;
29  import org.apache.hadoop.hbase.executor.EventHandler;
30  import org.apache.hadoop.hbase.regionserver.HRegion;
31  import org.apache.hadoop.hbase.regionserver.RegionServerServices;
32  import org.apache.hadoop.hbase.util.CancelableProgressable;
33  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
34  import org.apache.zookeeper.KeeperException;
35  
36  /**
37   * Handles opening of a region on a region server.
38   * <p>
39   * This is executed after receiving an OPEN RPC from the master or client.
40   */
41  public class OpenRegionHandler extends EventHandler {
42    private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);
43  
44    private final RegionServerServices rsServices;
45  
46    private final HRegionInfo regionInfo;
47  
48    // We get version of our znode at start of open process and monitor it across
49    // the total open. We'll fail the open if someone hijacks our znode; we can
50    // tell this has happened if version is not as expected.
51    private volatile int version = -1;
52  
53    public OpenRegionHandler(final Server server,
54        final RegionServerServices rsServices, HRegionInfo regionInfo) {
55      this(server, rsServices, regionInfo, EventType.M_RS_OPEN_REGION);
56    }
57  
58    protected OpenRegionHandler(final Server server,
59        final RegionServerServices rsServices, final HRegionInfo regionInfo,
60        EventType eventType) {
61      super(server, eventType);
62      this.rsServices = rsServices;
63      this.regionInfo = regionInfo;
64    }
65  
66    public HRegionInfo getRegionInfo() {
67      return regionInfo;
68    }
69  
70    @Override
71    public void process() throws IOException {
72      final String name = regionInfo.getRegionNameAsString();
73      LOG.debug("Processing open of " + name);
74      if (this.server.isStopped() || this.rsServices.isStopping()) {
75        LOG.info("Server stopping or stopped, skipping open of " + name);
76        return;
77      }
78      final String encodedName = regionInfo.getEncodedName();
79  
80      // Check that this region is not already online
81      HRegion region = this.rsServices.getFromOnlineRegions(encodedName);
82      if (region != null) {
83        LOG.warn("Attempted open of " + name +
84          " but already online on this server");
85        return;
86      }
87  
88      // If fails, just return.  Someone stole the region from under us.
89      // Calling transitionZookeeperOfflineToOpening initalizes this.version.
90      if (!transitionZookeeperOfflineToOpening(encodedName)) return;
91  
92      // Open region.  After a successful open, failures in subsequent processing
93      // needs to do a close as part of cleanup.
94      region = openRegion();
95      if (region == null) return;
96      boolean failed = true;
97      if (tickleOpening("post_region_open")) {
98        if (updateMeta(region)) failed = false;
99      }
100 
101     if (failed || this.server.isStopped() || this.rsServices.isStopping()) {
102       cleanupFailedOpen(region);
103       return;
104     }
105 
106     if (!transitionToOpened(region)) {
107       cleanupFailedOpen(region);
108       return;
109     }
110 
111     // Done!  Successful region open
112     LOG.debug("Opened " + name);
113   }
114 
115   /**
116    * Update ZK, ROOT or META.  This can take a while if for example the
117    * .META. is not available -- if server hosting .META. crashed and we are
118    * waiting on it to come back -- so run in a thread and keep updating znode
119    * state meantime so master doesn't timeout our region-in-transition.
120    * Caller must cleanup region if this fails.
121    */
122   private boolean updateMeta(final HRegion r) {
123     if (this.server.isStopped() || this.rsServices.isStopping()) {
124       return false;
125     }
126     // Object we do wait/notify on.  Make it boolean.  If set, we're done.
127     // Else, wait.
128     final AtomicBoolean signaller = new AtomicBoolean(false);
129     PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
130       this.server, this.rsServices, signaller);
131     t.start();
132     int assignmentTimeout = this.server.getConfiguration().
133       getInt("hbase.master.assignment.timeoutmonitor.period", 10000);
134     // Total timeout for meta edit.  If we fail adding the edit then close out
135     // the region and let it be assigned elsewhere.
136     long timeout = assignmentTimeout * 10;
137     long now = System.currentTimeMillis();
138     long endTime = now + timeout;
139     // Let our period at which we update OPENING state to be be 1/3rd of the
140     // regions-in-transition timeout period.
141     long period = Math.max(1, assignmentTimeout/ 3);
142     long lastUpdate = now;
143     while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
144         !this.rsServices.isStopping() && (endTime > now)) {
145       long elapsed = now - lastUpdate;
146       if (elapsed > period) {
147         // Only tickle OPENING if postOpenDeployTasks is taking some time.
148         lastUpdate = now;
149         tickleOpening("post_open_deploy");
150       }
151       synchronized (signaller) {
152         try {
153           signaller.wait(period);
154         } catch (InterruptedException e) {
155           // Go to the loop check.
156         }
157       }
158       now = System.currentTimeMillis();
159     }
160     // Is thread still alive?  We may have left above loop because server is
161     // stopping or we timed out the edit.  Is so, interrupt it.
162     if (t.isAlive()) {
163       if (!signaller.get()) {
164         // Thread still running; interrupt
165         LOG.debug("Interrupting thread " + t);
166         t.interrupt();
167       }
168       try {
169         t.join();
170       } catch (InterruptedException ie) {
171         LOG.warn("Interrupted joining " +
172           r.getRegionInfo().getRegionNameAsString(), ie);
173         Thread.currentThread().interrupt();
174       }
175     }
176     // Was there an exception opening the region?  This should trigger on
177     // InterruptedException too.  If so, we failed.
178     return !t.interrupted() && t.getException() == null;
179   }
180 
181   /**
182    * Thread to run region post open tasks.  Call {@link #getException()} after
183    * the thread finishes to check for exceptions running
184    * {@link RegionServerServices#postOpenDeployTasks(HRegion, org.apache.hadoop.hbase.catalog.CatalogTracker, boolean)}.
185    */
186   static class PostOpenDeployTasksThread extends Thread {
187     private Exception exception = null;
188     private final Server server;
189     private final RegionServerServices services;
190     private final HRegion region;
191     private final AtomicBoolean signaller;
192 
193     PostOpenDeployTasksThread(final HRegion region, final Server server,
194         final RegionServerServices services, final AtomicBoolean signaller) {
195       super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
196       this.setDaemon(true);
197       this.server = server;
198       this.services = services;
199       this.region = region;
200       this.signaller = signaller;
201     }
202 
203     public void run() {
204       try {
205         this.services.postOpenDeployTasks(this.region,
206           this.server.getCatalogTracker(), false);
207       } catch (Exception e) {
208         LOG.warn("Exception running postOpenDeployTasks; region=" +
209           this.region.getRegionInfo().getEncodedName(), e);
210         this.exception = e;
211       }
212       // We're done.  Set flag then wake up anyone waiting on thread to complete.
213       this.signaller.set(true);
214       synchronized (this.signaller) {
215         this.signaller.notify();
216       }
217     }
218 
219     /**
220      * @return Null or the run exception; call this method after thread is done.
221      */
222     Exception getException() {
223       return this.exception;
224     }
225   }
226 
227   /**
228    * @param r Region we're working on.
229    * @return Transition znode to OPENED state.
230    * @throws IOException 
231    */
232   private boolean transitionToOpened(final HRegion r) throws IOException {
233     boolean result = false;
234     HRegionInfo hri = r.getRegionInfo();
235     final String name = hri.getRegionNameAsString();
236     // Finally, Transition ZK node to OPENED
237     try {
238       if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), hri,
239           this.server.getServerName(), this.version) == -1) {
240         LOG.warn("Completed the OPEN of region " + name +
241           " but when transitioning from " +
242           " OPENING to OPENED got a version mismatch, someone else clashed " +
243           "so now unassigning -- closing region");
244       } else {
245         result = true;
246       }
247     } catch (KeeperException e) {
248       LOG.error("Failed transitioning node " + name +
249         " from OPENING to OPENED -- closing region", e);
250     }
251     return result;
252   }
253 
254   /**
255    * @return Instance of HRegion if successful open else null.
256    */
257   private HRegion openRegion() {
258     HRegion region = null;
259     try {
260       // Instantiate the region.  This also periodically tickles our zk OPENING
261       // state so master doesn't timeout this region in transition.
262       region = HRegion.openHRegion(this.regionInfo, this.rsServices.getWAL(),
263         this.server.getConfiguration(), this.rsServices.getFlushRequester(),
264         new CancelableProgressable() {
265           public boolean progress() {
266             // We may lose the znode ownership during the open.  Currently its
267             // too hard interrupting ongoing region open.  Just let it complete
268             // and check we still have the znode after region open.
269             return tickleOpening("open_region_progress");
270           }
271         });
272     } catch (IOException e) {
273       // We failed open.  Let our znode expire in regions-in-transition and
274       // Master will assign elsewhere.  Presumes nothing to close.
275       LOG.error("Failed open of region=" +
276         this.regionInfo.getRegionNameAsString(), e);
277     }
278     return region;
279   }
280 
281   private void cleanupFailedOpen(final HRegion region) throws IOException {
282     if (region != null) region.close();
283     this.rsServices.removeFromOnlineRegions(regionInfo.getEncodedName());
284   }
285 
286   /**
287    * Transition ZK node from OFFLINE to OPENING.
288    * @param encodedName Name of the znode file (Region encodedName is the znode
289    * name).
290    * @return True if successful transition.
291    */
292   boolean transitionZookeeperOfflineToOpening(final String encodedName) {
293     // TODO: should also handle transition from CLOSED?
294     try {
295       // Initialize the znode version.
296       this.version =
297         ZKAssign.transitionNodeOpening(server.getZooKeeper(),
298           regionInfo, server.getServerName());
299     } catch (KeeperException e) {
300       LOG.error("Error transition from OFFLINE to OPENING for region=" +
301         encodedName, e);
302     }
303     boolean b = isGoodVersion();
304     if (!b) {
305       LOG.warn("Failed transition from OFFLINE to OPENING for region=" +
306         encodedName);
307     }
308     return b;
309   }
310 
311   /**
312    * Update our OPENING state in zookeeper.
313    * Do this so master doesn't timeout this region-in-transition.
314    * @param context Some context to add to logs if failure
315    * @return True if successful transition.
316    */
317   boolean tickleOpening(final String context) {
318     // If previous checks failed... do not try again.
319     if (!isGoodVersion()) return false;
320     String encodedName = this.regionInfo.getEncodedName();
321     try {
322       this.version =
323         ZKAssign.retransitionNodeOpening(server.getZooKeeper(),
324           this.regionInfo, this.server.getServerName(), this.version);
325     } catch (KeeperException e) {
326       server.abort("Exception refreshing OPENING; region=" + encodedName +
327         ", context=" + context, e);
328       this.version = -1;
329     }
330     boolean b = isGoodVersion();
331     if (!b) {
332       LOG.warn("Failed refreshing OPENING; region=" + encodedName +
333         ", context=" + context);
334     }
335     return b;
336   }
337 
338   private boolean isGoodVersion() {
339     return this.version != -1;
340   }
341 }