View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional information regarding
4    * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
5    * "License"); you may not use this file except in compliance with the License. You may obtain a
6    * copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
7    * law or agreed to in writing, software distributed under the License is distributed on an "AS IS"
8    * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License
9    * for the specific language governing permissions and limitations under the License.
10   */
11  
12  package org.apache.hadoop.hbase.coordination;
13  
14  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
15  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
16  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
17  
18  import java.io.IOException;
19  import java.util.List;
20  
21  import org.apache.commons.logging.Log;
22  import org.apache.commons.logging.LogFactory;
23  import org.apache.hadoop.hbase.CoordinatedStateManager;
24  import org.apache.hadoop.hbase.HRegionInfo;
25  import org.apache.hadoop.hbase.RegionTransition;
26  import org.apache.hadoop.hbase.ServerName;
27  import org.apache.hadoop.hbase.coordination.SplitTransactionCoordination;
28  import org.apache.hadoop.hbase.executor.EventType;
29  import org.apache.hadoop.hbase.regionserver.HRegion;
30  import org.apache.hadoop.hbase.regionserver.RegionServerServices;
31  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
32  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
33  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
34  import org.apache.zookeeper.KeeperException;
35  import org.apache.zookeeper.data.Stat;
36  
37  public class ZKSplitTransactionCoordination implements SplitTransactionCoordination {
38  
39    private CoordinatedStateManager coordinationManager;
40    private final ZooKeeperWatcher watcher;
41  
42    private static final Log LOG = LogFactory.getLog(ZKSplitTransactionCoordination.class);
43  
44    public ZKSplitTransactionCoordination(CoordinatedStateManager coordinationProvider,
45        ZooKeeperWatcher watcher) {
46      this.coordinationManager = coordinationProvider;
47      this.watcher = watcher;
48    }
49  
50    /**
51     * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region. Create it
52     * ephemeral in case regionserver dies mid-split.
53     * <p>
54     * Does not transition nodes from other states. If a node already exists for this region, an
55     * Exception will be thrown.
56     * @param parent region to be created as offline
57     * @param serverName server event originates from
58     * @param hri_a daughter region
59     * @param hri_b daughter region
60     * @throws IOException
61     */
62  
63    @Override
64    public void startSplitTransaction(HRegion parent, ServerName serverName, HRegionInfo hri_a,
65        HRegionInfo hri_b) throws IOException {
66  
67      HRegionInfo region = parent.getRegionInfo();
68      try {
69  
70        LOG.debug(watcher.prefix("Creating ephemeral node for " + region.getEncodedName()
71            + " in PENDING_SPLIT state"));
72        byte[] payload = HRegionInfo.toDelimitedByteArray(hri_a, hri_b);
73        RegionTransition rt =
74            RegionTransition.createRegionTransition(RS_ZK_REQUEST_REGION_SPLIT,
75              region.getRegionName(), serverName, payload);
76        String node = ZKAssign.getNodeName(watcher, region.getEncodedName());
77        if (!ZKUtil.createEphemeralNodeAndWatch(watcher, node, rt.toByteArray())) {
78          throw new IOException("Failed create of ephemeral " + node);
79        }
80  
81      } catch (KeeperException e) {
82        throw new IOException("Failed creating PENDING_SPLIT znode on "
83            + parent.getRegionNameAsString(), e);
84      }
85  
86    }
87  
88    /**
89     * Transitions an existing ephemeral node for the specified region which is currently in the begin
90     * state to be in the end state. Master cleans up the final SPLIT znode when it reads it (or if we
91     * crash, zk will clean it up).
92     * <p>
93     * Does not transition nodes from other states. If for some reason the node could not be
94     * transitioned, the method returns -1. If the transition is successful, the version of the node
95     * after transition is returned.
96     * <p>
97     * This method can fail and return false for three different reasons:
98     * <ul>
99     * <li>Node for this region does not exist</li>
100    * <li>Node for this region is not in the begin state</li>
101    * <li>After verifying the begin state, update fails because of wrong version (this should never
102    * actually happen since an RS only does this transition following a transition to the begin
103    * state. If two RS are conflicting, one would fail the original transition to the begin state and
104    * not this transition)</li>
105    * </ul>
106    * <p>
107    * Does not set any watches.
108    * <p>
109    * This method should only be used by a RegionServer when splitting a region.
110    * @param parent region to be transitioned to opened
111    * @param a Daughter a of split
112    * @param b Daughter b of split
113    * @param serverName server event originates from
114    * @param std split transaction details
115    * @param beginState the expected current state the znode should be
116    * @param endState the state to be transition to
117    * @return version of node after transition, -1 if unsuccessful transition
118    * @throws IOException
119    */
120 
121   private int transitionSplittingNode(HRegionInfo parent, HRegionInfo a, HRegionInfo b,
122       ServerName serverName, SplitTransactionDetails std, final EventType beginState,
123       final EventType endState) throws IOException {
124     ZkSplitTransactionDetails zstd = (ZkSplitTransactionDetails) std;
125     byte[] payload = HRegionInfo.toDelimitedByteArray(a, b);
126     try {
127       return ZKAssign.transitionNode(watcher, parent, serverName, beginState, endState,
128         zstd.getZnodeVersion(), payload);
129     } catch (KeeperException e) {
130       throw new IOException(
131           "Failed transition of splitting node " + parent.getRegionNameAsString(), e);
132     }
133   }
134 
135   /**
136    * Wait for the splitting node to be transitioned from pending_split to splitting by master.
137    * That's how we are sure master has processed the event and is good with us to move on. If we
138    * don't get any update, we periodically transition the node so that master gets the callback. If
139    * the node is removed or is not in pending_split state any more, we abort the split.
140    */
141   @Override
142   public void waitForSplitTransaction(final RegionServerServices services, HRegion parent,
143       HRegionInfo hri_a, HRegionInfo hri_b, SplitTransactionDetails sptd) throws IOException {
144     ZkSplitTransactionDetails zstd = (ZkSplitTransactionDetails) sptd;
145 
146     // After creating the split node, wait for master to transition it
147     // from PENDING_SPLIT to SPLITTING so that we can move on. We want master
148     // knows about it and won't transition any region which is splitting.
149     try {
150       int spins = 0;
151       Stat stat = new Stat();
152       ServerName expectedServer = coordinationManager.getServer().getServerName();
153       String node = parent.getRegionInfo().getEncodedName();
154       while (!(coordinationManager.getServer().isStopped() || services.isStopping())) {
155         if (spins % 5 == 0) {
156           LOG.debug("Still waiting for master to process " + "the pending_split for " + node);
157           SplitTransactionDetails temp = getDefaultDetails();
158           transitionSplittingNode(parent.getRegionInfo(), hri_a, hri_b, expectedServer, temp,
159             RS_ZK_REQUEST_REGION_SPLIT, RS_ZK_REQUEST_REGION_SPLIT);
160         }
161         Thread.sleep(100);
162         spins++;
163         byte[] data = ZKAssign.getDataNoWatch(watcher, node, stat);
164         if (data == null) {
165           throw new IOException("Data is null, splitting node " + node + " no longer exists");
166         }
167         RegionTransition rt = RegionTransition.parseFrom(data);
168         EventType et = rt.getEventType();
169         if (et == RS_ZK_REGION_SPLITTING) {
170           ServerName serverName = rt.getServerName();
171           if (!serverName.equals(expectedServer)) {
172             throw new IOException("Splitting node " + node + " is for " + serverName + ", not us "
173                 + expectedServer);
174           }
175           byte[] payloadOfSplitting = rt.getPayload();
176           List<HRegionInfo> splittingRegions =
177               HRegionInfo.parseDelimitedFrom(payloadOfSplitting, 0, payloadOfSplitting.length);
178           assert splittingRegions.size() == 2;
179           HRegionInfo a = splittingRegions.get(0);
180           HRegionInfo b = splittingRegions.get(1);
181           if (!(hri_a.equals(a) && hri_b.equals(b))) {
182             throw new IOException("Splitting node " + node + " is for " + a + ", " + b
183                 + ", not expected daughters: " + hri_a + ", " + hri_b);
184           }
185           // Master has processed it.
186           zstd.setZnodeVersion(stat.getVersion());
187           return;
188         }
189         if (et != RS_ZK_REQUEST_REGION_SPLIT) {
190           throw new IOException("Splitting node " + node + " moved out of splitting to " + et);
191         }
192       }
193       // Server is stopping/stopped
194       throw new IOException("Server is " + (services.isStopping() ? "stopping" : "stopped"));
195     } catch (Exception e) {
196       if (e instanceof InterruptedException) {
197         Thread.currentThread().interrupt();
198       }
199       throw new IOException("Failed getting SPLITTING znode on " + parent.getRegionNameAsString(),
200           e);
201     }
202   }
203 
204   /**
205    * Finish off split transaction, transition the zknode
206    * @param services Used to online/offline regions.
207    * @param a daughter region
208    * @param b daughter region
209    * @param std split transaction details
210    * @param parent
211    * @throws IOException If thrown, transaction failed. Call
212    *  {@link org.apache.hadoop.hbase.regionserver.SplitTransaction#rollback(
213    *  Server, RegionServerServices)}
214    */
215   @Override
216   public void completeSplitTransaction(final RegionServerServices services, HRegion a, HRegion b,
217       SplitTransactionDetails std, HRegion parent) throws IOException {
218     ZkSplitTransactionDetails zstd = (ZkSplitTransactionDetails) std;
219     // Tell master about split by updating zk. If we fail, abort.
220     if (coordinationManager.getServer() != null) {
221       try {
222         zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
223           b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
224           RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT));
225 
226         int spins = 0;
227         // Now wait for the master to process the split. We know it's done
228         // when the znode is deleted. The reason we keep tickling the znode is
229         // that it's possible for the master to miss an event.
230         do {
231           if (spins % 10 == 0) {
232             LOG.debug("Still waiting on the master to process the split for "
233                 + parent.getRegionInfo().getEncodedName());
234           }
235           Thread.sleep(100);
236           // When this returns -1 it means the znode doesn't exist
237           zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
238             b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
239             RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT));
240           spins++;
241         } while (zstd.getZnodeVersion() != -1 && !coordinationManager.getServer().isStopped()
242             && !services.isStopping());
243       } catch (Exception e) {
244         if (e instanceof InterruptedException) {
245           Thread.currentThread().interrupt();
246         }
247         throw new IOException("Failed telling master about split", e);
248       }
249     }
250 
251     // Leaving here, the splitdir with its dross will be in place but since the
252     // split was successful, just leave it; it'll be cleaned when parent is
253     // deleted and cleaned up.
254   }
255 
256   @Override
257   public void clean(final HRegionInfo hri) {
258     try {
259       // Only delete if its in expected state; could have been hijacked.
260       if (!ZKAssign.deleteNode(coordinationManager.getServer().getZooKeeper(),
261         hri.getEncodedName(), RS_ZK_REQUEST_REGION_SPLIT, coordinationManager.getServer()
262             .getServerName())) {
263         ZKAssign.deleteNode(coordinationManager.getServer().getZooKeeper(), hri.getEncodedName(),
264           RS_ZK_REGION_SPLITTING, coordinationManager.getServer().getServerName());
265       }
266     } catch (KeeperException.NoNodeException e) {
267       LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
268     } catch (KeeperException e) {
269       coordinationManager.getServer().abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
270     }
271   }
272 
273   /**
274    * ZK-based implementation. Has details about whether the state transition should be reflected in
275    * ZK, as well as expected version of znode.
276    */
277   public static class ZkSplitTransactionDetails implements
278       SplitTransactionCoordination.SplitTransactionDetails {
279     private int znodeVersion;
280 
281     public ZkSplitTransactionDetails() {
282     }
283 
284     /**
285      * @return znode current version
286      */
287     public int getZnodeVersion() {
288       return znodeVersion;
289     }
290 
291     /**
292      * @param znodeVersion znode new version
293      */
294     public void setZnodeVersion(int znodeVersion) {
295       this.znodeVersion = znodeVersion;
296     }
297   }
298 
299   @Override
300   public SplitTransactionDetails getDefaultDetails() {
301     ZkSplitTransactionDetails zstd = new ZkSplitTransactionDetails();
302     zstd.setZnodeVersion(-1);
303     return zstd;
304   }
305 
306   @Override
307   public int processTransition(HRegionInfo p, HRegionInfo hri_a, HRegionInfo hri_b, ServerName sn,
308       SplitTransactionDetails std) throws IOException {
309     return transitionSplittingNode(p, hri_a, hri_b, sn, std, RS_ZK_REQUEST_REGION_SPLIT,
310       RS_ZK_REGION_SPLITTING);
311 
312   }
313 }