View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
22  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
23  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
24  
25  import java.io.IOException;
26  import java.io.InterruptedIOException;
27  import java.util.ArrayList;
28  import java.util.List;
29  import java.util.ListIterator;
30  import java.util.Map;
31  import java.util.concurrent.Callable;
32  import java.util.concurrent.ExecutionException;
33  import java.util.concurrent.Executors;
34  import java.util.concurrent.Future;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.ThreadPoolExecutor;
37  import java.util.concurrent.TimeUnit;
38  
39  import org.apache.commons.logging.Log;
40  import org.apache.commons.logging.LogFactory;
41  import org.apache.hadoop.hbase.classification.InterfaceAudience;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HRegionInfo;
44  import org.apache.hadoop.hbase.RegionTransition;
45  import org.apache.hadoop.hbase.Server;
46  import org.apache.hadoop.hbase.ServerName;
47  import org.apache.hadoop.hbase.catalog.CatalogTracker;
48  import org.apache.hadoop.hbase.catalog.MetaEditor;
49  import org.apache.hadoop.hbase.client.Mutation;
50  import org.apache.hadoop.hbase.client.Put;
51  import org.apache.hadoop.hbase.executor.EventType;
52  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
53  import org.apache.hadoop.hbase.util.Bytes;
54  import org.apache.hadoop.hbase.util.CancelableProgressable;
55  import org.apache.hadoop.hbase.util.ConfigUtil;
56  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
57  import org.apache.hadoop.hbase.util.HasThread;
58  import org.apache.hadoop.hbase.util.PairOfSameType;
59  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
60  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
61  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
62  import org.apache.zookeeper.KeeperException;
63  import org.apache.zookeeper.KeeperException.NodeExistsException;
64  import org.apache.zookeeper.data.Stat;
65  
66  import com.google.common.util.concurrent.ThreadFactoryBuilder;
67  
/**
 * Executes region split as a "transaction".  Call {@link #prepare()} to setup
 * the transaction, {@link #execute(Server, RegionServerServices)} to run the
 * transaction and {@link #rollback(Server, RegionServerServices)} to cleanup if execute fails.
 *
 * <p>Here is an example of how you would use this class:
 * <pre>
 *  SplitTransaction st = new SplitTransaction(parent, midKey);
 *  if (!st.prepare()) return;
 *  try {
 *    st.execute(server, services);
 *  } catch (IOException ioe) {
 *    try {
 *      st.rollback(server, services);
 *      return;
 *    } catch (RuntimeException e) {
 *      myAbortable.abort("Failed split, abort");
 *    }
 *  }
 * </pre>
 * <p>This class is not thread safe.  The caller needs to ensure that a split is run by
 * one thread only.
 */
91  @InterfaceAudience.Private
92  public class SplitTransaction {
// Logger shared by the transaction and its nested helper classes.
private static final Log LOG = LogFactory.getLog(SplitTransaction.class);

/*
 * Region to split
 */
private final HRegion parent;
// Daughter region infos; both are created in prepare(). hri_a is built from
// (parent start key, split row) and hri_b from (split row, parent end key).
private HRegionInfo hri_a;
private HRegionInfo hri_b;
// Default store file split timeout; createDaughters() replaces it with the
// "hbase.regionserver.fileSplitTimeout" setting when not in testing mode.
// NOTE(review): presumably milliseconds -- confirm where the value is consumed.
private long fileSplitTimeout = 30000;
// Version of the parent's split znode as last returned by getZKNode() /
// transitionSplittingNode(); transitionZKNode() treats -1 as "znode doesn't exist".
private int znodeVersion = -1;
// Assigned at the start of execute(): true when server is null, otherwise
// whatever ConfigUtil.useZKForAssignment() reports for the server configuration.
boolean useZKForAssignment;

/*
 * Row to split around
 */
private final byte [] splitrow;
109 
/**
 * Kinds of entry that can be written to the transaction journal.
 * Every constant stands for one step of the split transaction; the journal
 * is consulted to work out how much has to be rolled back.
 */
enum JournalEntryType {
  /** The transaction has been started. */
  STARTED,

  /** The transaction has been prepared (after table lock). */
  PREPARED,

  /** About to invoke the preSplit coprocessor hook. */
  BEFORE_PRE_SPLIT_HOOK,

  /** The preSplit coprocessor hook has returned. */
  AFTER_PRE_SPLIT_HOOK,

  /** The region was set as in transition, i.e. put into SPLITTING state. */
  SET_SPLITTING_IN_ZK,

  /** The temporary split data directory has been created. */
  CREATE_SPLIT_DIR,

  /** The parent region has been closed. */
  CLOSED_PARENT_REGION,

  /** The parent has been taken out of the server's online regions list. */
  OFFLINED_PARENT,

  /** Creation of the first daughter region has begun. */
  STARTED_REGION_A_CREATION,

  /** Creation of the second daughter region has begun. */
  STARTED_REGION_B_CREATION,

  /** The first daughter region has been opened. */
  OPENED_REGION_A,

  /** The second daughter region has been opened. */
  OPENED_REGION_B,

  /** About to invoke the postSplit coprocessor hook. */
  BEFORE_POST_SPLIT_HOOK,

  /** The postSplit coprocessor hook has returned. */
  AFTER_POST_SPLIT_HOOK,

  /**
   * Point of no return.
   * Once this is journaled the transaction cannot be recovered other than
   * by crashing out the regionserver.
   */
  PONR
}
179 
180   static class JournalEntry {
181     public JournalEntryType type;
182     public long timestamp;
183 
184     public JournalEntry(JournalEntryType type) {
185       this(type, EnvironmentEdgeManager.currentTimeMillis());
186     }
187 
188     public JournalEntry(JournalEntryType type, long timestamp) {
189       this.type = type;
190       this.timestamp = timestamp;
191     }
192 
193     @Override
194     public String toString() {
195       StringBuilder sb = new StringBuilder();
196       sb.append(type);
197       sb.append(" at ");
198       sb.append(timestamp);
199       return sb.toString();
200     }
201   }
202 
/*
 * Journal of how far the split transaction has progressed.
 * Entries are appended as each step is reached; the constructor adds the
 * first one (STARTED).
 */
private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
207 
/**
 * Creates a split transaction and opens its journal with a
 * {@link JournalEntryType#STARTED} entry.
 * @param r Region to split
 * @param splitrow Row to split around; {@link #prepare()} rejects a null value
 */
public SplitTransaction(final HRegion r, final byte [] splitrow) {
  this.splitrow = splitrow;
  this.parent = r;
  journal.add(new JournalEntry(JournalEntryType.STARTED));
}
218 
219   /**
220    * Does checks on split inputs.
221    * @return <code>true</code> if the region is splittable else
222    * <code>false</code> if it is not (e.g. its already closed, etc.).
223    */
224   public boolean prepare() {
225     if (!this.parent.isSplittable()) return false;
226     // Split key can be null if this region is unsplittable; i.e. has refs.
227     if (this.splitrow == null) return false;
228     HRegionInfo hri = this.parent.getRegionInfo();
229     parent.prepareToSplit();
230     // Check splitrow.
231     byte [] startKey = hri.getStartKey();
232     byte [] endKey = hri.getEndKey();
233     if (Bytes.equals(startKey, splitrow) ||
234         !this.parent.getRegionInfo().containsRow(splitrow)) {
235       LOG.info("Split row is not inside region key range or is equal to " +
236           "startkey: " + Bytes.toStringBinary(this.splitrow));
237       return false;
238     }
239     long rid = getDaughterRegionIdTimestamp(hri);
240     this.hri_a = new HRegionInfo(hri.getTable(), startKey, this.splitrow, false, rid);
241     this.hri_b = new HRegionInfo(hri.getTable(), this.splitrow, endKey, false, rid);
242     this.journal.add(new JournalEntry(JournalEntryType.PREPARED));
243     return true;
244   }
245 
246   /**
247    * Calculate daughter regionid to use.
248    * @param hri Parent {@link HRegionInfo}
249    * @return Daughter region id (timestamp) to use.
250    */
251   private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
252     long rid = EnvironmentEdgeManager.currentTimeMillis();
253     // Regionid is timestamp.  Can't be less than that of parent else will insert
254     // at wrong location in hbase:meta (See HBASE-710).
255     if (rid < hri.getRegionId()) {
256       LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
257         " but current time here is " + rid);
258       rid = hri.getRegionId() + 1;
259     }
260     return rid;
261   }
262 
263   private static IOException closedByOtherException = new IOException(
264       "Failed to close region: already closed by another thread");
265 
266   /**
267    * Prepare the regions and region files.
268    * @param server Hosting server instance.  Can be null when testing (won't try
269    * and update in zk if a null server)
270    * @param services Used to online/offline regions.
271    * @throws IOException If thrown, transaction failed.
272    *    Call {@link #rollback(Server, RegionServerServices)}
273    * @return Regions created
274    */
275   /* package */PairOfSameType<HRegion> createDaughters(final Server server,
276       final RegionServerServices services) throws IOException {
277     LOG.info("Starting split of region " + this.parent);
278     if ((server != null && server.isStopped()) ||
279         (services != null && services.isStopping())) {
280       throw new IOException("Server is stopped or stopping");
281     }
282     assert !this.parent.lock.writeLock().isHeldByCurrentThread():
283       "Unsafe to hold write lock while performing RPCs";
284 
285     journal.add(new JournalEntry(JournalEntryType.BEFORE_PRE_SPLIT_HOOK));
286 
287     // Coprocessor callback
288     if (this.parent.getCoprocessorHost() != null) {
289       // TODO: Remove one of these
290       this.parent.getCoprocessorHost().preSplit();
291       this.parent.getCoprocessorHost().preSplit(this.splitrow);
292     }
293 
294     journal.add(new JournalEntry(JournalEntryType.AFTER_PRE_SPLIT_HOOK));
295 
296     // If true, no cluster to write meta edits to or to update znodes in.
297     boolean testing = server == null? true:
298         server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
299     this.fileSplitTimeout = testing ? this.fileSplitTimeout :
300         server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
301           this.fileSplitTimeout);
302 
303     PairOfSameType<HRegion> daughterRegions = stepsBeforePONR(server, services, testing);
304 
305     List<Mutation> metaEntries = new ArrayList<Mutation>();
306     if (this.parent.getCoprocessorHost() != null) {
307       if (this.parent.getCoprocessorHost().
308           preSplitBeforePONR(this.splitrow, metaEntries)) {
309         throw new IOException("Coprocessor bypassing region "
310             + this.parent.getRegionNameAsString() + " split.");
311       }
312       try {
313         for (Mutation p : metaEntries) {
314           HRegionInfo.parseRegionName(p.getRow());
315         }
316       } catch (IOException e) {
317         LOG.error("Row key of mutation from coprossor is not parsable as region name."
318             + "Mutations from coprocessor should only for hbase:meta table.");
319         throw e;
320       }
321     }
322 
323     // This is the point of no return.  Adding subsequent edits to .META. as we
324     // do below when we do the daughter opens adding each to .META. can fail in
325     // various interesting ways the most interesting of which is a timeout
326     // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
327     // then subsequent failures need to crash out this regionserver; the
328     // server shutdown processing should be able to fix-up the incomplete split.
329     // The offlined parent will have the daughters as extra columns.  If
330     // we leave the daughter regions in place and do not remove them when we
331     // crash out, then they will have their references to the parent in place
332     // still and the server shutdown fixup of .META. will point to these
333     // regions.
334     // We should add PONR JournalEntry before offlineParentInMeta,so even if
335     // OfflineParentInMeta timeout,this will cause regionserver exit,and then
336     // master ServerShutdownHandler will fix daughter & avoid data loss. (See
337     // HBase-4562).
338     this.journal.add(new JournalEntry(JournalEntryType.PONR));
339 
340     // Edit parent in meta.  Offlines parent region and adds splita and splitb
341     // as an atomic update. See HBASE-7721. This update to META makes the region
342     // will determine whether the region is split or not in case of failures.
343     // If it is successful, master will roll-forward, if not, master will rollback
344     // and assign the parent region.
345     if (!testing && useZKForAssignment) {
346       if (metaEntries == null || metaEntries.isEmpty()) {
347         MetaEditor.splitRegion(server.getCatalogTracker(), parent.getRegionInfo(), daughterRegions
348             .getFirst().getRegionInfo(), daughterRegions.getSecond().getRegionInfo(), server
349             .getServerName());
350       } else {
351         offlineParentInMetaAndputMetaEntries(server.getCatalogTracker(), parent.getRegionInfo(),
352           daughterRegions.getFirst().getRegionInfo(), daughterRegions.getSecond().getRegionInfo(),
353           server.getServerName(), metaEntries);
354       }
355     } else if (services != null && !useZKForAssignment) {
356       if (!services.reportRegionStateTransition(TransitionCode.SPLIT_PONR, parent.getRegionInfo(),
357         hri_a, hri_b)) {
358         // Passed PONR, let SSH clean it up
359         throw new IOException("Failed to notify master that split passed PONR: "
360             + parent.getRegionInfo().getRegionNameAsString());
361       }
362     }
363     return daughterRegions;
364   }
365 
366   public PairOfSameType<HRegion> stepsBeforePONR(final Server server,
367       final RegionServerServices services, boolean testing) throws IOException {
368     // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
369     // have zookeeper so don't do zk stuff if server or zookeeper is null
370     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
371       try {
372         createNodeSplitting(server.getZooKeeper(),
373           parent.getRegionInfo(), server.getServerName(), hri_a, hri_b);
374       } catch (KeeperException e) {
375         throw new IOException("Failed creating PENDING_SPLIT znode on " +
376           this.parent.getRegionNameAsString(), e);
377       }
378     } else if (services != null && !useZKForAssignment) {
379       if (!services.reportRegionStateTransition(TransitionCode.READY_TO_SPLIT,
380         parent.getRegionInfo(), hri_a, hri_b)) {
381         throw new IOException("Failed to get ok from master to split "
382             + parent.getRegionNameAsString());
383       }
384     }
385     this.journal.add(new JournalEntry(JournalEntryType.SET_SPLITTING_IN_ZK));
386     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
387       // After creating the split node, wait for master to transition it
388       // from PENDING_SPLIT to SPLITTING so that we can move on. We want master
389       // knows about it and won't transition any region which is splitting.
390       znodeVersion = getZKNode(server, services);
391     }
392 
393     this.parent.getRegionFileSystem().createSplitsDir();
394     this.journal.add(new JournalEntry(JournalEntryType.CREATE_SPLIT_DIR));
395 
396     Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
397     Exception exceptionToThrow = null;
398     try{
399       hstoreFilesToSplit = this.parent.close(false);
400     } catch (Exception e) {
401       exceptionToThrow = e;
402     }
403     if (exceptionToThrow == null && hstoreFilesToSplit == null) {
404       // The region was closed by a concurrent thread.  We can't continue
405       // with the split, instead we must just abandon the split.  If we
406       // reopen or split this could cause problems because the region has
407       // probably already been moved to a different server, or is in the
408       // process of moving to a different server.
409       exceptionToThrow = closedByOtherException;
410     }
411     if (exceptionToThrow != closedByOtherException) {
412       this.journal.add(new JournalEntry(JournalEntryType.CLOSED_PARENT_REGION));
413     }
414     if (exceptionToThrow != null) {
415       if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
416       throw new IOException(exceptionToThrow);
417     }
418     if (!testing) {
419       services.removeFromOnlineRegions(this.parent, null);
420     }
421     this.journal.add(new JournalEntry(JournalEntryType.OFFLINED_PARENT));
422 
423     // TODO: If splitStoreFiles were multithreaded would we complete steps in
424     // less elapsed time?  St.Ack 20100920
425     //
426     // splitStoreFiles creates daughter region dirs under the parent splits dir
427     // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
428     // clean this up.
429     splitStoreFiles(hstoreFilesToSplit);
430 
431     // Log to the journal that we are creating region A, the first daughter
432     // region.  We could fail halfway through.  If we do, we could have left
433     // stuff in fs that needs cleanup -- a storefile or two.  Thats why we
434     // add entry to journal BEFORE rather than AFTER the change.
435     this.journal.add(new JournalEntry(JournalEntryType.STARTED_REGION_A_CREATION));
436     HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
437 
438     // Ditto
439     this.journal.add(new JournalEntry(JournalEntryType.STARTED_REGION_B_CREATION));
440     HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
441     return new PairOfSameType<HRegion>(a, b);
442   }
443 
444   /**
445    * Perform time consuming opening of the daughter regions.
446    * @param server Hosting server instance.  Can be null when testing (won't try
447    * and update in zk if a null server)
448    * @param services Used to online/offline regions.
449    * @param a first daughter region
450    * @param a second daughter region
451    * @throws IOException If thrown, transaction failed.
452    *          Call {@link #rollback(Server, RegionServerServices)}
453    */
454   /* package */void openDaughters(final Server server,
455       final RegionServerServices services, HRegion a, HRegion b)
456       throws IOException {
457     boolean stopped = server != null && server.isStopped();
458     boolean stopping = services != null && services.isStopping();
459     // TODO: Is this check needed here?
460     if (stopped || stopping) {
461       LOG.info("Not opening daughters " +
462           b.getRegionInfo().getRegionNameAsString() +
463           " and " +
464           a.getRegionInfo().getRegionNameAsString() +
465           " because stopping=" + stopping + ", stopped=" + stopped);
466     } else {
467       // Open daughters in parallel.
468       DaughterOpener aOpener = new DaughterOpener(server, a);
469       DaughterOpener bOpener = new DaughterOpener(server, b);
470       aOpener.start();
471       bOpener.start();
472       try {
473         aOpener.join();
474         if (aOpener.getException() == null) {
475           journal.add(new JournalEntry(JournalEntryType.OPENED_REGION_A));
476         }
477         bOpener.join();
478         if (bOpener.getException() == null) {
479           journal.add(new JournalEntry(JournalEntryType.OPENED_REGION_B));
480         }
481       } catch (InterruptedException e) {
482         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
483       }
484       if (aOpener.getException() != null) {
485         throw new IOException("Failed " +
486           aOpener.getName(), aOpener.getException());
487       }
488       if (bOpener.getException() != null) {
489         throw new IOException("Failed " +
490           bOpener.getName(), bOpener.getException());
491       }
492       if (services != null) {
493         try {
494           if (useZKForAssignment) {
495             // add 2nd daughter first (see HBASE-4335)
496             services.postOpenDeployTasks(b, server.getCatalogTracker());
497           } else if (!services.reportRegionStateTransition(TransitionCode.SPLIT,
498               parent.getRegionInfo(), hri_a, hri_b)) {
499             throw new IOException("Failed to report split region to master: "
500               + parent.getRegionInfo().getShortNameToLog());
501           }
502           // Should add it to OnlineRegions
503           services.addToOnlineRegions(b);
504           if (useZKForAssignment) {
505             services.postOpenDeployTasks(a, server.getCatalogTracker());
506           }
507           services.addToOnlineRegions(a);
508         } catch (KeeperException ke) {
509           throw new IOException(ke);
510         }
511       }
512     }
513   }
514 
515   /**
516    * Finish off split transaction, transition the zknode
517    * @param server Hosting server instance.  Can be null when testing (won't try
518    * and update in zk if a null server)
519    * @param services Used to online/offline regions.
520    * @param a first daughter region
521    * @param a second daughter region
522    * @throws IOException If thrown, transaction failed.
523    *          Call {@link #rollback(Server, RegionServerServices)}
524    */
525   /* package */void transitionZKNode(final Server server,
526       final RegionServerServices services, HRegion a, HRegion b)
527       throws IOException {
528     // Tell master about split by updating zk.  If we fail, abort.
529     if (server != null && server.getZooKeeper() != null) {
530       try {
531         this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
532           parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
533           server.getServerName(), this.znodeVersion,
534           RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
535 
536         int spins = 0;
537         // Now wait for the master to process the split. We know it's done
538         // when the znode is deleted. The reason we keep tickling the znode is
539         // that it's possible for the master to miss an event.
540         do {
541           if (spins % 10 == 0) {
542             LOG.debug("Still waiting on the master to process the split for " +
543                 this.parent.getRegionInfo().getEncodedName());
544           }
545           Thread.sleep(100);
546           // When this returns -1 it means the znode doesn't exist
547           this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
548             parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
549             server.getServerName(), this.znodeVersion,
550             RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT);
551           spins++;
552         } while (this.znodeVersion != -1 && !server.isStopped()
553             && !services.isStopping());
554       } catch (Exception e) {
555         if (e instanceof InterruptedException) {
556           Thread.currentThread().interrupt();
557         }
558         throw new IOException("Failed telling master about split", e);
559       }
560     }
561 
562     
563 
564     // Leaving here, the splitdir with its dross will be in place but since the
565     // split was successful, just leave it; it'll be cleaned when parent is
566     // deleted and cleaned up.
567   }
568 
569   /**
570    * Wait for the splitting node to be transitioned from pending_split
571    * to splitting by master. That's how we are sure master has processed
572    * the event and is good with us to move on. If we don't get any update,
573    * we periodically transition the node so that master gets the callback.
574    * If the node is removed or is not in pending_split state any more,
575    * we abort the split.
576    */
577   private int getZKNode(final Server server,
578       final RegionServerServices services) throws IOException {
579     // Wait for the master to process the pending_split.
580     try {
581       int spins = 0;
582       Stat stat = new Stat();
583       ZooKeeperWatcher zkw = server.getZooKeeper();
584       ServerName expectedServer = server.getServerName();
585       String node = parent.getRegionInfo().getEncodedName();
586       while (!(server.isStopped() || services.isStopping())) {
587         if (spins % 5 == 0) {
588           LOG.debug("Still waiting for master to process "
589             + "the pending_split for " + node);
590           transitionSplittingNode(zkw, parent.getRegionInfo(),
591             hri_a, hri_b, expectedServer, -1, RS_ZK_REQUEST_REGION_SPLIT,
592             RS_ZK_REQUEST_REGION_SPLIT);
593         }
594         Thread.sleep(100);
595         spins++;
596         byte [] data = ZKAssign.getDataNoWatch(zkw, node, stat);
597         if (data == null) {
598           throw new IOException("Data is null, splitting node "
599             + node + " no longer exists");
600         }
601         RegionTransition rt = RegionTransition.parseFrom(data);
602         EventType et = rt.getEventType();
603         if (et == RS_ZK_REGION_SPLITTING) {
604           ServerName serverName = rt.getServerName();
605           if (!serverName.equals(expectedServer)) {
606             throw new IOException("Splitting node " + node + " is for "
607               + serverName + ", not us " + expectedServer);
608           }
609           byte [] payloadOfSplitting = rt.getPayload();
610           List<HRegionInfo> splittingRegions = HRegionInfo.parseDelimitedFrom(
611             payloadOfSplitting, 0, payloadOfSplitting.length);
612           assert splittingRegions.size() == 2;
613           HRegionInfo a = splittingRegions.get(0);
614           HRegionInfo b = splittingRegions.get(1);
615           if (!(hri_a.equals(a) && hri_b.equals(b))) {
616             throw new IOException("Splitting node " + node + " is for " + a + ", "
617               + b + ", not expected daughters: " + hri_a + ", " + hri_b);
618           }
619           // Master has processed it.
620           return stat.getVersion();
621         }
622         if (et != RS_ZK_REQUEST_REGION_SPLIT) {
623           throw new IOException("Splitting node " + node
624             + " moved out of splitting to " + et);
625         }
626       }
627       // Server is stopping/stopped
628       throw new IOException("Server is "
629         + (services.isStopping() ? "stopping" : "stopped"));
630     } catch (Exception e) {
631       if (e instanceof InterruptedException) {
632         Thread.currentThread().interrupt();
633       }
634       throw new IOException("Failed getting SPLITTING znode on "
635         + parent.getRegionNameAsString(), e);
636     }
637   }
638 
639   /**
640    * Run the transaction.
641    * @param server Hosting server instance.  Can be null when testing (won't try
642    * and update in zk if a null server)
643    * @param services Used to online/offline regions.
644    * @throws IOException If thrown, transaction failed.
645    *          Call {@link #rollback(Server, RegionServerServices)}
646    * @return Regions created
647    * @throws IOException
648    * @see #rollback(Server, RegionServerServices)
649    */
650   public PairOfSameType<HRegion> execute(final Server server,
651       final RegionServerServices services)
652   throws IOException {
653     useZKForAssignment =
654         server == null ? true : ConfigUtil.useZKForAssignment(server.getConfiguration());
655     PairOfSameType<HRegion> regions = createDaughters(server, services);
656     if (this.parent.getCoprocessorHost() != null) {
657       this.parent.getCoprocessorHost().preSplitAfterPONR();
658     }
659     return stepsAfterPONR(server, services, regions);
660   }
661 
662   public PairOfSameType<HRegion> stepsAfterPONR(final Server server,
663       final RegionServerServices services, PairOfSameType<HRegion> regions)
664       throws IOException {
665     openDaughters(server, services, regions.getFirst(), regions.getSecond());
666     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
667       transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
668     }
669     journal.add(new JournalEntry(JournalEntryType.BEFORE_POST_SPLIT_HOOK));
670     // Coprocessor callback
671     if (this.parent.getCoprocessorHost() != null) {
672       this.parent.getCoprocessorHost().postSplit(regions.getFirst(), regions.getSecond());
673     }
674     journal.add(new JournalEntry(JournalEntryType.AFTER_POST_SPLIT_HOOK));
675     return regions;
676   }
677 
678   private void offlineParentInMetaAndputMetaEntries(CatalogTracker catalogTracker,
679       HRegionInfo parent, HRegionInfo splitA, HRegionInfo splitB,
680       ServerName serverName, List<Mutation> metaEntries) throws IOException {
681     List<Mutation> mutations = metaEntries;
682     HRegionInfo copyOfParent = new HRegionInfo(parent);
683     copyOfParent.setOffline(true);
684     copyOfParent.setSplit(true);
685 
686     //Put for parent
687     Put putParent = MetaEditor.makePutFromRegionInfo(copyOfParent);
688     MetaEditor.addDaughtersToPut(putParent, splitA, splitB);
689     mutations.add(putParent);
690     
691     //Puts for daughters
692     Put putA = MetaEditor.makePutFromRegionInfo(splitA);
693     Put putB = MetaEditor.makePutFromRegionInfo(splitB);
694 
695     addLocation(putA, serverName, 1); //these are new regions, openSeqNum = 1 is fine.
696     addLocation(putB, serverName, 1);
697     mutations.add(putA);
698     mutations.add(putB);
699     MetaEditor.mutateMetaTable(catalogTracker, mutations);
700   }
701 
702   public Put addLocation(final Put p, final ServerName sn, long openSeqNum) {
703     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
704       Bytes.toBytes(sn.getHostAndPort()));
705     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
706       Bytes.toBytes(sn.getStartcode()));
707     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SEQNUM_QUALIFIER,
708         Bytes.toBytes(openSeqNum));
709     return p;
710   }
711 
712   /*
713    * Open daughter region in its own thread.
714    * If we fail, abort this hosting server.
715    */
716   class DaughterOpener extends HasThread {
717     private final Server server;
718     private final HRegion r;
719     private Throwable t = null;
720 
721     DaughterOpener(final Server s, final HRegion r) {
722       super((s == null? "null-services": s.getServerName()) +
723         "-daughterOpener=" + r.getRegionInfo().getEncodedName());
724       setDaemon(true);
725       this.server = s;
726       this.r = r;
727     }
728 
729     /**
730      * @return Null if open succeeded else exception that causes us fail open.
731      * Call it after this thread exits else you may get wrong view on result.
732      */
733     Throwable getException() {
734       return this.t;
735     }
736 
737     @Override
738     public void run() {
739       try {
740         openDaughterRegion(this.server, r);
741       } catch (Throwable t) {
742         this.t = t;
743       }
744     }
745   }
746 
747   /**
748    * Open daughter regions, add them to online list and update meta.
749    * @param server
750    * @param daughter
751    * @throws IOException
752    * @throws KeeperException
753    */
754   void openDaughterRegion(final Server server, final HRegion daughter)
755   throws IOException, KeeperException {
756     HRegionInfo hri = daughter.getRegionInfo();
757     LoggingProgressable reporter = server == null ? null
758         : new LoggingProgressable(hri, server.getConfiguration().getLong(
759             "hbase.regionserver.split.daughter.open.log.interval", 10000));
760     daughter.openHRegion(reporter);
761   }
762 
763   static class LoggingProgressable implements CancelableProgressable {
764     private final HRegionInfo hri;
765     private long lastLog = -1;
766     private final long interval;
767 
768     LoggingProgressable(final HRegionInfo hri, final long interval) {
769       this.hri = hri;
770       this.interval = interval;
771     }
772 
773     @Override
774     public boolean progress() {
775       long now = EnvironmentEdgeManager.currentTimeMillis();
776       if (now - lastLog > this.interval) {
777         LOG.info("Opening " + this.hri.getRegionNameAsString());
778         this.lastLog = now;
779       }
780       return true;
781     }
782   }
783 
784   private void splitStoreFiles(final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
785       throws IOException {
786     if (hstoreFilesToSplit == null) {
787       // Could be null because close didn't succeed -- for now consider it fatal
788       throw new IOException("Close returned empty list of StoreFiles");
789     }
790     // The following code sets up a thread pool executor with as many slots as
791     // there's files to split. It then fires up everything, waits for
792     // completion and finally checks for any exception
793     int nbFiles = hstoreFilesToSplit.size();
794     if (nbFiles == 0) {
795       // no file needs to be splitted.
796       return;
797     }
798     ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
799     builder.setNameFormat("StoreFileSplitter-%1$d");
800     ThreadFactory factory = builder.build();
801     ThreadPoolExecutor threadPool =
802       (ThreadPoolExecutor) Executors.newFixedThreadPool(nbFiles, factory);
803     List<Future<Void>> futures = new ArrayList<Future<Void>>(nbFiles);
804 
805     // Split each store file.
806     for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
807       for (StoreFile sf: entry.getValue()) {
808         StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
809         futures.add(threadPool.submit(sfs));
810       }
811     }
812     // Shutdown the pool
813     threadPool.shutdown();
814 
815     // Wait for all the tasks to finish
816     try {
817       boolean stillRunning = !threadPool.awaitTermination(
818           this.fileSplitTimeout, TimeUnit.MILLISECONDS);
819       if (stillRunning) {
820         threadPool.shutdownNow();
821         // wait for the thread to shutdown completely.
822         while (!threadPool.isTerminated()) {
823           Thread.sleep(50);
824         }
825         throw new IOException("Took too long to split the" +
826             " files and create the references, aborting split");
827       }
828     } catch (InterruptedException e) {
829       throw (InterruptedIOException)new InterruptedIOException().initCause(e);
830     }
831 
832     // Look for any exception
833     for (Future<Void> future: futures) {
834       try {
835         future.get();
836       } catch (InterruptedException e) {
837         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
838       } catch (ExecutionException e) {
839         throw new IOException(e);
840       }
841     }
842   }
843 
844   private void splitStoreFile(final byte[] family, final StoreFile sf) throws IOException {
845     HRegionFileSystem fs = this.parent.getRegionFileSystem();
846     String familyName = Bytes.toString(family);
847     fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow, false);
848     fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow, true);
849   }
850 
851   /**
852    * Utility class used to do the file splitting / reference writing
853    * in parallel instead of sequentially.
854    */
855   class StoreFileSplitter implements Callable<Void> {
856     private final byte[] family;
857     private final StoreFile sf;
858 
859     /**
860      * Constructor that takes what it needs to split
861      * @param family Family that contains the store file
862      * @param sf which file
863      */
864     public StoreFileSplitter(final byte[] family, final StoreFile sf) {
865       this.sf = sf;
866       this.family = family;
867     }
868 
869     public Void call() throws IOException {
870       splitStoreFile(family, sf);
871       return null;
872     }
873   }
874 
875   /**
876    * @param server Hosting server instance (May be null when testing).
877    * @param services
878    * @throws IOException If thrown, rollback failed.  Take drastic action.
879    * @return True if we successfully rolled back, false if we got to the point
880    * of no return and so now need to abort the server to minimize damage.
881    */
882   @SuppressWarnings("deprecation")
883   public boolean rollback(final Server server, final RegionServerServices services)
884   throws IOException {
885     // Coprocessor callback
886     if (this.parent.getCoprocessorHost() != null) {
887       this.parent.getCoprocessorHost().preRollBackSplit();
888     }
889 
890     boolean result = true;
891     ListIterator<JournalEntry> iterator =
892       this.journal.listIterator(this.journal.size());
893     // Iterate in reverse.
894     while (iterator.hasPrevious()) {
895       JournalEntry je = iterator.previous();
896       switch(je.type) {
897 
898       case SET_SPLITTING_IN_ZK:
899         if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
900           cleanZK(server, this.parent.getRegionInfo());
901         } else if (services != null
902             && !useZKForAssignment
903             && !services.reportRegionStateTransition(TransitionCode.SPLIT_REVERTED,
904               parent.getRegionInfo(), hri_a, hri_b)) {
905           return false;
906         }
907         break;
908 
909       case CREATE_SPLIT_DIR:
910         this.parent.writestate.writesEnabled = true;
911         this.parent.getRegionFileSystem().cleanupSplitsDir();
912         break;
913 
914       case CLOSED_PARENT_REGION:
915         try {
916           // So, this returns a seqid but if we just closed and then reopened, we
917           // should be ok. On close, we flushed using sequenceid obtained from
918           // hosting regionserver so no need to propagate the sequenceid returned
919           // out of initialize below up into regionserver as we normally do.
920           // TODO: Verify.
921           this.parent.initialize();
922         } catch (IOException e) {
923           LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
924             this.parent.getRegionNameAsString(), e);
925           throw new RuntimeException(e);
926         }
927         break;
928 
929       case STARTED_REGION_A_CREATION:
930         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
931         break;
932 
933       case STARTED_REGION_B_CREATION:
934         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
935         break;
936 
937       case OFFLINED_PARENT:
938         if (services != null) services.addToOnlineRegions(this.parent);
939         break;
940 
941       case PONR:
942         // We got to the point-of-no-return so we need to just abort. Return
943         // immediately.  Do not clean up created daughter regions.  They need
944         // to be in place so we don't delete the parent region mistakenly.
945         // See HBASE-3872.
946         return false;
947 
948       // Informational only cases
949       case STARTED:
950       case PREPARED:
951       case BEFORE_PRE_SPLIT_HOOK:
952       case AFTER_PRE_SPLIT_HOOK:
953       case BEFORE_POST_SPLIT_HOOK:
954       case AFTER_POST_SPLIT_HOOK:
955       case OPENED_REGION_A:
956       case OPENED_REGION_B:
957         break;
958 
959       default:
960         throw new RuntimeException("Unhandled journal entry: " + je);
961       }
962     }
963     // Coprocessor callback
964     if (this.parent.getCoprocessorHost() != null) {
965       this.parent.getCoprocessorHost().postRollBackSplit();
966     }
967     return result;
968   }
969 
970   HRegionInfo getFirstDaughter() {
971     return hri_a;
972   }
973 
974   HRegionInfo getSecondDaughter() {
975     return hri_b;
976   }
977 
978   private static void cleanZK(final Server server, final HRegionInfo hri) {
979     try {
980       // Only delete if its in expected state; could have been hijacked.
981       if (!ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
982           RS_ZK_REQUEST_REGION_SPLIT, server.getServerName())) {
983         ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
984           RS_ZK_REGION_SPLITTING, server.getServerName());
985       }
986     } catch (KeeperException.NoNodeException e) {
987       LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
988     } catch (KeeperException e) {
989       server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
990     }
991   }
992 
993   /**
994    * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region.
995    * Create it ephemeral in case regionserver dies mid-split.
996    *
997    * <p>Does not transition nodes from other states.  If a node already exists
998    * for this region, a {@link NodeExistsException} will be thrown.
999    *
1000    * @param zkw zk reference
1001    * @param region region to be created as offline
1002    * @param serverName server event originates from
1003    * @throws KeeperException
1004    * @throws IOException
1005    */
1006   public static void createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
1007       final ServerName serverName, final HRegionInfo a,
1008       final HRegionInfo b) throws KeeperException, IOException {
1009     LOG.debug(zkw.prefix("Creating ephemeral node for " +
1010       region.getEncodedName() + " in PENDING_SPLIT state"));
1011     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
1012     RegionTransition rt = RegionTransition.createRegionTransition(
1013       RS_ZK_REQUEST_REGION_SPLIT, region.getRegionName(), serverName, payload);
1014     String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
1015     if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
1016       throw new IOException("Failed create of ephemeral " + node);
1017     }
1018   }
1019 
1020   /**
1021    * Transitions an existing ephemeral node for the specified region which is
1022    * currently in the begin state to be in the end state. Master cleans up the
1023    * final SPLIT znode when it reads it (or if we crash, zk will clean it up).
1024    *
1025    * <p>Does not transition nodes from other states. If for some reason the
1026    * node could not be transitioned, the method returns -1. If the transition
1027    * is successful, the version of the node after transition is returned.
1028    *
1029    * <p>This method can fail and return false for three different reasons:
1030    * <ul><li>Node for this region does not exist</li>
1031    * <li>Node for this region is not in the begin state</li>
1032    * <li>After verifying the begin state, update fails because of wrong version
1033    * (this should never actually happen since an RS only does this transition
1034    * following a transition to the begin state. If two RS are conflicting, one would
1035    * fail the original transition to the begin state and not this transition)</li>
1036    * </ul>
1037    *
1038    * <p>Does not set any watches.
1039    *
1040    * <p>This method should only be used by a RegionServer when splitting a region.
1041    *
1042    * @param zkw zk reference
1043    * @param parent region to be transitioned to opened
1044    * @param a Daughter a of split
1045    * @param b Daughter b of split
1046    * @param serverName server event originates from
1047    * @param znodeVersion expected version of data before modification
1048    * @param beginState the expected current state the znode should be
1049    * @param endState the state to be transition to
1050    * @return version of node after transition, -1 if unsuccessful transition
1051    * @throws KeeperException if unexpected zookeeper exception
1052    * @throws IOException
1053    */
1054   public static int transitionSplittingNode(ZooKeeperWatcher zkw,
1055       HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
1056       final int znodeVersion, final EventType beginState,
1057       final EventType endState) throws KeeperException, IOException {
1058     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
1059     return ZKAssign.transitionNode(zkw, parent, serverName,
1060       beginState, endState, znodeVersion, payload);
1061   }
1062 
1063   List<JournalEntry> getJournal() {
1064     return journal;
1065   }
1066 }