View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
22  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
23  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
24  
25  import java.io.IOException;
26  import java.io.InterruptedIOException;
27  import java.util.ArrayList;
28  import java.util.List;
29  import java.util.ListIterator;
30  import java.util.Map;
31  import java.util.concurrent.Callable;
32  import java.util.concurrent.ExecutionException;
33  import java.util.concurrent.Executors;
34  import java.util.concurrent.Future;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.ThreadPoolExecutor;
37  import java.util.concurrent.TimeUnit;
38  
39  import org.apache.commons.logging.Log;
40  import org.apache.commons.logging.LogFactory;
41  import org.apache.hadoop.fs.Path;
42  import org.apache.hadoop.hbase.classification.InterfaceAudience;
43  import org.apache.hadoop.hbase.HConstants;
44  import org.apache.hadoop.hbase.HRegionInfo;
45  import org.apache.hadoop.hbase.RegionTransition;
46  import org.apache.hadoop.hbase.Server;
47  import org.apache.hadoop.hbase.ServerName;
48  import org.apache.hadoop.hbase.catalog.CatalogTracker;
49  import org.apache.hadoop.hbase.catalog.MetaEditor;
50  import org.apache.hadoop.hbase.client.Mutation;
51  import org.apache.hadoop.hbase.client.Put;
52  import org.apache.hadoop.hbase.executor.EventType;
53  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
54  import org.apache.hadoop.hbase.util.Bytes;
55  import org.apache.hadoop.hbase.util.CancelableProgressable;
56  import org.apache.hadoop.hbase.util.ConfigUtil;
57  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
58  import org.apache.hadoop.hbase.util.FSUtils;
59  import org.apache.hadoop.hbase.util.HasThread;
60  import org.apache.hadoop.hbase.util.Pair;
61  import org.apache.hadoop.hbase.util.PairOfSameType;
62  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
63  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
64  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
65  import org.apache.zookeeper.KeeperException;
66  import org.apache.zookeeper.KeeperException.NodeExistsException;
67  import org.apache.zookeeper.data.Stat;
68  
69  import com.google.common.util.concurrent.ThreadFactoryBuilder;
70  
/**
 * Executes region split as a "transaction".  Call {@link #prepare()} to setup
 * the transaction, {@link #execute(Server, RegionServerServices)} to run the
 * transaction and {@link #rollback(Server, RegionServerServices)} to cleanup if execute fails.
 *
 * <p>Here is an example of how you would use this class:
 * <pre>
 *  SplitTransaction st = new SplitTransaction(parent, midKey);
 *  if (!st.prepare()) return;
 *  try {
 *    st.execute(server, services);
 *  } catch (IOException ioe) {
 *    try {
 *      st.rollback(server, services);
 *      return;
 *    } catch (RuntimeException e) {
 *      myAbortable.abort("Failed split, abort");
 *    }
 *  }
 * </pre>
 * <p>This class is not thread safe.  The caller needs to ensure the split is
 * run by one thread only.
 */
94  @InterfaceAudience.Private
95  public class SplitTransaction {
  /** Logger shared by every split transaction. */
  private static final Log LOG = LogFactory.getLog(SplitTransaction.class);

  /*
   * Region to split
   */
  private final HRegion parent;
  // Region infos of the first and second daughter; both are assigned in prepare().
  private HRegionInfo hri_a;
  private HRegionInfo hri_b;
  // Starts at 30000; createDaughters() overwrites it with the configured
  // "hbase.regionserver.fileSplitTimeout" when not testing.
  // NOTE(review): presumably milliseconds -- confirm against splitStoreFiles.
  private long fileSplitTimeout = 30000;
  // Version of the splitting znode as last returned by getZKNode() or
  // transitionSplittingNode(); starts at -1.
  private int znodeVersion = -1;
  // Assigned at the start of execute(): true when there is no server, otherwise
  // whatever ConfigUtil.useZKForAssignment reports for the server configuration.
  boolean useZKForAssignment;

  /*
   * Row to split around
   */
  private final byte [] splitrow;
112 
113   /**
114    * Types to add to the transaction journal.
115    * Each enum is a step in the split transaction. Used to figure how much
116    * we need to rollback.
117    */
118   static enum JournalEntryType {
119     /**
120      * Started
121      */
122     STARTED,
123     /**
124      * Prepared (after table lock)
125      */
126     PREPARED,
127     /**
128      * Before preSplit coprocessor hook
129      */
130     BEFORE_PRE_SPLIT_HOOK,
131     /**
132      * After preSplit coprocessor hook
133      */
134     AFTER_PRE_SPLIT_HOOK,
135     /**
136      * Set region as in transition, set it into SPLITTING state.
137      */
138     SET_SPLITTING_IN_ZK,
139     /**
140      * We created the temporary split data directory.
141      */
142     CREATE_SPLIT_DIR,
143     /**
144      * Closed the parent region.
145      */
146     CLOSED_PARENT_REGION,
147     /**
148      * The parent has been taken out of the server's online regions list.
149      */
150     OFFLINED_PARENT,
151     /**
152      * Started in on creation of the first daughter region.
153      */
154     STARTED_REGION_A_CREATION,
155     /**
156      * Started in on the creation of the second daughter region.
157      */
158     STARTED_REGION_B_CREATION,
159     /**
160      * Opened the first daughter region
161      */
162     OPENED_REGION_A,
163     /**
164      * Opened the second daughter region
165      */
166     OPENED_REGION_B,
167     /**
168      * Before postSplit coprocessor hook
169      */
170     BEFORE_POST_SPLIT_HOOK,
171     /**
172      * After postSplit coprocessor hook
173      */
174     AFTER_POST_SPLIT_HOOK,
175     /**
176      * Point of no return.
177      * If we got here, then transaction is not recoverable other than by
178      * crashing out the regionserver.
179      */
180     PONR
181   }
182 
183   static class JournalEntry {
184     private JournalEntryType type;
185     private long timestamp;
186 
187     public JournalEntry(JournalEntryType type) {
188       this(type, EnvironmentEdgeManager.currentTimeMillis());
189     }
190 
191     public JournalEntry(JournalEntryType type, long timestamp) {
192       this.type = type;
193       this.timestamp = timestamp;
194     }
195 
196     @Override
197     public String toString() {
198       StringBuilder sb = new StringBuilder();
199       sb.append(type);
200       sb.append(" at ");
201       sb.append(timestamp);
202       return sb.toString();
203     }
204   }
205 
  /*
   * Journal of how far the split transaction has progressed.
   * Entries are appended in the order the steps are reached; the constructor
   * adds the first one (STARTED).
   */
  private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
210 
211   /**
212    * Constructor
213    * @param r Region to split
214    * @param splitrow Row to split around
215    */
216   public SplitTransaction(final HRegion r, final byte [] splitrow) {
217     this.parent = r;
218     this.splitrow = splitrow;
219     this.journal.add(new JournalEntry(JournalEntryType.STARTED));
220   }
221 
222   /**
223    * Does checks on split inputs.
224    * @return <code>true</code> if the region is splittable else
225    * <code>false</code> if it is not (e.g. its already closed, etc.).
226    */
227   public boolean prepare() {
228     if (!this.parent.isSplittable()) return false;
229     // Split key can be null if this region is unsplittable; i.e. has refs.
230     if (this.splitrow == null) return false;
231     HRegionInfo hri = this.parent.getRegionInfo();
232     parent.prepareToSplit();
233     // Check splitrow.
234     byte [] startKey = hri.getStartKey();
235     byte [] endKey = hri.getEndKey();
236     if (Bytes.equals(startKey, splitrow) ||
237         !this.parent.getRegionInfo().containsRow(splitrow)) {
238       LOG.info("Split row is not inside region key range or is equal to " +
239           "startkey: " + Bytes.toStringBinary(this.splitrow));
240       return false;
241     }
242     long rid = getDaughterRegionIdTimestamp(hri);
243     this.hri_a = new HRegionInfo(hri.getTable(), startKey, this.splitrow, false, rid);
244     this.hri_b = new HRegionInfo(hri.getTable(), this.splitrow, endKey, false, rid);
245     this.journal.add(new JournalEntry(JournalEntryType.PREPARED));
246     return true;
247   }
248 
249   /**
250    * Calculate daughter regionid to use.
251    * @param hri Parent {@link HRegionInfo}
252    * @return Daughter region id (timestamp) to use.
253    */
254   private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
255     long rid = EnvironmentEdgeManager.currentTimeMillis();
256     // Regionid is timestamp.  Can't be less than that of parent else will insert
257     // at wrong location in hbase:meta (See HBASE-710).
258     if (rid < hri.getRegionId()) {
259       LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
260         " but current time here is " + rid);
261       rid = hri.getRegionId() + 1;
262     }
263     return rid;
264   }
265 
  // Shared sentinel. stepsBeforePONR() picks this instance when closing the
  // parent returns null without throwing, and then compares against it by
  // identity (!=) to decide whether to journal CLOSED_PARENT_REGION.
  // It is created once, so its stack trace does not point at the failing split.
  private static IOException closedByOtherException = new IOException(
      "Failed to close region: already closed by another thread");
268 
269   /**
270    * Prepare the regions and region files.
271    * @param server Hosting server instance.  Can be null when testing (won't try
272    * and update in zk if a null server)
273    * @param services Used to online/offline regions.
274    * @throws IOException If thrown, transaction failed.
275    *    Call {@link #rollback(Server, RegionServerServices)}
276    * @return Regions created
277    */
278   /* package */PairOfSameType<HRegion> createDaughters(final Server server,
279       final RegionServerServices services) throws IOException {
280     LOG.info("Starting split of region " + this.parent);
281     if ((server != null && server.isStopped()) ||
282         (services != null && services.isStopping())) {
283       throw new IOException("Server is stopped or stopping");
284     }
285     assert !this.parent.lock.writeLock().isHeldByCurrentThread():
286       "Unsafe to hold write lock while performing RPCs";
287 
288     journal.add(new JournalEntry(JournalEntryType.BEFORE_PRE_SPLIT_HOOK));
289 
290     // Coprocessor callback
291     if (this.parent.getCoprocessorHost() != null) {
292       // TODO: Remove one of these
293       this.parent.getCoprocessorHost().preSplit();
294       this.parent.getCoprocessorHost().preSplit(this.splitrow);
295     }
296 
297     journal.add(new JournalEntry(JournalEntryType.AFTER_PRE_SPLIT_HOOK));
298 
299     // If true, no cluster to write meta edits to or to update znodes in.
300     boolean testing = server == null? true:
301         server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
302     this.fileSplitTimeout = testing ? this.fileSplitTimeout :
303         server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
304           this.fileSplitTimeout);
305 
306     PairOfSameType<HRegion> daughterRegions = stepsBeforePONR(server, services, testing);
307 
308     List<Mutation> metaEntries = new ArrayList<Mutation>();
309     if (this.parent.getCoprocessorHost() != null) {
310       if (this.parent.getCoprocessorHost().
311           preSplitBeforePONR(this.splitrow, metaEntries)) {
312         throw new IOException("Coprocessor bypassing region "
313             + this.parent.getRegionNameAsString() + " split.");
314       }
315       try {
316         for (Mutation p : metaEntries) {
317           HRegionInfo.parseRegionName(p.getRow());
318         }
319       } catch (IOException e) {
320         LOG.error("Row key of mutation from coprossor is not parsable as region name."
321             + "Mutations from coprocessor should only for hbase:meta table.");
322         throw e;
323       }
324     }
325 
326     // This is the point of no return.  Adding subsequent edits to .META. as we
327     // do below when we do the daughter opens adding each to .META. can fail in
328     // various interesting ways the most interesting of which is a timeout
329     // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
330     // then subsequent failures need to crash out this regionserver; the
331     // server shutdown processing should be able to fix-up the incomplete split.
332     // The offlined parent will have the daughters as extra columns.  If
333     // we leave the daughter regions in place and do not remove them when we
334     // crash out, then they will have their references to the parent in place
335     // still and the server shutdown fixup of .META. will point to these
336     // regions.
337     // We should add PONR JournalEntry before offlineParentInMeta,so even if
338     // OfflineParentInMeta timeout,this will cause regionserver exit,and then
339     // master ServerShutdownHandler will fix daughter & avoid data loss. (See
340     // HBase-4562).
341     this.journal.add(new JournalEntry(JournalEntryType.PONR));
342 
343     // Edit parent in meta.  Offlines parent region and adds splita and splitb
344     // as an atomic update. See HBASE-7721. This update to META makes the region
345     // will determine whether the region is split or not in case of failures.
346     // If it is successful, master will roll-forward, if not, master will rollback
347     // and assign the parent region.
348     if (!testing && useZKForAssignment) {
349       if (metaEntries == null || metaEntries.isEmpty()) {
350         MetaEditor.splitRegion(server.getCatalogTracker(), parent.getRegionInfo(), daughterRegions
351             .getFirst().getRegionInfo(), daughterRegions.getSecond().getRegionInfo(), server
352             .getServerName());
353       } else {
354         offlineParentInMetaAndputMetaEntries(server.getCatalogTracker(), parent.getRegionInfo(),
355           daughterRegions.getFirst().getRegionInfo(), daughterRegions.getSecond().getRegionInfo(),
356           server.getServerName(), metaEntries);
357       }
358     } else if (services != null && !useZKForAssignment) {
359       if (!services.reportRegionStateTransition(TransitionCode.SPLIT_PONR, parent.getRegionInfo(),
360         hri_a, hri_b)) {
361         // Passed PONR, let SSH clean it up
362         throw new IOException("Failed to notify master that split passed PONR: "
363             + parent.getRegionInfo().getRegionNameAsString());
364       }
365     }
366     return daughterRegions;
367   }
368 
  /**
   * Performs every split step that comes before the point of no return,
   * journaling each one as it goes: announce the split (znode or report to
   * master), create the splits dir, close the parent, take it out of the
   * online regions, split the store files and create both daughter regions.
   * @param server Hosting server instance; zk work is skipped when it or its
   * ZooKeeper watcher is null.
   * @param services Used to report to the master and to offline the parent.
   * @param testing When true the parent is not removed from the online regions.
   * @return The two daughter regions, first daughter first.
   * @throws IOException If any step fails, including when the parent turns out
   * to have been closed by another thread.
   */
  public PairOfSameType<HRegion> stepsBeforePONR(final Server server,
      final RegionServerServices services, boolean testing) throws IOException {
    // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
    // have zookeeper so don't do zk stuff if server or zookeeper is null
    if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
      try {
        createNodeSplitting(server.getZooKeeper(),
          parent.getRegionInfo(), server.getServerName(), hri_a, hri_b);
      } catch (KeeperException e) {
        throw new IOException("Failed creating PENDING_SPLIT znode on " +
          this.parent.getRegionNameAsString(), e);
      }
    } else if (services != null && !useZKForAssignment) {
      // Without ZK based assignment the master is asked directly.
      if (!services.reportRegionStateTransition(TransitionCode.READY_TO_SPLIT,
        parent.getRegionInfo(), hri_a, hri_b)) {
        throw new IOException("Failed to get ok from master to split "
            + parent.getRegionNameAsString());
      }
    }
    // Journaled on every path, including when neither branch above ran.
    this.journal.add(new JournalEntry(JournalEntryType.SET_SPLITTING_IN_ZK));
    if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
      // After creating the split node, wait for master to transition it
      // from PENDING_SPLIT to SPLITTING so that we can move on. We want master
      // knows about it and won't transition any region which is splitting.
      znodeVersion = getZKNode(server, services);
    }

    this.parent.getRegionFileSystem().createSplitsDir();
    this.journal.add(new JournalEntry(JournalEntryType.CREATE_SPLIT_DIR));

    Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
    // Any failure from close() is held back so the journal can be updated
    // before it is rethrown.
    Exception exceptionToThrow = null;
    try{
      hstoreFilesToSplit = this.parent.close(false);
    } catch (Exception e) {
      exceptionToThrow = e;
    }
    if (exceptionToThrow == null && hstoreFilesToSplit == null) {
      // The region was closed by a concurrent thread.  We can't continue
      // with the split, instead we must just abandon the split.  If we
      // reopen or split this could cause problems because the region has
      // probably already been moved to a different server, or is in the
      // process of moving to a different server.
      exceptionToThrow = closedByOtherException;
    }
    // Identity comparison against the shared sentinel: the entry is journaled
    // both on success and when close() itself threw, but not when the region
    // was closed by someone else.
    if (exceptionToThrow != closedByOtherException) {
      this.journal.add(new JournalEntry(JournalEntryType.CLOSED_PARENT_REGION));
    }
    if (exceptionToThrow != null) {
      // Non-IOExceptions are wrapped so callers only see IOException.
      if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
      throw new IOException(exceptionToThrow);
    }
    if (!testing) {
      services.removeFromOnlineRegions(this.parent, null);
    }
    // Journaled even when testing skipped the actual removal.
    this.journal.add(new JournalEntry(JournalEntryType.OFFLINED_PARENT));

    // TODO: If splitStoreFiles were multithreaded would we complete steps in
    // less elapsed time?  St.Ack 20100920
    //
    // splitStoreFiles creates daughter region dirs under the parent splits dir
    // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
    // clean this up.
    Pair<Integer, Integer> expectedReferences = splitStoreFiles(hstoreFilesToSplit);

    // Log to the journal that we are creating region A, the first daughter
    // region.  We could fail halfway through.  If we do, we could have left
    // stuff in fs that needs cleanup -- a storefile or two.  Thats why we
    // add entry to journal BEFORE rather than AFTER the change.
    this.journal.add(new JournalEntry(JournalEntryType.STARTED_REGION_A_CREATION));
    // The reference file count is checked twice: in the splits dir before the
    // daughter is created and in the daughter's dir under the table dir after.
    assertReferenceFileCount(expectedReferences.getFirst(),
        this.parent.getRegionFileSystem().getSplitsDir(this.hri_a));
    HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
    assertReferenceFileCount(expectedReferences.getFirst(),
        new Path(this.parent.getRegionFileSystem().getTableDir(), this.hri_a.getEncodedName()));

    // Ditto
    this.journal.add(new JournalEntry(JournalEntryType.STARTED_REGION_B_CREATION));
    assertReferenceFileCount(expectedReferences.getSecond(),
        this.parent.getRegionFileSystem().getSplitsDir(this.hri_b));
    HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
    assertReferenceFileCount(expectedReferences.getSecond(),
        new Path(this.parent.getRegionFileSystem().getTableDir(), this.hri_b.getEncodedName()));

    return new PairOfSameType<HRegion>(a, b);
  }
455 
456   void assertReferenceFileCount(int expectedReferenceFileCount, Path dir)
457       throws IOException {
458     if (expectedReferenceFileCount != 0 &&
459         expectedReferenceFileCount != FSUtils.getRegionReferenceFileCount(this.parent.getFilesystem(), dir)) {
460       throw new IOException("Failing split. Expected reference file count isn't equal.");
461     }
462   }
463 
464   /**
465    * Perform time consuming opening of the daughter regions.
466    * @param server Hosting server instance.  Can be null when testing (won't try
467    * and update in zk if a null server)
468    * @param services Used to online/offline regions.
469    * @param a first daughter region
470    * @param a second daughter region
471    * @throws IOException If thrown, transaction failed.
472    *          Call {@link #rollback(Server, RegionServerServices)}
473    */
474   /* package */void openDaughters(final Server server,
475       final RegionServerServices services, HRegion a, HRegion b)
476       throws IOException {
477     boolean stopped = server != null && server.isStopped();
478     boolean stopping = services != null && services.isStopping();
479     // TODO: Is this check needed here?
480     if (stopped || stopping) {
481       LOG.info("Not opening daughters " +
482           b.getRegionInfo().getRegionNameAsString() +
483           " and " +
484           a.getRegionInfo().getRegionNameAsString() +
485           " because stopping=" + stopping + ", stopped=" + stopped);
486     } else {
487       // Open daughters in parallel.
488       DaughterOpener aOpener = new DaughterOpener(server, a);
489       DaughterOpener bOpener = new DaughterOpener(server, b);
490       aOpener.start();
491       bOpener.start();
492       try {
493         aOpener.join();
494         if (aOpener.getException() == null) {
495           journal.add(new JournalEntry(JournalEntryType.OPENED_REGION_A));
496         }
497         bOpener.join();
498         if (bOpener.getException() == null) {
499           journal.add(new JournalEntry(JournalEntryType.OPENED_REGION_B));
500         }
501       } catch (InterruptedException e) {
502         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
503       }
504       if (aOpener.getException() != null) {
505         throw new IOException("Failed " +
506           aOpener.getName(), aOpener.getException());
507       }
508       if (bOpener.getException() != null) {
509         throw new IOException("Failed " +
510           bOpener.getName(), bOpener.getException());
511       }
512       if (services != null) {
513         try {
514           if (useZKForAssignment) {
515             // add 2nd daughter first (see HBASE-4335)
516             services.postOpenDeployTasks(b, server.getCatalogTracker());
517           } else if (!services.reportRegionStateTransition(TransitionCode.SPLIT,
518               parent.getRegionInfo(), hri_a, hri_b)) {
519             throw new IOException("Failed to report split region to master: "
520               + parent.getRegionInfo().getShortNameToLog());
521           }
522           // Should add it to OnlineRegions
523           services.addToOnlineRegions(b);
524           if (useZKForAssignment) {
525             services.postOpenDeployTasks(a, server.getCatalogTracker());
526           }
527           services.addToOnlineRegions(a);
528         } catch (KeeperException ke) {
529           throw new IOException(ke);
530         }
531       }
532     }
533   }
534 
535   /**
536    * Finish off split transaction, transition the zknode
537    * @param server Hosting server instance.  Can be null when testing (won't try
538    * and update in zk if a null server)
539    * @param services Used to online/offline regions.
540    * @param a first daughter region
541    * @param a second daughter region
542    * @throws IOException If thrown, transaction failed.
543    *          Call {@link #rollback(Server, RegionServerServices)}
544    */
545   /* package */void transitionZKNode(final Server server,
546       final RegionServerServices services, HRegion a, HRegion b)
547       throws IOException {
548     // Tell master about split by updating zk.  If we fail, abort.
549     if (server != null && server.getZooKeeper() != null) {
550       try {
551         this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
552           parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
553           server.getServerName(), this.znodeVersion,
554           RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
555 
556         int spins = 0;
557         // Now wait for the master to process the split. We know it's done
558         // when the znode is deleted. The reason we keep tickling the znode is
559         // that it's possible for the master to miss an event.
560         do {
561           if (spins % 10 == 0) {
562             LOG.debug("Still waiting on the master to process the split for " +
563                 this.parent.getRegionInfo().getEncodedName());
564           }
565           Thread.sleep(100);
566           // When this returns -1 it means the znode doesn't exist
567           this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
568             parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
569             server.getServerName(), this.znodeVersion,
570             RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT);
571           spins++;
572         } while (this.znodeVersion != -1 && !server.isStopped()
573             && !services.isStopping());
574       } catch (Exception e) {
575         if (e instanceof InterruptedException) {
576           Thread.currentThread().interrupt();
577         }
578         throw new IOException("Failed telling master about split", e);
579       }
580     }
581 
582     
583 
584     // Leaving here, the splitdir with its dross will be in place but since the
585     // split was successful, just leave it; it'll be cleaned when parent is
586     // deleted and cleaned up.
587   }
588 
589   /**
590    * Wait for the splitting node to be transitioned from pending_split
591    * to splitting by master. That's how we are sure master has processed
592    * the event and is good with us to move on. If we don't get any update,
593    * we periodically transition the node so that master gets the callback.
594    * If the node is removed or is not in pending_split state any more,
595    * we abort the split.
596    */
597   private int getZKNode(final Server server,
598       final RegionServerServices services) throws IOException {
599     // Wait for the master to process the pending_split.
600     try {
601       int spins = 0;
602       Stat stat = new Stat();
603       ZooKeeperWatcher zkw = server.getZooKeeper();
604       ServerName expectedServer = server.getServerName();
605       String node = parent.getRegionInfo().getEncodedName();
606       while (!(server.isStopped() || services.isStopping())) {
607         if (spins % 5 == 0) {
608           LOG.debug("Still waiting for master to process "
609             + "the pending_split for " + node);
610           transitionSplittingNode(zkw, parent.getRegionInfo(),
611             hri_a, hri_b, expectedServer, -1, RS_ZK_REQUEST_REGION_SPLIT,
612             RS_ZK_REQUEST_REGION_SPLIT);
613         }
614         Thread.sleep(100);
615         spins++;
616         byte [] data = ZKAssign.getDataNoWatch(zkw, node, stat);
617         if (data == null) {
618           throw new IOException("Data is null, splitting node "
619             + node + " no longer exists");
620         }
621         RegionTransition rt = RegionTransition.parseFrom(data);
622         EventType et = rt.getEventType();
623         if (et == RS_ZK_REGION_SPLITTING) {
624           ServerName serverName = rt.getServerName();
625           if (!serverName.equals(expectedServer)) {
626             throw new IOException("Splitting node " + node + " is for "
627               + serverName + ", not us " + expectedServer);
628           }
629           byte [] payloadOfSplitting = rt.getPayload();
630           List<HRegionInfo> splittingRegions = HRegionInfo.parseDelimitedFrom(
631             payloadOfSplitting, 0, payloadOfSplitting.length);
632           assert splittingRegions.size() == 2;
633           HRegionInfo a = splittingRegions.get(0);
634           HRegionInfo b = splittingRegions.get(1);
635           if (!(hri_a.equals(a) && hri_b.equals(b))) {
636             throw new IOException("Splitting node " + node + " is for " + a + ", "
637               + b + ", not expected daughters: " + hri_a + ", " + hri_b);
638           }
639           // Master has processed it.
640           return stat.getVersion();
641         }
642         if (et != RS_ZK_REQUEST_REGION_SPLIT) {
643           throw new IOException("Splitting node " + node
644             + " moved out of splitting to " + et);
645         }
646       }
647       // Server is stopping/stopped
648       throw new IOException("Server is "
649         + (services.isStopping() ? "stopping" : "stopped"));
650     } catch (Exception e) {
651       if (e instanceof InterruptedException) {
652         Thread.currentThread().interrupt();
653       }
654       throw new IOException("Failed getting SPLITTING znode on "
655         + parent.getRegionNameAsString(), e);
656     }
657   }
658 
659   /**
660    * Run the transaction.
661    * @param server Hosting server instance.  Can be null when testing (won't try
662    * and update in zk if a null server)
663    * @param services Used to online/offline regions.
664    * @throws IOException If thrown, transaction failed.
665    *          Call {@link #rollback(Server, RegionServerServices)}
666    * @return Regions created
667    * @throws IOException
668    * @see #rollback(Server, RegionServerServices)
669    */
670   public PairOfSameType<HRegion> execute(final Server server,
671       final RegionServerServices services)
672   throws IOException {
673     useZKForAssignment =
674         server == null ? true : ConfigUtil.useZKForAssignment(server.getConfiguration());
675     PairOfSameType<HRegion> regions = createDaughters(server, services);
676     if (this.parent.getCoprocessorHost() != null) {
677       this.parent.getCoprocessorHost().preSplitAfterPONR();
678     }
679     return stepsAfterPONR(server, services, regions);
680   }
681 
682   public PairOfSameType<HRegion> stepsAfterPONR(final Server server,
683       final RegionServerServices services, PairOfSameType<HRegion> regions)
684       throws IOException {
685     openDaughters(server, services, regions.getFirst(), regions.getSecond());
686     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
687       transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
688     }
689     journal.add(new JournalEntry(JournalEntryType.BEFORE_POST_SPLIT_HOOK));
690     // Coprocessor callback
691     if (this.parent.getCoprocessorHost() != null) {
692       this.parent.getCoprocessorHost().postSplit(regions.getFirst(), regions.getSecond());
693     }
694     journal.add(new JournalEntry(JournalEntryType.AFTER_POST_SPLIT_HOOK));
695     return regions;
696   }
697 
698   private void offlineParentInMetaAndputMetaEntries(CatalogTracker catalogTracker,
699       HRegionInfo parent, HRegionInfo splitA, HRegionInfo splitB,
700       ServerName serverName, List<Mutation> metaEntries) throws IOException {
701     List<Mutation> mutations = metaEntries;
702     HRegionInfo copyOfParent = new HRegionInfo(parent);
703     copyOfParent.setOffline(true);
704     copyOfParent.setSplit(true);
705 
706     //Put for parent
707     Put putParent = MetaEditor.makePutFromRegionInfo(copyOfParent);
708     MetaEditor.addDaughtersToPut(putParent, splitA, splitB);
709     mutations.add(putParent);
710     
711     //Puts for daughters
712     Put putA = MetaEditor.makePutFromRegionInfo(splitA);
713     Put putB = MetaEditor.makePutFromRegionInfo(splitB);
714 
715     addLocation(putA, serverName, 1); //these are new regions, openSeqNum = 1 is fine.
716     addLocation(putB, serverName, 1);
717     mutations.add(putA);
718     mutations.add(putB);
719     MetaEditor.mutateMetaTable(catalogTracker, mutations);
720   }
721 
722   public Put addLocation(final Put p, final ServerName sn, long openSeqNum) {
723     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
724       Bytes.toBytes(sn.getHostAndPort()));
725     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
726       Bytes.toBytes(sn.getStartcode()));
727     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SEQNUM_QUALIFIER,
728         Bytes.toBytes(openSeqNum));
729     return p;
730   }
731 
732   /*
733    * Open daughter region in its own thread.
734    * If we fail, abort this hosting server.
735    */
736   class DaughterOpener extends HasThread {
737     private final Server server;
738     private final HRegion r;
739     private Throwable t = null;
740 
741     DaughterOpener(final Server s, final HRegion r) {
742       super((s == null? "null-services": s.getServerName()) +
743         "-daughterOpener=" + r.getRegionInfo().getEncodedName());
744       setDaemon(true);
745       this.server = s;
746       this.r = r;
747     }
748 
749     /**
750      * @return Null if open succeeded else exception that causes us fail open.
751      * Call it after this thread exits else you may get wrong view on result.
752      */
753     Throwable getException() {
754       return this.t;
755     }
756 
757     @Override
758     public void run() {
759       try {
760         openDaughterRegion(this.server, r);
761       } catch (Throwable t) {
762         this.t = t;
763       }
764     }
765   }
766 
767   /**
768    * Open daughter regions, add them to online list and update meta.
769    * @param server
770    * @param daughter
771    * @throws IOException
772    * @throws KeeperException
773    */
774   void openDaughterRegion(final Server server, final HRegion daughter)
775   throws IOException, KeeperException {
776     HRegionInfo hri = daughter.getRegionInfo();
777     LoggingProgressable reporter = server == null ? null
778         : new LoggingProgressable(hri, server.getConfiguration().getLong(
779             "hbase.regionserver.split.daughter.open.log.interval", 10000));
780     daughter.openHRegion(reporter);
781   }
782 
783   static class LoggingProgressable implements CancelableProgressable {
784     private final HRegionInfo hri;
785     private long lastLog = -1;
786     private final long interval;
787 
788     LoggingProgressable(final HRegionInfo hri, final long interval) {
789       this.hri = hri;
790       this.interval = interval;
791     }
792 
793     @Override
794     public boolean progress() {
795       long now = EnvironmentEdgeManager.currentTimeMillis();
796       if (now - lastLog > this.interval) {
797         LOG.info("Opening " + this.hri.getRegionNameAsString());
798         this.lastLog = now;
799       }
800       return true;
801     }
802   }
803 
804 
805   /**
806    * Creates reference files for top and bottom half of the
807    * @param hstoreFilesToSplit map of store files to create half file references for.
808    * @return the number of reference files that were created.
809    * @throws IOException
810    */
811   private Pair<Integer, Integer> splitStoreFiles(
812       final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
813       throws IOException {
814     if (hstoreFilesToSplit == null) {
815       // Could be null because close didn't succeed -- for now consider it fatal
816       throw new IOException("Close returned empty list of StoreFiles");
817     }
818     // The following code sets up a thread pool executor with as many slots as
819     // there's files to split. It then fires up everything, waits for
820     // completion and finally checks for any exception
821     int nbFiles = hstoreFilesToSplit.size();
822     if (nbFiles == 0) {
823       // no file needs to be splitted.
824       return new Pair<Integer, Integer>(0,0);
825     }
826     ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
827     builder.setNameFormat("StoreFileSplitter-%1$d");
828     ThreadFactory factory = builder.build();
829     ThreadPoolExecutor threadPool =
830       (ThreadPoolExecutor) Executors.newFixedThreadPool(nbFiles, factory);
831     List<Future<Pair<Path,Path>>> futures = new ArrayList<Future<Pair<Path,Path>>> (nbFiles);
832 
833     // Split each store file.
834     for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
835       for (StoreFile sf: entry.getValue()) {
836         StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
837         futures.add(threadPool.submit(sfs));
838       }
839     }
840     // Shutdown the pool
841     threadPool.shutdown();
842 
843     // Wait for all the tasks to finish
844     try {
845       boolean stillRunning = !threadPool.awaitTermination(
846           this.fileSplitTimeout, TimeUnit.MILLISECONDS);
847       if (stillRunning) {
848         threadPool.shutdownNow();
849         // wait for the thread to shutdown completely.
850         while (!threadPool.isTerminated()) {
851           Thread.sleep(50);
852         }
853         throw new IOException("Took too long to split the" +
854             " files and create the references, aborting split");
855       }
856     } catch (InterruptedException e) {
857       throw (InterruptedIOException)new InterruptedIOException().initCause(e);
858     }
859 
860     int created_a = 0;
861     int created_b = 0;
862     // Look for any exception
863     for (Future<Pair<Path, Path>> future : futures) {
864       try {
865         Pair<Path, Path> p = future.get();
866         created_a += p.getFirst() != null ? 1 : 0;
867         created_b += p.getSecond() != null ? 1 : 0;
868       } catch (InterruptedException e) {
869         throw (InterruptedIOException) new InterruptedIOException().initCause(e);
870       } catch (ExecutionException e) {
871         throw new IOException(e);
872       }
873     }
874 
875     return new Pair<Integer, Integer>(created_a, created_b);
876   }
877 
878   private Pair<Path, Path> splitStoreFile(final byte[] family, final StoreFile sf) throws IOException {
879     HRegionFileSystem fs = this.parent.getRegionFileSystem();
880     String familyName = Bytes.toString(family);
881     Path path_a =
882         fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow, false,
883           this.parent.getSplitPolicy());
884     Path path_b =
885         fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow, true,
886           this.parent.getSplitPolicy());
887     return new Pair<Path,Path>(path_a, path_b);
888   }
889 
890   /**
891    * Utility class used to do the file splitting / reference writing
892    * in parallel instead of sequentially.
893    */
894   class StoreFileSplitter implements Callable<Pair<Path,Path>> {
895     private final byte[] family;
896     private final StoreFile sf;
897 
898     /**
899      * Constructor that takes what it needs to split
900      * @param family Family that contains the store file
901      * @param sf which file
902      */
903     public StoreFileSplitter(final byte[] family, final StoreFile sf) {
904       this.sf = sf;
905       this.family = family;
906     }
907 
908     public Pair<Path,Path> call() throws IOException {
909       return splitStoreFile(family, sf);
910     }
911   }
912 
913   /**
914    * @param server Hosting server instance (May be null when testing).
915    * @param services
916    * @throws IOException If thrown, rollback failed.  Take drastic action.
917    * @return True if we successfully rolled back, false if we got to the point
918    * of no return and so now need to abort the server to minimize damage.
919    */
920   @SuppressWarnings("deprecation")
921   public boolean rollback(final Server server, final RegionServerServices services)
922   throws IOException {
923     // Coprocessor callback
924     if (this.parent.getCoprocessorHost() != null) {
925       this.parent.getCoprocessorHost().preRollBackSplit();
926     }
927 
928     boolean result = true;
929     ListIterator<JournalEntry> iterator =
930       this.journal.listIterator(this.journal.size());
931     // Iterate in reverse.
932     while (iterator.hasPrevious()) {
933       JournalEntry je = iterator.previous();
934       switch(je.type) {
935 
936       case SET_SPLITTING_IN_ZK:
937         if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
938           cleanZK(server, this.parent.getRegionInfo());
939         } else if (services != null
940             && !useZKForAssignment
941             && !services.reportRegionStateTransition(TransitionCode.SPLIT_REVERTED,
942               parent.getRegionInfo(), hri_a, hri_b)) {
943           return false;
944         }
945         break;
946 
947       case CREATE_SPLIT_DIR:
948         this.parent.writestate.writesEnabled = true;
949         this.parent.getRegionFileSystem().cleanupSplitsDir();
950         break;
951 
952       case CLOSED_PARENT_REGION:
953         try {
954           // So, this returns a seqid but if we just closed and then reopened, we
955           // should be ok. On close, we flushed using sequenceid obtained from
956           // hosting regionserver so no need to propagate the sequenceid returned
957           // out of initialize below up into regionserver as we normally do.
958           // TODO: Verify.
959           this.parent.initialize();
960         } catch (IOException e) {
961           LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
962             this.parent.getRegionNameAsString(), e);
963           throw new RuntimeException(e);
964         }
965         break;
966 
967       case STARTED_REGION_A_CREATION:
968         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
969         break;
970 
971       case STARTED_REGION_B_CREATION:
972         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
973         break;
974 
975       case OFFLINED_PARENT:
976         if (services != null) services.addToOnlineRegions(this.parent);
977         break;
978 
979       case PONR:
980         // We got to the point-of-no-return so we need to just abort. Return
981         // immediately.  Do not clean up created daughter regions.  They need
982         // to be in place so we don't delete the parent region mistakenly.
983         // See HBASE-3872.
984         return false;
985 
986       // Informational only cases
987       case STARTED:
988       case PREPARED:
989       case BEFORE_PRE_SPLIT_HOOK:
990       case AFTER_PRE_SPLIT_HOOK:
991       case BEFORE_POST_SPLIT_HOOK:
992       case AFTER_POST_SPLIT_HOOK:
993       case OPENED_REGION_A:
994       case OPENED_REGION_B:
995         break;
996 
997       default:
998         throw new RuntimeException("Unhandled journal entry: " + je);
999       }
1000     }
1001     // Coprocessor callback
1002     if (this.parent.getCoprocessorHost() != null) {
1003       this.parent.getCoprocessorHost().postRollBackSplit();
1004     }
1005     return result;
1006   }
1007 
1008   HRegionInfo getFirstDaughter() {
1009     return hri_a;
1010   }
1011 
1012   HRegionInfo getSecondDaughter() {
1013     return hri_b;
1014   }
1015 
1016   private static void cleanZK(final Server server, final HRegionInfo hri) {
1017     try {
1018       // Only delete if its in expected state; could have been hijacked.
1019       if (!ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
1020           RS_ZK_REQUEST_REGION_SPLIT, server.getServerName())) {
1021         ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
1022           RS_ZK_REGION_SPLITTING, server.getServerName());
1023       }
1024     } catch (KeeperException.NoNodeException e) {
1025       LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
1026     } catch (KeeperException e) {
1027       server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
1028     }
1029   }
1030 
1031   /**
1032    * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region.
1033    * Create it ephemeral in case regionserver dies mid-split.
1034    *
1035    * <p>Does not transition nodes from other states.  If a node already exists
1036    * for this region, a {@link NodeExistsException} will be thrown.
1037    *
1038    * @param zkw zk reference
1039    * @param region region to be created as offline
1040    * @param serverName server event originates from
1041    * @throws KeeperException
1042    * @throws IOException
1043    */
1044   public static void createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
1045       final ServerName serverName, final HRegionInfo a,
1046       final HRegionInfo b) throws KeeperException, IOException {
1047     LOG.debug(zkw.prefix("Creating ephemeral node for " +
1048       region.getEncodedName() + " in PENDING_SPLIT state"));
1049     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
1050     RegionTransition rt = RegionTransition.createRegionTransition(
1051       RS_ZK_REQUEST_REGION_SPLIT, region.getRegionName(), serverName, payload);
1052     String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
1053     if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
1054       throw new IOException("Failed create of ephemeral " + node);
1055     }
1056   }
1057 
1058   /**
1059    * Transitions an existing ephemeral node for the specified region which is
1060    * currently in the begin state to be in the end state. Master cleans up the
1061    * final SPLIT znode when it reads it (or if we crash, zk will clean it up).
1062    *
1063    * <p>Does not transition nodes from other states. If for some reason the
1064    * node could not be transitioned, the method returns -1. If the transition
1065    * is successful, the version of the node after transition is returned.
1066    *
1067    * <p>This method can fail and return false for three different reasons:
1068    * <ul><li>Node for this region does not exist</li>
1069    * <li>Node for this region is not in the begin state</li>
1070    * <li>After verifying the begin state, update fails because of wrong version
1071    * (this should never actually happen since an RS only does this transition
1072    * following a transition to the begin state. If two RS are conflicting, one would
1073    * fail the original transition to the begin state and not this transition)</li>
1074    * </ul>
1075    *
1076    * <p>Does not set any watches.
1077    *
1078    * <p>This method should only be used by a RegionServer when splitting a region.
1079    *
1080    * @param zkw zk reference
1081    * @param parent region to be transitioned to opened
1082    * @param a Daughter a of split
1083    * @param b Daughter b of split
1084    * @param serverName server event originates from
1085    * @param znodeVersion expected version of data before modification
1086    * @param beginState the expected current state the znode should be
1087    * @param endState the state to be transition to
1088    * @return version of node after transition, -1 if unsuccessful transition
1089    * @throws KeeperException if unexpected zookeeper exception
1090    * @throws IOException
1091    */
1092   public static int transitionSplittingNode(ZooKeeperWatcher zkw,
1093       HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
1094       final int znodeVersion, final EventType beginState,
1095       final EventType endState) throws KeeperException, IOException {
1096     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
1097     return ZKAssign.transitionNode(zkw, parent, serverName,
1098       beginState, endState, znodeVersion, payload);
1099   }
1100 
1101   List<JournalEntry> getJournal() {
1102     return journal;
1103   }
1104 }