View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
22  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
23  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
24  
25  import java.io.IOException;
26  import java.util.ArrayList;
27  import java.util.List;
28  import java.util.ListIterator;
29  import java.util.Map;
30  import java.util.concurrent.Callable;
31  import java.util.concurrent.ExecutionException;
32  import java.util.concurrent.Executors;
33  import java.util.concurrent.Future;
34  import java.util.concurrent.ThreadFactory;
35  import java.util.concurrent.ThreadPoolExecutor;
36  import java.util.concurrent.TimeUnit;
37  
38  import org.apache.commons.logging.Log;
39  import org.apache.commons.logging.LogFactory;
40  import org.apache.hadoop.classification.InterfaceAudience;
41  import org.apache.hadoop.hbase.HRegionInfo;
42  import org.apache.hadoop.hbase.RegionTransition;
43  import org.apache.hadoop.hbase.Server;
44  import org.apache.hadoop.hbase.ServerName;
45  import org.apache.hadoop.hbase.catalog.MetaEditor;
46  import org.apache.hadoop.hbase.executor.EventType;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.CancelableProgressable;
49  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
50  import org.apache.hadoop.hbase.util.HasThread;
51  import org.apache.hadoop.hbase.util.PairOfSameType;
52  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
53  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
54  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
55  import org.apache.zookeeper.KeeperException;
56  import org.apache.zookeeper.KeeperException.NodeExistsException;
57  import org.apache.zookeeper.data.Stat;
58  
59  import com.google.common.util.concurrent.ThreadFactoryBuilder;
60  
61  /**
62   * Executes region split as a "transaction".  Call {@link #prepare()} to setup
63   * the transaction, {@link #execute(Server, RegionServerServices)} to run the
64   * transaction and {@link #rollback(Server, RegionServerServices)} to cleanup if execute fails.
65   *
66   * <p>Here is an example of how you would use this class:
67   * <pre>
68   *  SplitTransaction st = new SplitTransaction(parent, midKey);
69   *  if (!st.prepare()) return;
70   *  try {
71   *    st.execute(server, services);
72   *  } catch (IOException ioe) {
73   *    try {
74   *      st.rollback(server, services);
75   *      return;
76   *    } catch (RuntimeException e) {
77   *      myAbortable.abort("Failed split, abort");
78   *    }
79   *  }
80   * </pre>
81   * <p>This class is not thread safe.  Callers need to ensure the split is run by
82   * one thread only.
83   */
84  @InterfaceAudience.Private
85  public class SplitTransaction {
// Logger shared by all split transactions.
private static final Log LOG = LogFactory.getLog(SplitTransaction.class);

/*
 * Region to split
 */
private final HRegion parent;
// Descriptors of the two daughter regions; both are assigned by prepare().
private HRegionInfo hri_a;
private HRegionInfo hri_b;
// Milliseconds to wait for the store file splitters to finish.  createDaughters()
// may replace the default with "hbase.regionserver.fileSplitTimeout".
private long fileSplitTimeout = 30000;
// Version of the splitting znode as last returned by the zk helpers; starts at -1.
private int znodeVersion = -1;

/*
 * Row to split around
 */
private final byte [] splitrow;
101 
/**
 * Steps of the split transaction that get written to the journal.
 * Every constant marks one step; rollback walks the journaled steps to decide
 * how much has to be undone.
 */
enum JournalEntry {
  /** The region was marked as in transition: SPLITTING state set in zk. */
  SET_SPLITTING_IN_ZK,

  /** The temporary split data directory was created. */
  CREATE_SPLIT_DIR,

  /** The parent region was closed. */
  CLOSED_PARENT_REGION,

  /** The parent was removed from the server's online regions list. */
  OFFLINED_PARENT,

  /** Creation of the first daughter region was started. */
  STARTED_REGION_A_CREATION,

  /** Creation of the second daughter region was started. */
  STARTED_REGION_B_CREATION,

  /**
   * Point of no return.  Once this is journaled the transaction cannot be
   * recovered other than by crashing out the regionserver.
   */
  PONR
}
139 
/*
 * Journal of how far the split transaction has progressed.  createDaughters()
 * appends entries as it completes steps; rollback() walks them newest-first.
 */
private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
144 
/**
 * Creates a split transaction for one region.  Nothing is validated here;
 * {@link #prepare()} performs the checks.
 * @param r Region to split
 * @param splitrow Row to split around
 */
public SplitTransaction(final HRegion r, final byte [] splitrow) {
  this.splitrow = splitrow;
  this.parent = r;
}
154 
155   /**
156    * Does checks on split inputs.
157    * @return <code>true</code> if the region is splittable else
158    * <code>false</code> if it is not (e.g. its already closed, etc.).
159    */
160   public boolean prepare() {
161     if (!this.parent.isSplittable()) return false;
162     // Split key can be null if this region is unsplittable; i.e. has refs.
163     if (this.splitrow == null) return false;
164     HRegionInfo hri = this.parent.getRegionInfo();
165     parent.prepareToSplit();
166     // Check splitrow.
167     byte [] startKey = hri.getStartKey();
168     byte [] endKey = hri.getEndKey();
169     if (Bytes.equals(startKey, splitrow) ||
170         !this.parent.getRegionInfo().containsRow(splitrow)) {
171       LOG.info("Split row is not inside region key range or is equal to " +
172           "startkey: " + Bytes.toStringBinary(this.splitrow));
173       return false;
174     }
175     long rid = getDaughterRegionIdTimestamp(hri);
176     this.hri_a = new HRegionInfo(hri.getTable(), startKey, this.splitrow, false, rid);
177     this.hri_b = new HRegionInfo(hri.getTable(), this.splitrow, endKey, false, rid);
178     return true;
179   }
180 
181   /**
182    * Calculate daughter regionid to use.
183    * @param hri Parent {@link HRegionInfo}
184    * @return Daughter region id (timestamp) to use.
185    */
186   private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
187     long rid = EnvironmentEdgeManager.currentTimeMillis();
188     // Regionid is timestamp.  Can't be less than that of parent else will insert
189     // at wrong location in hbase:meta (See HBASE-710).
190     if (rid < hri.getRegionId()) {
191       LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
192         " but current time here is " + rid);
193       rid = hri.getRegionId() + 1;
194     }
195     return rid;
196   }
197 
198   private static IOException closedByOtherException = new IOException(
199       "Failed to close region: already closed by another thread");
200 
201   /**
202    * Prepare the regions and region files.
203    * @param server Hosting server instance.  Can be null when testing (won't try
204    * and update in zk if a null server)
205    * @param services Used to online/offline regions.
206    * @throws IOException If thrown, transaction failed.
207    *    Call {@link #rollback(Server, RegionServerServices)}
208    * @return Regions created
209    */
210   /* package */PairOfSameType<HRegion> createDaughters(final Server server,
211       final RegionServerServices services) throws IOException {
212     LOG.info("Starting split of region " + this.parent);
213     if ((server != null && server.isStopped()) ||
214         (services != null && services.isStopping())) {
215       throw new IOException("Server is stopped or stopping");
216     }
217     assert !this.parent.lock.writeLock().isHeldByCurrentThread():
218       "Unsafe to hold write lock while performing RPCs";
219 
220     // Coprocessor callback
221     if (this.parent.getCoprocessorHost() != null) {
222       this.parent.getCoprocessorHost().preSplit();
223     }
224 
225     // Coprocessor callback
226     if (this.parent.getCoprocessorHost() != null) {
227       this.parent.getCoprocessorHost().preSplit(this.splitrow);
228     }
229 
230     // If true, no cluster to write meta edits to or to update znodes in.
231     boolean testing = server == null? true:
232         server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
233     this.fileSplitTimeout = testing ? this.fileSplitTimeout :
234         server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
235           this.fileSplitTimeout);
236 
237     // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
238     // have zookeeper so don't do zk stuff if server or zookeeper is null
239     if (server != null && server.getZooKeeper() != null) {
240       try {
241         createNodeSplitting(server.getZooKeeper(),
242           parent.getRegionInfo(), server.getServerName(), hri_a, hri_b);
243       } catch (KeeperException e) {
244         throw new IOException("Failed creating PENDING_SPLIT znode on " +
245           this.parent.getRegionNameAsString(), e);
246       }
247     }
248     this.journal.add(JournalEntry.SET_SPLITTING_IN_ZK);
249     if (server != null && server.getZooKeeper() != null) {
250       // After creating the split node, wait for master to transition it
251       // from PENDING_SPLIT to SPLITTING so that we can move on. We want master
252       // knows about it and won't transition any region which is splitting.
253       znodeVersion = getZKNode(server, services);
254     }
255 
256     this.parent.getRegionFileSystem().createSplitsDir();
257     this.journal.add(JournalEntry.CREATE_SPLIT_DIR);
258 
259     Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
260     Exception exceptionToThrow = null;
261     try{
262       hstoreFilesToSplit = this.parent.close(false);
263     } catch (Exception e) {
264       exceptionToThrow = e;
265     }
266     if (exceptionToThrow == null && hstoreFilesToSplit == null) {
267       // The region was closed by a concurrent thread.  We can't continue
268       // with the split, instead we must just abandon the split.  If we
269       // reopen or split this could cause problems because the region has
270       // probably already been moved to a different server, or is in the
271       // process of moving to a different server.
272       exceptionToThrow = closedByOtherException;
273     }
274     if (exceptionToThrow != closedByOtherException) {
275       this.journal.add(JournalEntry.CLOSED_PARENT_REGION);
276     }
277     if (exceptionToThrow != null) {
278       if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
279       throw new IOException(exceptionToThrow);
280     }
281     if (!testing) {
282       services.removeFromOnlineRegions(this.parent, null);
283     }
284     this.journal.add(JournalEntry.OFFLINED_PARENT);
285 
286     // TODO: If splitStoreFiles were multithreaded would we complete steps in
287     // less elapsed time?  St.Ack 20100920
288     //
289     // splitStoreFiles creates daughter region dirs under the parent splits dir
290     // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
291     // clean this up.
292     splitStoreFiles(hstoreFilesToSplit);
293 
294     // Log to the journal that we are creating region A, the first daughter
295     // region.  We could fail halfway through.  If we do, we could have left
296     // stuff in fs that needs cleanup -- a storefile or two.  Thats why we
297     // add entry to journal BEFORE rather than AFTER the change.
298     this.journal.add(JournalEntry.STARTED_REGION_A_CREATION);
299     HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
300 
301     // Ditto
302     this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
303     HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
304 
305     // This is the point of no return.  Adding subsequent edits to hbase:meta as we
306     // do below when we do the daughter opens adding each to hbase:meta can fail in
307     // various interesting ways the most interesting of which is a timeout
308     // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
309     // then subsequent failures need to crash out this regionserver; the
310     // server shutdown processing should be able to fix-up the incomplete split.
311     // The offlined parent will have the daughters as extra columns.  If
312     // we leave the daughter regions in place and do not remove them when we
313     // crash out, then they will have their references to the parent in place
314     // still and the server shutdown fixup of hbase:meta will point to these
315     // regions.
316     // We should add PONR JournalEntry before offlineParentInMeta,so even if
317     // OfflineParentInMeta timeout,this will cause regionserver exit,and then
318     // master ServerShutdownHandler will fix daughter & avoid data loss. (See
319     // HBase-4562).
320     this.journal.add(JournalEntry.PONR);
321 
322     // Edit parent in meta.  Offlines parent region and adds splita and splitb
323     // as an atomic update. See HBASE-7721. This update to hbase:meta makes the region
324     // will determine whether the region is split or not in case of failures.
325     // If it is successful, master will roll-forward, if not, master will rollback
326     // and assign the parent region.
327     if (!testing) {
328       MetaEditor.splitRegion(server.getCatalogTracker(), parent.getRegionInfo(),
329           a.getRegionInfo(), b.getRegionInfo(), server.getServerName());
330     }
331     return new PairOfSameType<HRegion>(a, b);
332   }
333 
334   /**
335    * Perform time consuming opening of the daughter regions.
336    * @param server Hosting server instance.  Can be null when testing (won't try
337    * and update in zk if a null server)
338    * @param services Used to online/offline regions.
339    * @param a first daughter region
340    * @param a second daughter region
341    * @throws IOException If thrown, transaction failed.
342    *          Call {@link #rollback(Server, RegionServerServices)}
343    */
344   /* package */void openDaughters(final Server server,
345       final RegionServerServices services, HRegion a, HRegion b)
346       throws IOException {
347     boolean stopped = server != null && server.isStopped();
348     boolean stopping = services != null && services.isStopping();
349     // TODO: Is this check needed here?
350     if (stopped || stopping) {
351       LOG.info("Not opening daughters " +
352           b.getRegionInfo().getRegionNameAsString() +
353           " and " +
354           a.getRegionInfo().getRegionNameAsString() +
355           " because stopping=" + stopping + ", stopped=" + stopped);
356     } else {
357       // Open daughters in parallel.
358       DaughterOpener aOpener = new DaughterOpener(server, a);
359       DaughterOpener bOpener = new DaughterOpener(server, b);
360       aOpener.start();
361       bOpener.start();
362       try {
363         aOpener.join();
364         bOpener.join();
365       } catch (InterruptedException e) {
366         Thread.currentThread().interrupt();
367         throw new IOException("Interrupted " + e.getMessage());
368       }
369       if (aOpener.getException() != null) {
370         throw new IOException("Failed " +
371           aOpener.getName(), aOpener.getException());
372       }
373       if (bOpener.getException() != null) {
374         throw new IOException("Failed " +
375           bOpener.getName(), bOpener.getException());
376       }
377       if (services != null) {
378         try {
379           // add 2nd daughter first (see HBASE-4335)
380           services.postOpenDeployTasks(b, server.getCatalogTracker());
381           // Should add it to OnlineRegions
382           services.addToOnlineRegions(b);
383           services.postOpenDeployTasks(a, server.getCatalogTracker());
384           services.addToOnlineRegions(a);
385         } catch (KeeperException ke) {
386           throw new IOException(ke);
387         }
388       }
389     }
390   }
391 
392   /**
393    * Finish off split transaction, transition the zknode
394    * @param server Hosting server instance.  Can be null when testing (won't try
395    * and update in zk if a null server)
396    * @param services Used to online/offline regions.
397    * @param a first daughter region
398    * @param a second daughter region
399    * @throws IOException If thrown, transaction failed.
400    *          Call {@link #rollback(Server, RegionServerServices)}
401    */
402   /* package */void transitionZKNode(final Server server,
403       final RegionServerServices services, HRegion a, HRegion b)
404       throws IOException {
405     // Tell master about split by updating zk.  If we fail, abort.
406     if (server != null && server.getZooKeeper() != null) {
407       try {
408         this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
409           parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
410           server.getServerName(), this.znodeVersion,
411           RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
412 
413         int spins = 0;
414         // Now wait for the master to process the split. We know it's done
415         // when the znode is deleted. The reason we keep tickling the znode is
416         // that it's possible for the master to miss an event.
417         do {
418           if (spins % 10 == 0) {
419             LOG.debug("Still waiting on the master to process the split for " +
420                 this.parent.getRegionInfo().getEncodedName());
421           }
422           Thread.sleep(100);
423           // When this returns -1 it means the znode doesn't exist
424           this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
425             parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
426             server.getServerName(), this.znodeVersion,
427             RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT);
428           spins++;
429         } while (this.znodeVersion != -1 && !server.isStopped()
430             && !services.isStopping());
431       } catch (Exception e) {
432         if (e instanceof InterruptedException) {
433           Thread.currentThread().interrupt();
434         }
435         throw new IOException("Failed telling master about split", e);
436       }
437     }
438 
439     // Coprocessor callback
440     if (this.parent.getCoprocessorHost() != null) {
441       this.parent.getCoprocessorHost().postSplit(a,b);
442     }
443 
444     // Leaving here, the splitdir with its dross will be in place but since the
445     // split was successful, just leave it; it'll be cleaned when parent is
446     // deleted and cleaned up.
447   }
448 
449   /**
450    * Wait for the splitting node to be transitioned from pending_split
451    * to splitting by master. That's how we are sure master has processed
452    * the event and is good with us to move on. If we don't get any update,
453    * we periodically transition the node so that master gets the callback.
454    * If the node is removed or is not in pending_split state any more,
455    * we abort the split.
456    */
457   private int getZKNode(final Server server,
458       final RegionServerServices services) throws IOException {
459     // Wait for the master to process the pending_split.
460     try {
461       int spins = 0;
462       Stat stat = new Stat();
463       ZooKeeperWatcher zkw = server.getZooKeeper();
464       ServerName expectedServer = server.getServerName();
465       String node = parent.getRegionInfo().getEncodedName();
466       while (!(server.isStopped() || services.isStopping())) {
467         if (spins % 5 == 0) {
468           LOG.debug("Still waiting for master to process "
469             + "the pending_split for " + node);
470           transitionSplittingNode(zkw, parent.getRegionInfo(),
471             hri_a, hri_b, expectedServer, -1, RS_ZK_REQUEST_REGION_SPLIT,
472             RS_ZK_REQUEST_REGION_SPLIT);
473         }
474         Thread.sleep(100);
475         spins++;
476         byte [] data = ZKAssign.getDataNoWatch(zkw, node, stat);
477         if (data == null) {
478           throw new IOException("Data is null, splitting node "
479             + node + " no longer exists");
480         }
481         RegionTransition rt = RegionTransition.parseFrom(data);
482         EventType et = rt.getEventType();
483         if (et == RS_ZK_REGION_SPLITTING) {
484           ServerName serverName = rt.getServerName();
485           if (!serverName.equals(expectedServer)) {
486             throw new IOException("Splitting node " + node + " is for "
487               + serverName + ", not us " + expectedServer);
488           }
489           byte [] payloadOfSplitting = rt.getPayload();
490           List<HRegionInfo> splittingRegions = HRegionInfo.parseDelimitedFrom(
491             payloadOfSplitting, 0, payloadOfSplitting.length);
492           assert splittingRegions.size() == 2;
493           HRegionInfo a = splittingRegions.get(0);
494           HRegionInfo b = splittingRegions.get(1);
495           if (!(hri_a.equals(a) && hri_b.equals(b))) {
496             throw new IOException("Splitting node " + node + " is for " + a + ", "
497               + b + ", not expected daughters: " + hri_a + ", " + hri_b);
498           }
499           // Master has processed it.
500           return stat.getVersion();
501         }
502         if (et != RS_ZK_REQUEST_REGION_SPLIT) {
503           throw new IOException("Splitting node " + node
504             + " moved out of splitting to " + et);
505         }
506       }
507       // Server is stopping/stopped
508       throw new IOException("Server is "
509         + (services.isStopping() ? "stopping" : "stopped"));
510     } catch (Exception e) {
511       if (e instanceof InterruptedException) {
512         Thread.currentThread().interrupt();
513       }
514       throw new IOException("Failed getting SPLITTING znode on "
515         + parent.getRegionNameAsString(), e);
516     }
517   }
518 
519   /**
520    * Run the transaction.
521    * @param server Hosting server instance.  Can be null when testing (won't try
522    * and update in zk if a null server)
523    * @param services Used to online/offline regions.
524    * @throws IOException If thrown, transaction failed.
525    *          Call {@link #rollback(Server, RegionServerServices)}
526    * @return Regions created
527    * @throws IOException
528    * @see #rollback(Server, RegionServerServices)
529    */
530   public PairOfSameType<HRegion> execute(final Server server,
531       final RegionServerServices services)
532   throws IOException {
533     PairOfSameType<HRegion> regions = createDaughters(server, services);
534     openDaughters(server, services, regions.getFirst(), regions.getSecond());
535     transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
536     return regions;
537   }
538 
539   /*
540    * Open daughter region in its own thread.
541    * If we fail, abort this hosting server.
542    */
543   class DaughterOpener extends HasThread {
544     private final Server server;
545     private final HRegion r;
546     private Throwable t = null;
547 
548     DaughterOpener(final Server s, final HRegion r) {
549       super((s == null? "null-services": s.getServerName()) +
550         "-daughterOpener=" + r.getRegionInfo().getEncodedName());
551       setDaemon(true);
552       this.server = s;
553       this.r = r;
554     }
555 
556     /**
557      * @return Null if open succeeded else exception that causes us fail open.
558      * Call it after this thread exits else you may get wrong view on result.
559      */
560     Throwable getException() {
561       return this.t;
562     }
563 
564     @Override
565     public void run() {
566       try {
567         openDaughterRegion(this.server, r);
568       } catch (Throwable t) {
569         this.t = t;
570       }
571     }
572   }
573 
574   /**
575    * Open daughter regions, add them to online list and update meta.
576    * @param server
577    * @param daughter
578    * @throws IOException
579    * @throws KeeperException
580    */
581   void openDaughterRegion(final Server server, final HRegion daughter)
582   throws IOException, KeeperException {
583     HRegionInfo hri = daughter.getRegionInfo();
584     LoggingProgressable reporter = server == null ? null
585         : new LoggingProgressable(hri, server.getConfiguration().getLong(
586             "hbase.regionserver.split.daughter.open.log.interval", 10000));
587     daughter.openHRegion(reporter);
588   }
589 
590   static class LoggingProgressable implements CancelableProgressable {
591     private final HRegionInfo hri;
592     private long lastLog = -1;
593     private final long interval;
594 
595     LoggingProgressable(final HRegionInfo hri, final long interval) {
596       this.hri = hri;
597       this.interval = interval;
598     }
599 
600     @Override
601     public boolean progress() {
602       long now = System.currentTimeMillis();
603       if (now - lastLog > this.interval) {
604         LOG.info("Opening " + this.hri.getRegionNameAsString());
605         this.lastLog = now;
606       }
607       return true;
608     }
609   }
610 
611   private void splitStoreFiles(final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
612       throws IOException {
613     if (hstoreFilesToSplit == null) {
614       // Could be null because close didn't succeed -- for now consider it fatal
615       throw new IOException("Close returned empty list of StoreFiles");
616     }
617     // The following code sets up a thread pool executor with as many slots as
618     // there's files to split. It then fires up everything, waits for
619     // completion and finally checks for any exception
620     int nbFiles = hstoreFilesToSplit.size();
621     if (nbFiles == 0) {
622       // no file needs to be splitted.
623       return;
624     }
625     ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
626     builder.setNameFormat("StoreFileSplitter-%1$d");
627     ThreadFactory factory = builder.build();
628     ThreadPoolExecutor threadPool =
629       (ThreadPoolExecutor) Executors.newFixedThreadPool(nbFiles, factory);
630     List<Future<Void>> futures = new ArrayList<Future<Void>>(nbFiles);
631 
632     // Split each store file.
633     for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
634       for (StoreFile sf: entry.getValue()) {
635         StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
636         futures.add(threadPool.submit(sfs));
637       }
638     }
639     // Shutdown the pool
640     threadPool.shutdown();
641 
642     // Wait for all the tasks to finish
643     try {
644       boolean stillRunning = !threadPool.awaitTermination(
645           this.fileSplitTimeout, TimeUnit.MILLISECONDS);
646       if (stillRunning) {
647         threadPool.shutdownNow();
648         // wait for the thread to shutdown completely.
649         while (!threadPool.isTerminated()) {
650           Thread.sleep(50);
651         }
652         throw new IOException("Took too long to split the" +
653             " files and create the references, aborting split");
654       }
655     } catch (InterruptedException e) {
656       Thread.currentThread().interrupt();
657       throw new IOException("Interrupted while waiting for file splitters", e);
658     }
659 
660     // Look for any exception
661     for (Future<Void> future: futures) {
662       try {
663         future.get();
664       } catch (InterruptedException e) {
665         Thread.currentThread().interrupt();
666         throw new IOException(
667             "Interrupted while trying to get the results of file splitters", e);
668       } catch (ExecutionException e) {
669         throw new IOException(e);
670       }
671     }
672   }
673 
674   private void splitStoreFile(final byte[] family, final StoreFile sf) throws IOException {
675     HRegionFileSystem fs = this.parent.getRegionFileSystem();
676     String familyName = Bytes.toString(family);
677     fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow, false);
678     fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow, true);
679   }
680 
681   /**
682    * Utility class used to do the file splitting / reference writing
683    * in parallel instead of sequentially.
684    */
685   class StoreFileSplitter implements Callable<Void> {
686     private final byte[] family;
687     private final StoreFile sf;
688 
689     /**
690      * Constructor that takes what it needs to split
691      * @param family Family that contains the store file
692      * @param sf which file
693      */
694     public StoreFileSplitter(final byte[] family, final StoreFile sf) {
695       this.sf = sf;
696       this.family = family;
697     }
698 
699     public Void call() throws IOException {
700       splitStoreFile(family, sf);
701       return null;
702     }
703   }
704 
705   /**
706    * @param server Hosting server instance (May be null when testing).
707    * @param services
708    * @throws IOException If thrown, rollback failed.  Take drastic action.
709    * @return True if we successfully rolled back, false if we got to the point
710    * of no return and so now need to abort the server to minimize damage.
711    */
712   @SuppressWarnings("deprecation")
713   public boolean rollback(final Server server, final RegionServerServices services)
714   throws IOException {
715     // Coprocessor callback
716     if (this.parent.getCoprocessorHost() != null) {
717       this.parent.getCoprocessorHost().preRollBackSplit();
718     }
719 
720     boolean result = true;
721     ListIterator<JournalEntry> iterator =
722       this.journal.listIterator(this.journal.size());
723     // Iterate in reverse.
724     while (iterator.hasPrevious()) {
725       JournalEntry je = iterator.previous();
726       switch(je) {
727 
728       case SET_SPLITTING_IN_ZK:
729         if (server != null && server.getZooKeeper() != null) {
730           cleanZK(server, this.parent.getRegionInfo());
731         }
732         break;
733 
734       case CREATE_SPLIT_DIR:
735         this.parent.writestate.writesEnabled = true;
736         this.parent.getRegionFileSystem().cleanupSplitsDir();
737         break;
738 
739       case CLOSED_PARENT_REGION:
740         try {
741           // So, this returns a seqid but if we just closed and then reopened, we
742           // should be ok. On close, we flushed using sequenceid obtained from
743           // hosting regionserver so no need to propagate the sequenceid returned
744           // out of initialize below up into regionserver as we normally do.
745           // TODO: Verify.
746           this.parent.initialize();
747         } catch (IOException e) {
748           LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
749             this.parent.getRegionNameAsString(), e);
750           throw new RuntimeException(e);
751         }
752         break;
753 
754       case STARTED_REGION_A_CREATION:
755         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
756         break;
757 
758       case STARTED_REGION_B_CREATION:
759         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
760         break;
761 
762       case OFFLINED_PARENT:
763         if (services != null) services.addToOnlineRegions(this.parent);
764         break;
765 
766       case PONR:
767         // We got to the point-of-no-return so we need to just abort. Return
768         // immediately.  Do not clean up created daughter regions.  They need
769         // to be in place so we don't delete the parent region mistakenly.
770         // See HBASE-3872.
771         return false;
772 
773       default:
774         throw new RuntimeException("Unhandled journal entry: " + je);
775       }
776     }
777     // Coprocessor callback
778     if (this.parent.getCoprocessorHost() != null) {
779       this.parent.getCoprocessorHost().postRollBackSplit();
780     }
781     return result;
782   }
783 
784   HRegionInfo getFirstDaughter() {
785     return hri_a;
786   }
787 
788   HRegionInfo getSecondDaughter() {
789     return hri_b;
790   }
791 
792   private static void cleanZK(final Server server, final HRegionInfo hri) {
793     try {
794       // Only delete if its in expected state; could have been hijacked.
795       if (!ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
796           RS_ZK_REQUEST_REGION_SPLIT, server.getServerName())) {
797         ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
798           RS_ZK_REGION_SPLITTING, server.getServerName());
799       }
800     } catch (KeeperException.NoNodeException e) {
801       LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
802     } catch (KeeperException e) {
803       server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
804     }
805   }
806 
807   /**
808    * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region.
809    * Create it ephemeral in case regionserver dies mid-split.
810    *
811    * <p>Does not transition nodes from other states.  If a node already exists
812    * for this region, a {@link NodeExistsException} will be thrown.
813    *
814    * @param zkw zk reference
815    * @param region region to be created as offline
816    * @param serverName server event originates from
817    * @throws KeeperException
818    * @throws IOException
819    */
820   public static void createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
821       final ServerName serverName, final HRegionInfo a,
822       final HRegionInfo b) throws KeeperException, IOException {
823     LOG.debug(zkw.prefix("Creating ephemeral node for " +
824       region.getEncodedName() + " in PENDING_SPLIT state"));
825     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
826     RegionTransition rt = RegionTransition.createRegionTransition(
827       RS_ZK_REQUEST_REGION_SPLIT, region.getRegionName(), serverName, payload);
828     String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
829     if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
830       throw new IOException("Failed create of ephemeral " + node);
831     }
832   }
833 
834   /**
835    * Transitions an existing ephemeral node for the specified region which is
836    * currently in the begin state to be in the end state. Master cleans up the
837    * final SPLIT znode when it reads it (or if we crash, zk will clean it up).
838    *
839    * <p>Does not transition nodes from other states. If for some reason the
840    * node could not be transitioned, the method returns -1. If the transition
841    * is successful, the version of the node after transition is returned.
842    *
843    * <p>This method can fail and return false for three different reasons:
844    * <ul><li>Node for this region does not exist</li>
845    * <li>Node for this region is not in the begin state</li>
846    * <li>After verifying the begin state, update fails because of wrong version
847    * (this should never actually happen since an RS only does this transition
848    * following a transition to the begin state. If two RS are conflicting, one would
849    * fail the original transition to the begin state and not this transition)</li>
850    * </ul>
851    *
852    * <p>Does not set any watches.
853    *
854    * <p>This method should only be used by a RegionServer when splitting a region.
855    *
856    * @param zkw zk reference
857    * @param parent region to be transitioned to opened
858    * @param a Daughter a of split
859    * @param b Daughter b of split
860    * @param serverName server event originates from
861    * @param znodeVersion expected version of data before modification
862    * @param beginState the expected current state the znode should be
863    * @param endState the state to be transition to
864    * @return version of node after transition, -1 if unsuccessful transition
865    * @throws KeeperException if unexpected zookeeper exception
866    * @throws IOException
867    */
868   public static int transitionSplittingNode(ZooKeeperWatcher zkw,
869       HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
870       final int znodeVersion, final EventType beginState,
871       final EventType endState) throws KeeperException, IOException {
872     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
873     return ZKAssign.transitionNode(zkw, parent, serverName,
874       beginState, endState, znodeVersion, payload);
875   }
876 }