View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
22  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
23  import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;
24  
25  import java.io.IOException;
26  import java.io.InterruptedIOException;
27  import java.util.ArrayList;
28  import java.util.List;
29  import java.util.ListIterator;
30  import java.util.Map;
31  import java.util.concurrent.Callable;
32  import java.util.concurrent.ExecutionException;
33  import java.util.concurrent.Executors;
34  import java.util.concurrent.Future;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.ThreadPoolExecutor;
37  import java.util.concurrent.TimeUnit;
38  
39  import org.apache.commons.logging.Log;
40  import org.apache.commons.logging.LogFactory;
41  import org.apache.hadoop.hbase.classification.InterfaceAudience;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HRegionInfo;
44  import org.apache.hadoop.hbase.RegionTransition;
45  import org.apache.hadoop.hbase.Server;
46  import org.apache.hadoop.hbase.ServerName;
47  import org.apache.hadoop.hbase.catalog.CatalogTracker;
48  import org.apache.hadoop.hbase.catalog.MetaEditor;
49  import org.apache.hadoop.hbase.client.Mutation;
50  import org.apache.hadoop.hbase.client.Put;
51  import org.apache.hadoop.hbase.executor.EventType;
52  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionTransition.TransitionCode;
53  import org.apache.hadoop.hbase.util.Bytes;
54  import org.apache.hadoop.hbase.util.CancelableProgressable;
55  import org.apache.hadoop.hbase.util.ConfigUtil;
56  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
57  import org.apache.hadoop.hbase.util.HasThread;
58  import org.apache.hadoop.hbase.util.PairOfSameType;
59  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
60  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
61  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
62  import org.apache.zookeeper.KeeperException;
63  import org.apache.zookeeper.KeeperException.NodeExistsException;
64  import org.apache.zookeeper.data.Stat;
65  
66  import com.google.common.util.concurrent.ThreadFactoryBuilder;
67  
68  /**
69   * Executes region split as a "transaction".  Call {@link #prepare()} to setup
70   * the transaction, {@link #execute(Server, RegionServerServices)} to run the
71   * transaction and {@link #rollback(Server, RegionServerServices)} to cleanup if execute fails.
72   *
73   * <p>Here is an example of how you would use this class:
74   * <pre>
75   *  SplitTransaction st = new SplitTransaction(parent, midKey);
76   *  if (!st.prepare()) return;
77   *  try {
78   *    st.execute(server, services);
79   *  } catch (IOException ioe) {
80   *    try {
81   *      st.rollback(server, services);
82   *      return;
83   *    } catch (RuntimeException e) {
84   *      myAbortable.abort("Failed split, abort");
85   *    }
86   *  }
87   * </pre>
88   * <p>This class is not thread safe.  Caller needs ensure split is run by
89   * one thread only.
90   */
91  @InterfaceAudience.Private
92  public class SplitTransaction {
93    private static final Log LOG = LogFactory.getLog(SplitTransaction.class);
94  
95    /*
96     * Region to split
97     */
98    private final HRegion parent;
99    private HRegionInfo hri_a;
100   private HRegionInfo hri_b;
101   private long fileSplitTimeout = 30000;
102   private int znodeVersion = -1;
103   boolean useZKForAssignment;
104 
105   /*
106    * Row to split around
107    */
108   private final byte [] splitrow;
109 
/**
 * Steps that get written to the transaction journal.
 * Every constant marks one stage of the split; the journal is consulted to
 * work out how much has to be rolled back.
 */
enum JournalEntry {
  /** Region was marked as in transition and put into SPLITTING state. */
  SET_SPLITTING_IN_ZK,
  /** The temporary split data directory was created. */
  CREATE_SPLIT_DIR,
  /** The parent region was closed. */
  CLOSED_PARENT_REGION,
  /** The parent was removed from the server's online regions list. */
  OFFLINED_PARENT,
  /** Creation of the first daughter region was started. */
  STARTED_REGION_A_CREATION,
  /** Creation of the second daughter region was started. */
  STARTED_REGION_B_CREATION,
  /**
   * Point of no return.
   * Once journaled, the transaction can only be recovered by crashing out
   * the regionserver.
   */
  PONR
}
147 
148   /*
149    * Journal of how far the split transaction has progressed.
150    */
151   private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
152 
/**
 * Creates a transaction for splitting the given region around the given row.
 * No validation happens here; {@link #prepare()} performs the checks.
 * @param r Region to split
 * @param splitrow Row to split around
 */
public SplitTransaction(final HRegion r, final byte [] splitrow) {
  this.splitrow = splitrow;
  this.parent = r;
}
162 
163   /**
164    * Does checks on split inputs.
165    * @return <code>true</code> if the region is splittable else
166    * <code>false</code> if it is not (e.g. its already closed, etc.).
167    */
168   public boolean prepare() {
169     if (!this.parent.isSplittable()) return false;
170     // Split key can be null if this region is unsplittable; i.e. has refs.
171     if (this.splitrow == null) return false;
172     HRegionInfo hri = this.parent.getRegionInfo();
173     parent.prepareToSplit();
174     // Check splitrow.
175     byte [] startKey = hri.getStartKey();
176     byte [] endKey = hri.getEndKey();
177     if (Bytes.equals(startKey, splitrow) ||
178         !this.parent.getRegionInfo().containsRow(splitrow)) {
179       LOG.info("Split row is not inside region key range or is equal to " +
180           "startkey: " + Bytes.toStringBinary(this.splitrow));
181       return false;
182     }
183     long rid = getDaughterRegionIdTimestamp(hri);
184     this.hri_a = new HRegionInfo(hri.getTable(), startKey, this.splitrow, false, rid);
185     this.hri_b = new HRegionInfo(hri.getTable(), this.splitrow, endKey, false, rid);
186     return true;
187   }
188 
189   /**
190    * Calculate daughter regionid to use.
191    * @param hri Parent {@link HRegionInfo}
192    * @return Daughter region id (timestamp) to use.
193    */
194   private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
195     long rid = EnvironmentEdgeManager.currentTimeMillis();
196     // Regionid is timestamp.  Can't be less than that of parent else will insert
197     // at wrong location in hbase:meta (See HBASE-710).
198     if (rid < hri.getRegionId()) {
199       LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
200         " but current time here is " + rid);
201       rid = hri.getRegionId() + 1;
202     }
203     return rid;
204   }
205 
206   private static IOException closedByOtherException = new IOException(
207       "Failed to close region: already closed by another thread");
208 
209   /**
210    * Prepare the regions and region files.
211    * @param server Hosting server instance.  Can be null when testing (won't try
212    * and update in zk if a null server)
213    * @param services Used to online/offline regions.
214    * @throws IOException If thrown, transaction failed.
215    *    Call {@link #rollback(Server, RegionServerServices)}
216    * @return Regions created
217    */
218   /* package */PairOfSameType<HRegion> createDaughters(final Server server,
219       final RegionServerServices services) throws IOException {
220     LOG.info("Starting split of region " + this.parent);
221     if ((server != null && server.isStopped()) ||
222         (services != null && services.isStopping())) {
223       throw new IOException("Server is stopped or stopping");
224     }
225     assert !this.parent.lock.writeLock().isHeldByCurrentThread():
226       "Unsafe to hold write lock while performing RPCs";
227 
228     // Coprocessor callback
229     if (this.parent.getCoprocessorHost() != null) {
230       this.parent.getCoprocessorHost().preSplit();
231     }
232 
233     // Coprocessor callback
234     if (this.parent.getCoprocessorHost() != null) {
235       this.parent.getCoprocessorHost().preSplit(this.splitrow);
236     }
237 
238     // If true, no cluster to write meta edits to or to update znodes in.
239     boolean testing = server == null? true:
240         server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
241     this.fileSplitTimeout = testing ? this.fileSplitTimeout :
242         server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
243           this.fileSplitTimeout);
244 
245     PairOfSameType<HRegion> daughterRegions = stepsBeforePONR(server, services, testing);
246 
247     List<Mutation> metaEntries = new ArrayList<Mutation>();
248     if (this.parent.getCoprocessorHost() != null) {
249       if (this.parent.getCoprocessorHost().
250           preSplitBeforePONR(this.splitrow, metaEntries)) {
251         throw new IOException("Coprocessor bypassing region "
252             + this.parent.getRegionNameAsString() + " split.");
253       }
254       try {
255         for (Mutation p : metaEntries) {
256           HRegionInfo.parseRegionName(p.getRow());
257         }
258       } catch (IOException e) {
259         LOG.error("Row key of mutation from coprossor is not parsable as region name."
260             + "Mutations from coprocessor should only for hbase:meta table.");
261         throw e;
262       }
263     }
264 
265     // This is the point of no return.  Adding subsequent edits to .META. as we
266     // do below when we do the daughter opens adding each to .META. can fail in
267     // various interesting ways the most interesting of which is a timeout
268     // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
269     // then subsequent failures need to crash out this regionserver; the
270     // server shutdown processing should be able to fix-up the incomplete split.
271     // The offlined parent will have the daughters as extra columns.  If
272     // we leave the daughter regions in place and do not remove them when we
273     // crash out, then they will have their references to the parent in place
274     // still and the server shutdown fixup of .META. will point to these
275     // regions.
276     // We should add PONR JournalEntry before offlineParentInMeta,so even if
277     // OfflineParentInMeta timeout,this will cause regionserver exit,and then
278     // master ServerShutdownHandler will fix daughter & avoid data loss. (See
279     // HBase-4562).
280     this.journal.add(JournalEntry.PONR);
281 
282     // Edit parent in meta.  Offlines parent region and adds splita and splitb
283     // as an atomic update. See HBASE-7721. This update to META makes the region
284     // will determine whether the region is split or not in case of failures.
285     // If it is successful, master will roll-forward, if not, master will rollback
286     // and assign the parent region.
287     if (!testing && useZKForAssignment) {
288       if (metaEntries == null || metaEntries.isEmpty()) {
289         MetaEditor.splitRegion(server.getCatalogTracker(), parent.getRegionInfo(), daughterRegions
290             .getFirst().getRegionInfo(), daughterRegions.getSecond().getRegionInfo(), server
291             .getServerName());
292       } else {
293         offlineParentInMetaAndputMetaEntries(server.getCatalogTracker(), parent.getRegionInfo(),
294           daughterRegions.getFirst().getRegionInfo(), daughterRegions.getSecond().getRegionInfo(),
295           server.getServerName(), metaEntries);
296       }
297     } else if (services != null && !useZKForAssignment) {
298       if (!services.reportRegionTransition(TransitionCode.SPLIT_PONR, parent.getRegionInfo(),
299         hri_a, hri_b)) {
300         // Passed PONR, let SSH clean it up
301         throw new IOException("Failed to notify master that split passed PONR: "
302             + parent.getRegionInfo().getRegionNameAsString());
303       }
304     }
305     return daughterRegions;
306   }
307 
308   public PairOfSameType<HRegion> stepsBeforePONR(final Server server,
309       final RegionServerServices services, boolean testing) throws IOException {
310     // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
311     // have zookeeper so don't do zk stuff if server or zookeeper is null
312     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
313       try {
314         createNodeSplitting(server.getZooKeeper(),
315           parent.getRegionInfo(), server.getServerName(), hri_a, hri_b);
316       } catch (KeeperException e) {
317         throw new IOException("Failed creating PENDING_SPLIT znode on " +
318           this.parent.getRegionNameAsString(), e);
319       }
320     } else if (services != null && !useZKForAssignment) {
321       if (!services.reportRegionTransition(TransitionCode.READY_TO_SPLIT,
322         parent.getRegionInfo(), hri_a, hri_b)) {
323         throw new IOException("Failed to get ok from master to split "
324             + parent.getRegionNameAsString());
325       }
326     }
327     this.journal.add(JournalEntry.SET_SPLITTING_IN_ZK);
328     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
329       // After creating the split node, wait for master to transition it
330       // from PENDING_SPLIT to SPLITTING so that we can move on. We want master
331       // knows about it and won't transition any region which is splitting.
332       znodeVersion = getZKNode(server, services);
333     }
334 
335     this.parent.getRegionFileSystem().createSplitsDir();
336     this.journal.add(JournalEntry.CREATE_SPLIT_DIR);
337 
338     Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
339     Exception exceptionToThrow = null;
340     try{
341       hstoreFilesToSplit = this.parent.close(false);
342     } catch (Exception e) {
343       exceptionToThrow = e;
344     }
345     if (exceptionToThrow == null && hstoreFilesToSplit == null) {
346       // The region was closed by a concurrent thread.  We can't continue
347       // with the split, instead we must just abandon the split.  If we
348       // reopen or split this could cause problems because the region has
349       // probably already been moved to a different server, or is in the
350       // process of moving to a different server.
351       exceptionToThrow = closedByOtherException;
352     }
353     if (exceptionToThrow != closedByOtherException) {
354       this.journal.add(JournalEntry.CLOSED_PARENT_REGION);
355     }
356     if (exceptionToThrow != null) {
357       if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
358       throw new IOException(exceptionToThrow);
359     }
360     if (!testing) {
361       services.removeFromOnlineRegions(this.parent, null);
362     }
363     this.journal.add(JournalEntry.OFFLINED_PARENT);
364 
365     // TODO: If splitStoreFiles were multithreaded would we complete steps in
366     // less elapsed time?  St.Ack 20100920
367     //
368     // splitStoreFiles creates daughter region dirs under the parent splits dir
369     // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
370     // clean this up.
371     splitStoreFiles(hstoreFilesToSplit);
372 
373     // Log to the journal that we are creating region A, the first daughter
374     // region.  We could fail halfway through.  If we do, we could have left
375     // stuff in fs that needs cleanup -- a storefile or two.  Thats why we
376     // add entry to journal BEFORE rather than AFTER the change.
377     this.journal.add(JournalEntry.STARTED_REGION_A_CREATION);
378     HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
379 
380     // Ditto
381     this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
382     HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
383     return new PairOfSameType<HRegion>(a, b);
384   }
385 
386   /**
387    * Perform time consuming opening of the daughter regions.
388    * @param server Hosting server instance.  Can be null when testing (won't try
389    * and update in zk if a null server)
390    * @param services Used to online/offline regions.
391    * @param a first daughter region
392    * @param a second daughter region
393    * @throws IOException If thrown, transaction failed.
394    *          Call {@link #rollback(Server, RegionServerServices)}
395    */
396   /* package */void openDaughters(final Server server,
397       final RegionServerServices services, HRegion a, HRegion b)
398       throws IOException {
399     boolean stopped = server != null && server.isStopped();
400     boolean stopping = services != null && services.isStopping();
401     // TODO: Is this check needed here?
402     if (stopped || stopping) {
403       LOG.info("Not opening daughters " +
404           b.getRegionInfo().getRegionNameAsString() +
405           " and " +
406           a.getRegionInfo().getRegionNameAsString() +
407           " because stopping=" + stopping + ", stopped=" + stopped);
408     } else {
409       // Open daughters in parallel.
410       DaughterOpener aOpener = new DaughterOpener(server, a);
411       DaughterOpener bOpener = new DaughterOpener(server, b);
412       aOpener.start();
413       bOpener.start();
414       try {
415         aOpener.join();
416         bOpener.join();
417       } catch (InterruptedException e) {
418         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
419       }
420       if (aOpener.getException() != null) {
421         throw new IOException("Failed " +
422           aOpener.getName(), aOpener.getException());
423       }
424       if (bOpener.getException() != null) {
425         throw new IOException("Failed " +
426           bOpener.getName(), bOpener.getException());
427       }
428       if (services != null) {
429         try {
430           if (useZKForAssignment) {
431             // add 2nd daughter first (see HBASE-4335)
432             services.postOpenDeployTasks(b, server.getCatalogTracker());
433           } else if (!services.reportRegionTransition(TransitionCode.SPLIT,
434               parent.getRegionInfo(), hri_a, hri_b)) {
435             throw new IOException("Failed to report split region to master: "
436               + parent.getRegionInfo().getShortNameToLog());
437           }
438           // Should add it to OnlineRegions
439           services.addToOnlineRegions(b);
440           if (useZKForAssignment) {
441             services.postOpenDeployTasks(a, server.getCatalogTracker());
442           }
443           services.addToOnlineRegions(a);
444         } catch (KeeperException ke) {
445           throw new IOException(ke);
446         }
447       }
448     }
449   }
450 
451   /**
452    * Finish off split transaction, transition the zknode
453    * @param server Hosting server instance.  Can be null when testing (won't try
454    * and update in zk if a null server)
455    * @param services Used to online/offline regions.
456    * @param a first daughter region
457    * @param a second daughter region
458    * @throws IOException If thrown, transaction failed.
459    *          Call {@link #rollback(Server, RegionServerServices)}
460    */
461   /* package */void transitionZKNode(final Server server,
462       final RegionServerServices services, HRegion a, HRegion b)
463       throws IOException {
464     // Tell master about split by updating zk.  If we fail, abort.
465     if (server != null && server.getZooKeeper() != null) {
466       try {
467         this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
468           parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
469           server.getServerName(), this.znodeVersion,
470           RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
471 
472         int spins = 0;
473         // Now wait for the master to process the split. We know it's done
474         // when the znode is deleted. The reason we keep tickling the znode is
475         // that it's possible for the master to miss an event.
476         do {
477           if (spins % 10 == 0) {
478             LOG.debug("Still waiting on the master to process the split for " +
479                 this.parent.getRegionInfo().getEncodedName());
480           }
481           Thread.sleep(100);
482           // When this returns -1 it means the znode doesn't exist
483           this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
484             parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
485             server.getServerName(), this.znodeVersion,
486             RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT);
487           spins++;
488         } while (this.znodeVersion != -1 && !server.isStopped()
489             && !services.isStopping());
490       } catch (Exception e) {
491         if (e instanceof InterruptedException) {
492           Thread.currentThread().interrupt();
493         }
494         throw new IOException("Failed telling master about split", e);
495       }
496     }
497 
498     
499 
500     // Leaving here, the splitdir with its dross will be in place but since the
501     // split was successful, just leave it; it'll be cleaned when parent is
502     // deleted and cleaned up.
503   }
504 
505   /**
506    * Wait for the splitting node to be transitioned from pending_split
507    * to splitting by master. That's how we are sure master has processed
508    * the event and is good with us to move on. If we don't get any update,
509    * we periodically transition the node so that master gets the callback.
510    * If the node is removed or is not in pending_split state any more,
511    * we abort the split.
512    */
513   private int getZKNode(final Server server,
514       final RegionServerServices services) throws IOException {
515     // Wait for the master to process the pending_split.
516     try {
517       int spins = 0;
518       Stat stat = new Stat();
519       ZooKeeperWatcher zkw = server.getZooKeeper();
520       ServerName expectedServer = server.getServerName();
521       String node = parent.getRegionInfo().getEncodedName();
522       while (!(server.isStopped() || services.isStopping())) {
523         if (spins % 5 == 0) {
524           LOG.debug("Still waiting for master to process "
525             + "the pending_split for " + node);
526           transitionSplittingNode(zkw, parent.getRegionInfo(),
527             hri_a, hri_b, expectedServer, -1, RS_ZK_REQUEST_REGION_SPLIT,
528             RS_ZK_REQUEST_REGION_SPLIT);
529         }
530         Thread.sleep(100);
531         spins++;
532         byte [] data = ZKAssign.getDataNoWatch(zkw, node, stat);
533         if (data == null) {
534           throw new IOException("Data is null, splitting node "
535             + node + " no longer exists");
536         }
537         RegionTransition rt = RegionTransition.parseFrom(data);
538         EventType et = rt.getEventType();
539         if (et == RS_ZK_REGION_SPLITTING) {
540           ServerName serverName = rt.getServerName();
541           if (!serverName.equals(expectedServer)) {
542             throw new IOException("Splitting node " + node + " is for "
543               + serverName + ", not us " + expectedServer);
544           }
545           byte [] payloadOfSplitting = rt.getPayload();
546           List<HRegionInfo> splittingRegions = HRegionInfo.parseDelimitedFrom(
547             payloadOfSplitting, 0, payloadOfSplitting.length);
548           assert splittingRegions.size() == 2;
549           HRegionInfo a = splittingRegions.get(0);
550           HRegionInfo b = splittingRegions.get(1);
551           if (!(hri_a.equals(a) && hri_b.equals(b))) {
552             throw new IOException("Splitting node " + node + " is for " + a + ", "
553               + b + ", not expected daughters: " + hri_a + ", " + hri_b);
554           }
555           // Master has processed it.
556           return stat.getVersion();
557         }
558         if (et != RS_ZK_REQUEST_REGION_SPLIT) {
559           throw new IOException("Splitting node " + node
560             + " moved out of splitting to " + et);
561         }
562       }
563       // Server is stopping/stopped
564       throw new IOException("Server is "
565         + (services.isStopping() ? "stopping" : "stopped"));
566     } catch (Exception e) {
567       if (e instanceof InterruptedException) {
568         Thread.currentThread().interrupt();
569       }
570       throw new IOException("Failed getting SPLITTING znode on "
571         + parent.getRegionNameAsString(), e);
572     }
573   }
574 
575   /**
576    * Run the transaction.
577    * @param server Hosting server instance.  Can be null when testing (won't try
578    * and update in zk if a null server)
579    * @param services Used to online/offline regions.
580    * @throws IOException If thrown, transaction failed.
581    *          Call {@link #rollback(Server, RegionServerServices)}
582    * @return Regions created
583    * @throws IOException
584    * @see #rollback(Server, RegionServerServices)
585    */
586   public PairOfSameType<HRegion> execute(final Server server,
587       final RegionServerServices services)
588   throws IOException {
589     useZKForAssignment =
590         server == null ? true : ConfigUtil.useZKForAssignment(server.getConfiguration());
591     PairOfSameType<HRegion> regions = createDaughters(server, services);
592     if (this.parent.getCoprocessorHost() != null) {
593       this.parent.getCoprocessorHost().preSplitAfterPONR();
594     }
595     return stepsAfterPONR(server, services, regions);
596   }
597 
598   public PairOfSameType<HRegion> stepsAfterPONR(final Server server,
599       final RegionServerServices services, PairOfSameType<HRegion> regions)
600       throws IOException {
601     openDaughters(server, services, regions.getFirst(), regions.getSecond());
602     if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
603       transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
604     }
605     // Coprocessor callback
606     if (this.parent.getCoprocessorHost() != null) {
607       this.parent.getCoprocessorHost().postSplit(regions.getFirst(), regions.getSecond());
608     }
609     return regions;
610   }
611 
612   private void offlineParentInMetaAndputMetaEntries(CatalogTracker catalogTracker,
613       HRegionInfo parent, HRegionInfo splitA, HRegionInfo splitB,
614       ServerName serverName, List<Mutation> metaEntries) throws IOException {
615     List<Mutation> mutations = metaEntries;
616     HRegionInfo copyOfParent = new HRegionInfo(parent);
617     copyOfParent.setOffline(true);
618     copyOfParent.setSplit(true);
619 
620     //Put for parent
621     Put putParent = MetaEditor.makePutFromRegionInfo(copyOfParent);
622     MetaEditor.addDaughtersToPut(putParent, splitA, splitB);
623     mutations.add(putParent);
624     
625     //Puts for daughters
626     Put putA = MetaEditor.makePutFromRegionInfo(splitA);
627     Put putB = MetaEditor.makePutFromRegionInfo(splitB);
628 
629     addLocation(putA, serverName, 1); //these are new regions, openSeqNum = 1 is fine.
630     addLocation(putB, serverName, 1);
631     mutations.add(putA);
632     mutations.add(putB);
633     MetaEditor.mutateMetaTable(catalogTracker, mutations);
634   }
635 
636   public Put addLocation(final Put p, final ServerName sn, long openSeqNum) {
637     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
638       Bytes.toBytes(sn.getHostAndPort()));
639     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
640       Bytes.toBytes(sn.getStartcode()));
641     p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SEQNUM_QUALIFIER,
642         Bytes.toBytes(openSeqNum));
643     return p;
644   }
645 
646   /*
647    * Open daughter region in its own thread.
648    * If we fail, abort this hosting server.
649    */
650   class DaughterOpener extends HasThread {
651     private final Server server;
652     private final HRegion r;
653     private Throwable t = null;
654 
655     DaughterOpener(final Server s, final HRegion r) {
656       super((s == null? "null-services": s.getServerName()) +
657         "-daughterOpener=" + r.getRegionInfo().getEncodedName());
658       setDaemon(true);
659       this.server = s;
660       this.r = r;
661     }
662 
663     /**
664      * @return Null if open succeeded else exception that causes us fail open.
665      * Call it after this thread exits else you may get wrong view on result.
666      */
667     Throwable getException() {
668       return this.t;
669     }
670 
671     @Override
672     public void run() {
673       try {
674         openDaughterRegion(this.server, r);
675       } catch (Throwable t) {
676         this.t = t;
677       }
678     }
679   }
680 
681   /**
682    * Open daughter regions, add them to online list and update meta.
683    * @param server
684    * @param daughter
685    * @throws IOException
686    * @throws KeeperException
687    */
688   void openDaughterRegion(final Server server, final HRegion daughter)
689   throws IOException, KeeperException {
690     HRegionInfo hri = daughter.getRegionInfo();
691     LoggingProgressable reporter = server == null ? null
692         : new LoggingProgressable(hri, server.getConfiguration().getLong(
693             "hbase.regionserver.split.daughter.open.log.interval", 10000));
694     daughter.openHRegion(reporter);
695   }
696 
697   static class LoggingProgressable implements CancelableProgressable {
698     private final HRegionInfo hri;
699     private long lastLog = -1;
700     private final long interval;
701 
702     LoggingProgressable(final HRegionInfo hri, final long interval) {
703       this.hri = hri;
704       this.interval = interval;
705     }
706 
707     @Override
708     public boolean progress() {
709       long now = System.currentTimeMillis();
710       if (now - lastLog > this.interval) {
711         LOG.info("Opening " + this.hri.getRegionNameAsString());
712         this.lastLog = now;
713       }
714       return true;
715     }
716   }
717 
718   private void splitStoreFiles(final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
719       throws IOException {
720     if (hstoreFilesToSplit == null) {
721       // Could be null because close didn't succeed -- for now consider it fatal
722       throw new IOException("Close returned empty list of StoreFiles");
723     }
724     // The following code sets up a thread pool executor with as many slots as
725     // there's files to split. It then fires up everything, waits for
726     // completion and finally checks for any exception
727     int nbFiles = hstoreFilesToSplit.size();
728     if (nbFiles == 0) {
729       // no file needs to be splitted.
730       return;
731     }
732     ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
733     builder.setNameFormat("StoreFileSplitter-%1$d");
734     ThreadFactory factory = builder.build();
735     ThreadPoolExecutor threadPool =
736       (ThreadPoolExecutor) Executors.newFixedThreadPool(nbFiles, factory);
737     List<Future<Void>> futures = new ArrayList<Future<Void>>(nbFiles);
738 
739     // Split each store file.
740     for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
741       for (StoreFile sf: entry.getValue()) {
742         StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
743         futures.add(threadPool.submit(sfs));
744       }
745     }
746     // Shutdown the pool
747     threadPool.shutdown();
748 
749     // Wait for all the tasks to finish
750     try {
751       boolean stillRunning = !threadPool.awaitTermination(
752           this.fileSplitTimeout, TimeUnit.MILLISECONDS);
753       if (stillRunning) {
754         threadPool.shutdownNow();
755         // wait for the thread to shutdown completely.
756         while (!threadPool.isTerminated()) {
757           Thread.sleep(50);
758         }
759         throw new IOException("Took too long to split the" +
760             " files and create the references, aborting split");
761       }
762     } catch (InterruptedException e) {
763       throw (InterruptedIOException)new InterruptedIOException().initCause(e);
764     }
765 
766     // Look for any exception
767     for (Future<Void> future: futures) {
768       try {
769         future.get();
770       } catch (InterruptedException e) {
771         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
772       } catch (ExecutionException e) {
773         throw new IOException(e);
774       }
775     }
776   }
777 
778   private void splitStoreFile(final byte[] family, final StoreFile sf) throws IOException {
779     HRegionFileSystem fs = this.parent.getRegionFileSystem();
780     String familyName = Bytes.toString(family);
781     fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow, false);
782     fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow, true);
783   }
784 
785   /**
786    * Utility class used to do the file splitting / reference writing
787    * in parallel instead of sequentially.
788    */
789   class StoreFileSplitter implements Callable<Void> {
790     private final byte[] family;
791     private final StoreFile sf;
792 
793     /**
794      * Constructor that takes what it needs to split
795      * @param family Family that contains the store file
796      * @param sf which file
797      */
798     public StoreFileSplitter(final byte[] family, final StoreFile sf) {
799       this.sf = sf;
800       this.family = family;
801     }
802 
803     public Void call() throws IOException {
804       splitStoreFile(family, sf);
805       return null;
806     }
807   }
808 
809   /**
810    * @param server Hosting server instance (May be null when testing).
811    * @param services
812    * @throws IOException If thrown, rollback failed.  Take drastic action.
813    * @return True if we successfully rolled back, false if we got to the point
814    * of no return and so now need to abort the server to minimize damage.
815    */
816   @SuppressWarnings("deprecation")
817   public boolean rollback(final Server server, final RegionServerServices services)
818   throws IOException {
819     // Coprocessor callback
820     if (this.parent.getCoprocessorHost() != null) {
821       this.parent.getCoprocessorHost().preRollBackSplit();
822     }
823 
824     boolean result = true;
825     ListIterator<JournalEntry> iterator =
826       this.journal.listIterator(this.journal.size());
827     // Iterate in reverse.
828     while (iterator.hasPrevious()) {
829       JournalEntry je = iterator.previous();
830       switch(je) {
831 
832       case SET_SPLITTING_IN_ZK:
833         if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
834           cleanZK(server, this.parent.getRegionInfo());
835         } else if (services != null
836             && !useZKForAssignment
837             && !services.reportRegionTransition(TransitionCode.SPLIT_REVERTED,
838               parent.getRegionInfo(), hri_a, hri_b)) {
839           return false;
840         }
841         break;
842 
843       case CREATE_SPLIT_DIR:
844         this.parent.writestate.writesEnabled = true;
845         this.parent.getRegionFileSystem().cleanupSplitsDir();
846         break;
847 
848       case CLOSED_PARENT_REGION:
849         try {
850           // So, this returns a seqid but if we just closed and then reopened, we
851           // should be ok. On close, we flushed using sequenceid obtained from
852           // hosting regionserver so no need to propagate the sequenceid returned
853           // out of initialize below up into regionserver as we normally do.
854           // TODO: Verify.
855           this.parent.initialize();
856         } catch (IOException e) {
857           LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
858             this.parent.getRegionNameAsString(), e);
859           throw new RuntimeException(e);
860         }
861         break;
862 
863       case STARTED_REGION_A_CREATION:
864         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
865         break;
866 
867       case STARTED_REGION_B_CREATION:
868         this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
869         break;
870 
871       case OFFLINED_PARENT:
872         if (services != null) services.addToOnlineRegions(this.parent);
873         break;
874 
875       case PONR:
876         // We got to the point-of-no-return so we need to just abort. Return
877         // immediately.  Do not clean up created daughter regions.  They need
878         // to be in place so we don't delete the parent region mistakenly.
879         // See HBASE-3872.
880         return false;
881 
882       default:
883         throw new RuntimeException("Unhandled journal entry: " + je);
884       }
885     }
886     // Coprocessor callback
887     if (this.parent.getCoprocessorHost() != null) {
888       this.parent.getCoprocessorHost().postRollBackSplit();
889     }
890     return result;
891   }
892 
893   HRegionInfo getFirstDaughter() {
894     return hri_a;
895   }
896 
897   HRegionInfo getSecondDaughter() {
898     return hri_b;
899   }
900 
901   private static void cleanZK(final Server server, final HRegionInfo hri) {
902     try {
903       // Only delete if its in expected state; could have been hijacked.
904       if (!ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
905           RS_ZK_REQUEST_REGION_SPLIT, server.getServerName())) {
906         ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
907           RS_ZK_REGION_SPLITTING, server.getServerName());
908       }
909     } catch (KeeperException.NoNodeException e) {
910       LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
911     } catch (KeeperException e) {
912       server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
913     }
914   }
915 
916   /**
917    * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region.
918    * Create it ephemeral in case regionserver dies mid-split.
919    *
920    * <p>Does not transition nodes from other states.  If a node already exists
921    * for this region, a {@link NodeExistsException} will be thrown.
922    *
923    * @param zkw zk reference
924    * @param region region to be created as offline
925    * @param serverName server event originates from
926    * @throws KeeperException
927    * @throws IOException
928    */
929   public static void createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
930       final ServerName serverName, final HRegionInfo a,
931       final HRegionInfo b) throws KeeperException, IOException {
932     LOG.debug(zkw.prefix("Creating ephemeral node for " +
933       region.getEncodedName() + " in PENDING_SPLIT state"));
934     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
935     RegionTransition rt = RegionTransition.createRegionTransition(
936       RS_ZK_REQUEST_REGION_SPLIT, region.getRegionName(), serverName, payload);
937     String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
938     if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
939       throw new IOException("Failed create of ephemeral " + node);
940     }
941   }
942 
943   /**
944    * Transitions an existing ephemeral node for the specified region which is
945    * currently in the begin state to be in the end state. Master cleans up the
946    * final SPLIT znode when it reads it (or if we crash, zk will clean it up).
947    *
948    * <p>Does not transition nodes from other states. If for some reason the
949    * node could not be transitioned, the method returns -1. If the transition
950    * is successful, the version of the node after transition is returned.
951    *
952    * <p>This method can fail and return false for three different reasons:
953    * <ul><li>Node for this region does not exist</li>
954    * <li>Node for this region is not in the begin state</li>
955    * <li>After verifying the begin state, update fails because of wrong version
956    * (this should never actually happen since an RS only does this transition
957    * following a transition to the begin state. If two RS are conflicting, one would
958    * fail the original transition to the begin state and not this transition)</li>
959    * </ul>
960    *
961    * <p>Does not set any watches.
962    *
963    * <p>This method should only be used by a RegionServer when splitting a region.
964    *
965    * @param zkw zk reference
966    * @param parent region to be transitioned to opened
967    * @param a Daughter a of split
968    * @param b Daughter b of split
969    * @param serverName server event originates from
970    * @param znodeVersion expected version of data before modification
971    * @param beginState the expected current state the znode should be
972    * @param endState the state to be transition to
973    * @return version of node after transition, -1 if unsuccessful transition
974    * @throws KeeperException if unexpected zookeeper exception
975    * @throws IOException
976    */
977   public static int transitionSplittingNode(ZooKeeperWatcher zkw,
978       HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
979       final int znodeVersion, final EventType beginState,
980       final EventType endState) throws KeeperException, IOException {
981     byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
982     return ZKAssign.transitionNode(zkw, parent, serverName,
983       beginState, endState, znodeVersion, payload);
984   }
985 }