View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.replication.regionserver;
21  
22  import java.io.IOException;
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Random;
29  import java.util.SortedMap;
30  import java.util.SortedSet;
31  import java.util.TreeSet;
32  import java.util.concurrent.LinkedBlockingQueue;
33  import java.util.concurrent.RejectedExecutionException;
34  import java.util.concurrent.ThreadPoolExecutor;
35  import java.util.concurrent.TimeUnit;
36  import java.util.concurrent.atomic.AtomicBoolean;
37  
38  import org.apache.commons.logging.Log;
39  import org.apache.commons.logging.LogFactory;
40  import org.apache.hadoop.classification.InterfaceAudience;
41  import org.apache.hadoop.conf.Configuration;
42  import org.apache.hadoop.fs.FileSystem;
43  import org.apache.hadoop.fs.Path;
44  import org.apache.hadoop.hbase.HConstants;
45  import org.apache.hadoop.hbase.Stoppable;
46  import org.apache.hadoop.hbase.replication.ReplicationZookeeper;
47  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
48  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
49  import org.apache.zookeeper.KeeperException;
50  
51  import com.google.common.util.concurrent.ThreadFactoryBuilder;
52  
53  /**
54   * This class is responsible to manage all the replication
55   * sources. There are two classes of sources:
56   * <li> Normal sources are persistent and one per peer cluster</li>
57   * <li> Old sources are recovered from a failed region server and our
58   * only goal is to finish replicating the HLog queue it had up in ZK</li>
59   *
60   * When a region server dies, this class uses a watcher to get notified and it
61   * tries to grab a lock in order to transfer all the queues in a local
62   * old source.
63   */
64  @InterfaceAudience.Private
65  public class ReplicationSourceManager {
66    private static final Log LOG =
67        LogFactory.getLog(ReplicationSourceManager.class);
68    // List of all the sources that read this RS's logs
69    private final List<ReplicationSourceInterface> sources;
70    // List of all the sources we got from died RSs
71    private final List<ReplicationSourceInterface> oldsources;
72    // Indicates if we are currently replicating
73    private final AtomicBoolean replicating;
74    // Helper for zookeeper
75    private final ReplicationZookeeper zkHelper;
76    // All about stopping
77    private final Stoppable stopper;
78    // All logs we are currently tracking
79    private final Map<String, SortedSet<String>> hlogsById;
80    private final Configuration conf;
81    private final FileSystem fs;
82    // The path to the latest log we saw, for new coming sources
83    private Path latestPath;
84    // List of all the other region servers in this cluster
85    private final List<String> otherRegionServers = new ArrayList<String>();
86    // Path to the hlogs directories
87    private final Path logDir;
88    // Path to the hlog archive
89    private final Path oldLogDir;
90    // The number of ms that we wait before moving znodes, HBASE-3596
91    private final long sleepBeforeFailover;
92    // Homemade executer service for replication
93    private final ThreadPoolExecutor executor;
94    
95    private final Random rand;
96  
97  
98    /**
99     * Creates a replication manager and sets the watch on all the other
100    * registered region servers
101    * @param zkHelper the zk helper for replication
102    * @param conf the configuration to use
103    * @param stopper the stopper object for this region server
104    * @param fs the file system to use
105    * @param replicating the status of the replication on this cluster
106    * @param logDir the directory that contains all hlog directories of live RSs
107    * @param oldLogDir the directory where old logs are archived
108    */
109   public ReplicationSourceManager(final ReplicationZookeeper zkHelper,
110                                   final Configuration conf,
111                                   final Stoppable stopper,
112                                   final FileSystem fs,
113                                   final AtomicBoolean replicating,
114                                   final Path logDir,
115                                   final Path oldLogDir) {
116     this.sources = new ArrayList<ReplicationSourceInterface>();
117     this.replicating = replicating;
118     this.zkHelper = zkHelper;
119     this.stopper = stopper;
120     this.hlogsById = new HashMap<String, SortedSet<String>>();
121     this.oldsources = new ArrayList<ReplicationSourceInterface>();
122     this.conf = conf;
123     this.fs = fs;
124     this.logDir = logDir;
125     this.oldLogDir = oldLogDir;
126     this.sleepBeforeFailover = conf.getLong("replication.sleep.before.failover", 2000);
127     this.zkHelper.registerRegionServerListener(
128         new OtherRegionServerWatcher(this.zkHelper.getZookeeperWatcher()));
129     this.zkHelper.registerRegionServerListener(
130         new PeersWatcher(this.zkHelper.getZookeeperWatcher()));
131     this.zkHelper.listPeersIdsAndWatch();
132     // It's preferable to failover 1 RS at a time, but with good zk servers
133     // more could be processed at the same time.
134     int nbWorkers = conf.getInt("replication.executor.workers", 1);
135     // use a short 100ms sleep since this could be done inline with a RS startup
136     // even if we fail, other region servers can take care of it
137     this.executor = new ThreadPoolExecutor(nbWorkers, nbWorkers,
138         100, TimeUnit.MILLISECONDS,
139         new LinkedBlockingQueue<Runnable>());
140     ThreadFactoryBuilder tfb = new ThreadFactoryBuilder();
141     tfb.setNameFormat("ReplicationExecutor-%d");
142     this.executor.setThreadFactory(tfb.build());
143     this.rand = new Random();
144   }
145 
146   /**
147    * Provide the id of the peer and a log key and this method will figure which
148    * hlog it belongs to and will log, for this region server, the current
149    * position. It will also clean old logs from the queue.
150    * @param log Path to the log currently being replicated from
151    * replication status in zookeeper. It will also delete older entries.
152    * @param id id of the peer cluster
153    * @param position current location in the log
154    * @param queueRecovered indicates if this queue comes from another region server
155    * @param holdLogInZK if true then the log is retained in ZK
156    */
157   public void logPositionAndCleanOldLogs(Path log, String id, long position, 
158       boolean queueRecovered, boolean holdLogInZK) {
159     String key = log.getName();
160     this.zkHelper.writeReplicationStatus(key, id, position);
161     if (holdLogInZK) {
162      return;
163     }
164     synchronized (this.hlogsById) {
165       SortedSet<String> hlogs = this.hlogsById.get(id);
166       if (!queueRecovered && !hlogs.first().equals(key)) {
167         SortedSet<String> hlogSet = hlogs.headSet(key);
168         for (String hlog : hlogSet) {
169           this.zkHelper.removeLogFromList(hlog, id);
170         }
171         hlogSet.clear();
172       }
173     }
174   }
175 
176   /**
177    * Adds a normal source per registered peer cluster and tries to process all
178    * old region server hlog queues
179    */
180   public void init() throws IOException {
181     for (String id : this.zkHelper.getPeerClusters().keySet()) {
182       addSource(id);
183     }
184     List<String> currentReplicators = this.zkHelper.getListOfReplicators();
185     if (currentReplicators == null || currentReplicators.size() == 0) {
186       return;
187     }
188     synchronized (otherRegionServers) {
189       refreshOtherRegionServersList();
190       LOG.info("Current list of replicators: " + currentReplicators
191           + " other RSs: " + otherRegionServers);
192     }
193     // Look if there's anything to process after a restart
194     for (String rs : currentReplicators) {
195       synchronized (otherRegionServers) {
196         if (!this.otherRegionServers.contains(rs)) {
197           transferQueues(rs);
198         }
199       }
200     }
201   }
202 
203   /**
204    * Add a new normal source to this region server
205    * @param id the id of the peer cluster
206    * @return the source that was created
207    * @throws IOException
208    */
209   public ReplicationSourceInterface addSource(String id) throws IOException {
210     ReplicationSourceInterface src =
211         getReplicationSource(this.conf, this.fs, this, stopper, replicating, id);
212     synchronized (this.hlogsById) {
213       this.sources.add(src);
214       this.hlogsById.put(id, new TreeSet<String>());
215       // Add the latest hlog to that source's queue
216       if (this.latestPath != null) {
217         String name = this.latestPath.getName();
218         this.hlogsById.get(id).add(name);
219         try {
220           this.zkHelper.addLogToList(name, src.getPeerClusterZnode());
221         } catch (KeeperException ke) {
222           String message = "Cannot add log to zk for" +
223             " replication when creating a new source";
224           stopper.stop(message);
225           throw new IOException(message, ke);
226         }
227         src.enqueueLog(this.latestPath);
228       }
229     }
230     src.startup();
231     return src;
232   }
233 
234   /**
235    * Terminate the replication on this region server
236    */
237   public void join() {
238     this.executor.shutdown();
239     if (this.sources.size() == 0) {
240       this.zkHelper.deleteOwnRSZNode();
241     }
242     for (ReplicationSourceInterface source : this.sources) {
243       source.terminate("Region server is closing");
244     }
245   }
246 
247   /**
248    * Get a copy of the hlogs of the first source on this rs
249    * @return a sorted set of hlog names
250    */
251   protected Map<String, SortedSet<String>> getHLogs() {
252     return Collections.unmodifiableMap(hlogsById);
253   }
254 
255   /**
256    * Get a list of all the normal sources of this rs
257    * @return lis of all sources
258    */
259   public List<ReplicationSourceInterface> getSources() {
260     return this.sources;
261   }
262 
263   void preLogRoll(Path newLog) throws IOException {
264     if (!this.replicating.get()) {
265       LOG.warn("Replication stopped, won't add new log");
266       return;
267     }
268 
269     synchronized (this.hlogsById) {
270       String name = newLog.getName();
271       for (ReplicationSourceInterface source : this.sources) {
272         try {
273           this.zkHelper.addLogToList(name, source.getPeerClusterZnode());
274         } catch (KeeperException ke) {
275           throw new IOException("Cannot add log to zk for replication", ke);
276         }
277       }
278       for (SortedSet<String> hlogs : this.hlogsById.values()) {
279         if (this.sources.isEmpty()) {
280           // If there's no slaves, don't need to keep the old hlogs since
281           // we only consider the last one when a new slave comes in
282           hlogs.clear();
283         }
284         hlogs.add(name);
285       }
286     }
287 
288     this.latestPath = newLog;
289   }
290 
291   void postLogRoll(Path newLog) throws IOException {
292     if (!this.replicating.get()) {
293       LOG.warn("Replication stopped, won't add new log");
294       return;
295     }
296 
297     // This only updates the sources we own, not the recovered ones
298     for (ReplicationSourceInterface source : this.sources) {
299       source.enqueueLog(newLog);    
300     }
301   }
302 
303   /**
304    * Get the ZK help of this manager
305    * @return the helper
306    */
307   public ReplicationZookeeper getRepZkWrapper() {
308     return zkHelper;
309   }
310 
311   /**
312    * Factory method to create a replication source
313    * @param conf the configuration to use
314    * @param fs the file system to use
315    * @param manager the manager to use
316    * @param stopper the stopper object for this region server
317    * @param replicating the status of the replication on this cluster
318    * @param peerId the id of the peer cluster
319    * @return the created source
320    * @throws IOException
321    */
322   public ReplicationSourceInterface getReplicationSource(
323       final Configuration conf,
324       final FileSystem fs,
325       final ReplicationSourceManager manager,
326       final Stoppable stopper,
327       final AtomicBoolean replicating,
328       final String peerId) throws IOException {
329     ReplicationSourceInterface src;
330     try {
331       @SuppressWarnings("rawtypes")
332       Class c = Class.forName(conf.get("replication.replicationsource.implementation",
333           ReplicationSource.class.getCanonicalName()));
334       src = (ReplicationSourceInterface) c.newInstance();
335     } catch (Exception e) {
336       LOG.warn("Passed replication source implementation throws errors, " +
337           "defaulting to ReplicationSource", e);
338       src = new ReplicationSource();
339 
340     }
341     src.init(conf, fs, manager, stopper, replicating, peerId);
342     return src;
343   }
344 
345   /**
346    * Transfer all the queues of the specified to this region server.
347    * First it tries to grab a lock and if it works it will move the
348    * znodes and finally will delete the old znodes.
349    *
350    * It creates one old source for any type of source of the old rs.
351    * @param rsZnode
352    */
353   public void transferQueues(String rsZnode) {
354     NodeFailoverWorker transfer = new NodeFailoverWorker(rsZnode);
355     try {
356       this.executor.execute(transfer);
357     } catch (RejectedExecutionException ex) {
358       LOG.info("Cancelling the transfer of " + rsZnode +
359           " because of " + ex.getMessage());
360     }
361   }
362 
363   /**
364    * Clear the references to the specified old source
365    * @param src source to clear
366    */
367   public void closeRecoveredQueue(ReplicationSourceInterface src) {
368     LOG.info("Done with the recovered queue " + src.getPeerClusterZnode());
369     this.oldsources.remove(src);
370     this.zkHelper.deleteSource(src.getPeerClusterZnode(), false);
371   }
372 
373   /**
374    * Thie method first deletes all the recovered sources for the specified
375    * id, then deletes the normal source (deleting all related data in ZK).
376    * @param id The id of the peer cluster
377    */
378   public void removePeer(String id) {
379     LOG.info("Closing the following queue " + id + ", currently have "
380         + sources.size() + " and another "
381         + oldsources.size() + " that were recovered");
382     String terminateMessage = "Replication stream was removed by a user";
383     ReplicationSourceInterface srcToRemove = null;
384     List<ReplicationSourceInterface> oldSourcesToDelete =
385         new ArrayList<ReplicationSourceInterface>();
386     // First close all the recovered sources for this peer
387     for (ReplicationSourceInterface src : oldsources) {
388       if (id.equals(src.getPeerClusterId())) {
389         oldSourcesToDelete.add(src);
390       }
391     }
392     for (ReplicationSourceInterface src : oldSourcesToDelete) {
393       src.terminate(terminateMessage);
394       closeRecoveredQueue((src));
395     }
396     LOG.info("Number of deleted recovered sources for " + id + ": "
397         + oldSourcesToDelete.size());
398     // Now look for the one on this cluster
399     for (ReplicationSourceInterface src : this.sources) {
400       if (id.equals(src.getPeerClusterId())) {
401         srcToRemove = src;
402         break;
403       }
404     }
405     if (srcToRemove == null) {
406       LOG.error("The queue we wanted to close is missing " + id);
407       return;
408     }
409     srcToRemove.terminate(terminateMessage);
410     this.sources.remove(srcToRemove);
411     this.zkHelper.deleteSource(id, true);
412   }
413 
414   /**
415    * Reads the list of region servers from ZK and atomically clears our
416    * local view of it and replaces it with the updated list.
417    * 
418    * @return true if the local list of the other region servers was updated
419    * with the ZK data (even if it was empty),
420    * false if the data was missing in ZK
421    */
422   private boolean refreshOtherRegionServersList() {
423     List<String> newRsList = zkHelper.getRegisteredRegionServers();
424     if (newRsList == null) {
425       return false;
426     } else {
427       synchronized (otherRegionServers) {
428         otherRegionServers.clear();
429         otherRegionServers.addAll(newRsList);
430       }
431     }
432     return true;
433   }
434 
435   /**
436    * Watcher used to be notified of the other region server's death
437    * in the local cluster. It initiates the process to transfer the queues
438    * if it is able to grab the lock.
439    */
440   public class OtherRegionServerWatcher extends ZooKeeperListener {
441 
442     /**
443      * Construct a ZooKeeper event listener.
444      */
445     public OtherRegionServerWatcher(ZooKeeperWatcher watcher) {
446       super(watcher);
447     }
448 
449     /**
450      * Called when a new node has been created.
451      * @param path full path of the new node
452      */
453     public void nodeCreated(String path) {
454       refreshListIfRightPath(path);
455     }
456 
457     /**
458      * Called when a node has been deleted
459      * @param path full path of the deleted node
460      */
461     public void nodeDeleted(String path) {
462       if (stopper.isStopped()) {
463         return;
464       }
465       boolean cont = refreshListIfRightPath(path);
466       if (!cont) {
467         return;
468       }
469       LOG.info(path + " znode expired, trying to lock it");
470       transferQueues(ReplicationZookeeper.getZNodeName(path));
471     }
472 
473     /**
474      * Called when an existing node has a child node added or removed.
475      * @param path full path of the node whose children have changed
476      */
477     public void nodeChildrenChanged(String path) {
478       if (stopper.isStopped()) {
479         return;
480       }
481       refreshListIfRightPath(path);
482     }
483 
484     private boolean refreshListIfRightPath(String path) {
485       if (!path.startsWith(zkHelper.getZookeeperWatcher().rsZNode)) {
486         return false;
487       }
488       return refreshOtherRegionServersList();
489     }
490   }
491 
492   /**
493    * Watcher used to follow the creation and deletion of peer clusters.
494    */
495   public class PeersWatcher extends ZooKeeperListener {
496 
497     /**
498      * Construct a ZooKeeper event listener.
499      */
500     public PeersWatcher(ZooKeeperWatcher watcher) {
501       super(watcher);
502     }
503 
504     /**
505      * Called when a node has been deleted
506      * @param path full path of the deleted node
507      */
508     public void nodeDeleted(String path) {
509       List<String> peers = refreshPeersList(path);
510       if (peers == null) {
511         return;
512       }
513       if (zkHelper.isPeerPath(path)) {
514         String id = ReplicationZookeeper.getZNodeName(path);
515         removePeer(id);
516       }
517     }
518 
519     /**
520      * Called when an existing node has a child node added or removed.
521      * @param path full path of the node whose children have changed
522      */
523     public void nodeChildrenChanged(String path) {
524       List<String> peers = refreshPeersList(path);
525       if (peers == null) {
526         return;
527       }
528       for (String id : peers) {
529         try {
530           boolean added = zkHelper.connectToPeer(id);
531           if (added) {
532             addSource(id);
533           }
534         } catch (IOException e) {
535           // TODO manage better than that ?
536           LOG.error("Error while adding a new peer", e);
537         } catch (KeeperException e) {
538           LOG.error("Error while adding a new peer", e);
539         }
540       }
541     }
542 
543     /**
544      * Verify if this event is meant for us, and if so then get the latest
545      * peers' list from ZK. Also reset the watches.
546      * @param path path to check against
547      * @return A list of peers' identifiers if the event concerns this watcher,
548      * else null.
549      */
550     private List<String> refreshPeersList(String path) {
551       if (!path.startsWith(zkHelper.getPeersZNode())) {
552         return null;
553       }
554       return zkHelper.listPeersIdsAndWatch();
555     }
556   }
557 
558   /**
559    * Class responsible to setup new ReplicationSources to take care of the
560    * queues from dead region servers.
561    */
562   class NodeFailoverWorker extends Thread {
563 
564     private String rsZnode;
565 
566     /**
567      *
568      * @param rsZnode
569      */
570     public NodeFailoverWorker(String rsZnode) {
571       super("Failover-for-"+rsZnode);
572       this.rsZnode = rsZnode;
573     }
574 
575     @Override
576     public void run() {
577       // Wait a bit before transferring the queues, we may be shutting down.
578       // This sleep may not be enough in some cases.
579       try {
580         Thread.sleep(sleepBeforeFailover + (long) (rand.nextFloat() * sleepBeforeFailover));
581       } catch (InterruptedException e) {
582         LOG.warn("Interrupted while waiting before transferring a queue.");
583         Thread.currentThread().interrupt();
584       }
585       // We try to lock that rs' queue directory
586       if (stopper.isStopped()) {
587         LOG.info("Not transferring queue since we are shutting down");
588         return;
589       }
590       SortedMap<String, SortedSet<String>> newQueues = null;
591 
592       // check whether there is multi support. If yes, use it.
593       if (conf.getBoolean(HConstants.ZOOKEEPER_USEMULTI, true)) {
594         LOG.info("Atomically moving " + rsZnode + "'s hlogs to my queue");
595         newQueues = zkHelper.copyQueuesFromRSUsingMulti(rsZnode);
596       } else {
597         LOG.info("Moving " + rsZnode + "'s hlogs to my queue");
598         if (!zkHelper.lockOtherRS(rsZnode)) {
599           return;
600         }
601         newQueues = zkHelper.copyQueuesFromRS(rsZnode);
602         zkHelper.deleteRsQueues(rsZnode);
603       }
604       // process of copying over the failed queue is completed.
605       if (newQueues.isEmpty()) {
606         return;
607       }
608 
609       for (Map.Entry<String, SortedSet<String>> entry : newQueues.entrySet()) {
610         String peerId = entry.getKey();
611         try {
612           ReplicationSourceInterface src = getReplicationSource(conf,
613               fs, ReplicationSourceManager.this, stopper, replicating, peerId);
614           if (!zkHelper.getPeerClusters().containsKey(src.getPeerClusterId())) {
615             src.terminate("Recovered queue doesn't belong to any current peer");
616             break;
617           }
618           oldsources.add(src);
619           for (String hlog : entry.getValue()) {
620             src.enqueueLog(new Path(oldLogDir, hlog));
621           }
622           src.startup();
623         } catch (IOException e) {
624           // TODO manage it
625           LOG.error("Failed creating a source", e);
626         }
627       }
628     }
629   }
630 
631   /**
632    * Get the directory where hlogs are archived
633    * @return the directory where hlogs are archived
634    */
635   public Path getOldLogDir() {
636     return this.oldLogDir;
637   }
638 
639   /**
640    * Get the directory where hlogs are stored by their RSs
641    * @return the directory where hlogs are stored by their RSs
642    */
643   public Path getLogDir() {
644     return this.logDir;
645   }
646 
647   /**
648    * Get the handle on the local file system
649    * @return Handle on the local file system
650    */
651   public FileSystem getFs() {
652     return this.fs;
653   }
654 
655   /**
656    * Get a string representation of all the sources' metrics
657    */
658   public String getStats() {
659     StringBuffer stats = new StringBuffer();
660     for (ReplicationSourceInterface source : sources) {
661       stats.append("Normal source for cluster " + source.getPeerClusterId() + ": ");
662       stats.append(source.getStats() + "\n");
663     }
664     for (ReplicationSourceInterface oldSource : oldsources) {
665       stats.append("Recovered source for cluster/machine(s) " + oldSource.getPeerClusterId() + ": ");
666       stats.append(oldSource.getStats()+ "\n");
667     }
668     return stats.toString();
669   }
670 }