View Javadoc

1   /*
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package org.apache.hadoop.hbase.replication.regionserver;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.apache.hadoop.conf.Configuration;
26  import org.apache.hadoop.fs.FileSystem;
27  import org.apache.hadoop.fs.Path;
28  import org.apache.hadoop.hbase.regionserver.wal.LogActionsListener;
29  import org.apache.hadoop.hbase.replication.ReplicationZookeeperWrapper;
30  import org.apache.zookeeper.WatchedEvent;
31  import org.apache.zookeeper.Watcher;
32  
33  import java.io.IOException;
34  import java.util.ArrayList;
35  import java.util.List;
36  import java.util.Map;
37  import java.util.SortedMap;
38  import java.util.SortedSet;
39  import java.util.TreeSet;
40  import java.util.concurrent.atomic.AtomicBoolean;
41  
42  /**
43   * This class is responsible to manage all the replication
44   * sources. There are two classes of sources:
45   * <li> Normal sources are persistent and one per peer cluster</li>
46   * <li> Old sources are recovered from a failed region server and our
47   * only goal is to finish replicating the HLog queue it had up in ZK</li>
48   *
49   * When a region server dies, this class uses a watcher to get notified and it
50   * tries to grab a lock in order to transfer all the queues in a local
51   * old source.
52   */
53  public class ReplicationSourceManager implements LogActionsListener {
54  
55    private static final Log LOG =
56        LogFactory.getLog(ReplicationSourceManager.class);
57    // List of all the sources that read this RS's logs
58    private final List<ReplicationSourceInterface> sources;
59    // List of all the sources we got from died RSs
60    private final List<ReplicationSourceInterface> oldsources;
61    // Indicates if we are currently replicating
62    private final AtomicBoolean replicating;
63    // Helper for zookeeper
64    private final ReplicationZookeeperWrapper zkHelper;
65    // Indicates if the region server is closing
66    private final AtomicBoolean stopper;
67    // All logs we are currently trackign
68    private final SortedSet<String> hlogs;
69    private final Configuration conf;
70    private final FileSystem fs;
71    // The path to the latest log we saw, for new coming sources
72    private Path latestPath;
73    // List of all the other region servers in this cluster
74    private final List<String> otherRegionServers;
75    // Path to the hlogs directories
76    private final Path logDir;
77    // Path to the hlog archive
78    private final Path oldLogDir;
79  
80    /**
81     * Creates a replication manager and sets the watch on all the other
82     * registered region servers
83     * @param zkHelper the zk helper for replication
84     * @param conf the configuration to use
85     * @param stopper the stopper object for this region server
86     * @param fs the file system to use
87     * @param replicating the status of the replication on this cluster
88     * @param logDir the directory that contains all hlog directories of live RSs
89     * @param oldLogDir the directory where old logs are archived
90     */
91    public ReplicationSourceManager(final ReplicationZookeeperWrapper zkHelper,
92                                    final Configuration conf,
93                                    final AtomicBoolean stopper,
94                                    final FileSystem fs,
95                                    final AtomicBoolean replicating,
96                                    final Path logDir,
97                                    final Path oldLogDir) {
98      this.sources = new ArrayList<ReplicationSourceInterface>();
99      this.replicating = replicating;
100     this.zkHelper = zkHelper;
101     this.stopper = stopper;
102     this.hlogs = new TreeSet<String>();
103     this.oldsources = new ArrayList<ReplicationSourceInterface>();
104     this.conf = conf;
105     this.fs = fs;
106     this.logDir = logDir;
107     this.oldLogDir = oldLogDir;
108     List<String> otherRSs =
109         this.zkHelper.getRegisteredRegionServers(new OtherRegionServerWatcher());
110     this.otherRegionServers = otherRSs == null ? new ArrayList<String>() : otherRSs;
111   }
112 
113   /**
114    * Provide the id of the peer and a log key and this method will figure which
115    * hlog it belongs to and will log, for this region server, the current
116    * position. It will also clean old logs from the queue.
117    * @param log Path to the log currently being replicated from
118    * replication status in zookeeper. It will also delete older entries.
119    * @param id id of the peer cluster
120    * @param position current location in the log
121    * @param queueRecovered indicates if this queue comes from another region server
122    */
123   public void logPositionAndCleanOldLogs(Path log, String id, long position, boolean queueRecovered) {
124     String key = log.getName();
125     LOG.info("Going to report log #" + key + " for position " + position + " in " + log);
126     this.zkHelper.writeReplicationStatus(key.toString(), id, position);
127     synchronized (this.hlogs) {
128       if (!queueRecovered && this.hlogs.first() != key) {
129         SortedSet<String> hlogSet = this.hlogs.headSet(key);
130         LOG.info("Removing " + hlogSet.size() +
131             " logs in the list: " + hlogSet);
132         for (String hlog : hlogSet) {
133           this.zkHelper.removeLogFromList(hlog.toString(), id);
134         }
135         hlogSet.clear();
136       }
137     }
138   }
139 
140   /**
141    * Adds a normal source per registered peer cluster and tries to process all
142    * old region server hlog queues
143    */
144   public void init() throws IOException {
145     for (String id : this.zkHelper.getPeerClusters().keySet()) {
146       ReplicationSourceInterface src = addSource(id);
147       src.startup();
148     }
149     List<String> currentReplicators = this.zkHelper.getListOfReplicators(null);
150     synchronized (otherRegionServers) {
151       LOG.info("Current list of replicators: " + currentReplicators
152           + " other RSs: " + otherRegionServers);
153     }
154     // Look if there's anything to process after a restart
155     for (String rs : currentReplicators) {
156       synchronized (otherRegionServers) {
157         if (!this.otherRegionServers.contains(rs)) {
158           transferQueues(rs);
159         }
160       }
161     }
162   }
163 
164   /**
165    * Add a new normal source to this region server
166    * @param id the id of the peer cluster
167    * @return the created source
168    * @throws IOException
169    */
170   public ReplicationSourceInterface addSource(String id) throws IOException {
171     ReplicationSourceInterface src =
172         getReplicationSource(this.conf, this.fs, this, stopper, replicating, id);
173     this.sources.add(src);
174     synchronized (this.hlogs) {
175       if (this.hlogs.size() > 0) {
176         this.zkHelper.addLogToList(this.hlogs.first(),
177             this.sources.get(0).getPeerClusterZnode());
178         src.enqueueLog(this.latestPath);
179       }
180     }
181     return src;
182   }
183 
184   /**
185    * Terminate the replication on this region server
186    */
187   public void join() {
188     if (this.sources.size() == 0) {
189       this.zkHelper.deleteOwnRSZNode();
190     }
191     for (ReplicationSourceInterface source : this.sources) {
192       source.terminate();
193     }
194   }
195 
196   /**
197    * Get a copy of the hlogs of the first source on this rs
198    * @return a sorted set of hlog names
199    */
200   protected SortedSet<String> getHLogs() {
201     return new TreeSet(this.hlogs);
202   }
203 
204   /**
205    * Get a list of all the normal sources of this rs
206    * @return lis of all sources
207    */
208   public List<ReplicationSourceInterface> getSources() {
209     return this.sources;
210   }
211 
212   @Override
213   public void logRolled(Path newLog) {
214     if (this.sources.size() > 0) {
215       this.zkHelper.addLogToList(newLog.getName(),
216           this.sources.get(0).getPeerClusterZnode());
217     }
218     synchronized (this.hlogs) {
219       this.hlogs.add(newLog.getName());
220     }
221     this.latestPath = newLog;
222     // This only update the sources we own, not the recovered ones
223     for (ReplicationSourceInterface source : this.sources) {
224       source.enqueueLog(newLog);
225     }
226   }
227 
228   /**
229    * Get the ZK help of this manager
230    * @return the helper
231    */
232   public ReplicationZookeeperWrapper getRepZkWrapper() {
233     return zkHelper;
234   }
235 
236   /**
237    * Factory method to create a replication source
238    * @param conf the configuration to use
239    * @param fs the file system to use
240    * @param manager the manager to use
241    * @param stopper the stopper object for this region server
242    * @param replicating the status of the replication on this cluster
243    * @param peerClusterId the id of the peer cluster
244    * @return the created source
245    * @throws IOException
246    */
247   public ReplicationSourceInterface getReplicationSource(
248       final Configuration conf,
249       final FileSystem fs,
250       final ReplicationSourceManager manager,
251       final AtomicBoolean stopper,
252       final AtomicBoolean replicating,
253       final String peerClusterId) throws IOException {
254     ReplicationSourceInterface src;
255     try {
256       Class c = Class.forName(conf.get("replication.replicationsource.implementation",
257           ReplicationSource.class.getCanonicalName()));
258       src = (ReplicationSourceInterface) c.newInstance();
259     } catch (Exception e) {
260       LOG.warn("Passed replication source implemention throws errors, " +
261           "defaulting to ReplicationSource", e);
262       src = new ReplicationSource();
263 
264     }
265     src.init(conf, fs, manager, stopper, replicating, peerClusterId);
266     return src;
267   }
268 
269   /**
270    * Transfer all the queues of the specified to this region server.
271    * First it tries to grab a lock and if it works it will move the
272    * znodes and finally will delete the old znodes.
273    *
274    * It creates one old source for any type of source of the old rs.
275    * @param rsZnode
276    */
277   public void transferQueues(String rsZnode) {
278     // We try to lock that rs' queue directory
279     if (this.stopper.get()) {
280       LOG.info("Not transferring queue since we are shutting down");
281       return;
282     }
283     if (!this.zkHelper.lockOtherRS(rsZnode)) {
284       return;
285     }
286     LOG.info("Moving " + rsZnode + "'s hlogs to my queue");
287     SortedMap<String, SortedSet<String>> newQueues =
288         this.zkHelper.copyQueuesFromRS(rsZnode);
289     if (newQueues == null || newQueues.size() == 0) {
290       return;
291     }
292     this.zkHelper.deleteRsQueues(rsZnode);
293 
294     for (Map.Entry<String, SortedSet<String>> entry : newQueues.entrySet()) {
295       String peerId = entry.getKey();
296       try {
297         ReplicationSourceInterface src = getReplicationSource(this.conf,
298             this.fs, this, this.stopper, this.replicating, peerId);
299         this.oldsources.add(src);
300         for (String hlog : entry.getValue()) {
301           src.enqueueLog(new Path(this.oldLogDir, hlog));
302         }
303         src.startup();
304       } catch (IOException e) {
305         // TODO manage it
306         LOG.error("Failed creating a source", e);
307       }
308     }
309   }
310 
311   /**
312    * Clear the references to the specified old source
313    * @param src source to clear
314    */
315   public void closeRecoveredQueue(ReplicationSourceInterface src) {
316     LOG.info("Done with the recovered queue " + src.getPeerClusterZnode());
317     this.oldsources.remove(src);
318     this.zkHelper.deleteSource(src.getPeerClusterZnode());
319   }
320 
321   /**
322    * Watcher used to be notified of the other region server's death
323    * in the local cluster. It initiates the process to transfer the queues
324    * if it is able to grab the lock.
325    */
326   public class OtherRegionServerWatcher implements Watcher {
327     @Override
328     public void process(WatchedEvent watchedEvent) {
329       LOG.info(" event " + watchedEvent);
330       if (watchedEvent.getType().equals(Event.KeeperState.Expired) ||
331           watchedEvent.getType().equals(Event.KeeperState.Disconnected)) {
332         return;
333       }
334 
335       List<String> newRsList = (zkHelper.getRegisteredRegionServers(this));
336       if (newRsList == null) {
337         return;
338       } else {
339         synchronized (otherRegionServers) {
340           otherRegionServers.clear();
341           otherRegionServers.addAll(newRsList);
342         }
343       }
344       if (watchedEvent.getType().equals(Event.EventType.NodeDeleted)) {
345         LOG.info(watchedEvent.getPath() + " znode expired, trying to lock it");
346         String[] rsZnodeParts = watchedEvent.getPath().split("/");
347         transferQueues(rsZnodeParts[rsZnodeParts.length-1]);
348       }
349     }
350   }
351 
352   /**
353    * Get the directory where hlogs are archived
354    * @return the directory where hlogs are archived
355    */
356   public Path getOldLogDir() {
357     return this.oldLogDir;
358   }
359 
360   /**
361    * Get the directory where hlogs are stored by their RSs
362    * @return the directory where hlogs are stored by their RSs
363    */
364   public Path getLogDir() {
365     return this.logDir;
366   }
367 
368   /**
369    * Get the handle on the local file system
370    * @returnthe handle on the local file system
371    */
372   public FileSystem getFs() {
373     return this.fs;
374   }
375 
376 }