/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp;
import org.apache.zookeeper.KeeperException;

/**
 * This class provides an implementation of the ReplicationQueues interface using ZooKeeper. The
 * base znode that this class works at is the myQueuesZnode. The myQueuesZnode contains a list of
 * all outstanding HLog files on this region server that need to be replicated. The name of the
 * myQueuesZnode is the region server name (a concatenation of the region server's hostname,
 * client port and start code). For example:
 *
 * /hbase/replication/rs/hostname.example.org,6020,1234
 *
 * Within this znode, the region server maintains a set of HLog replication queues. These queues
 * are represented by child znodes named with their respective queue ids. For example:
 *
 * /hbase/replication/rs/hostname.example.org,6020,1234/1
 * /hbase/replication/rs/hostname.example.org,6020,1234/2
 *
 * Each queue has one child znode for every HLog that still needs to be replicated. The value of
 * these HLog child znodes is the latest position that has been replicated. This position is
 * updated every time an HLog entry is replicated. For example:
 *
 * /hbase/replication/rs/hostname.example.org,6020,1234/1/23522342.23422 [VALUE: 254]
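 *
 * A minimal usage sketch (illustrative only; the zkWatcher, conf and abortable instances below
 * are assumed to be created elsewhere by the caller):
 *
 * <pre>
 * {@code
 * // Track this region server's replication queues in ZooKeeper.
 * ReplicationQueues queues = new ReplicationQueuesZKImpl(zkWatcher, conf, abortable);
 * queues.init("hostname.example.org,6020,1234");     // create the base znode for this RS
 * queues.addLog("1", "23522342.23422");              // enqueue an HLog under peer queue "1"
 * queues.setLogPosition("1", "23522342.23422", 254); // record the last replicated offset
 * long position = queues.getLogPosition("1", "23522342.23422");
 * }
 * </pre>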
 */
public class ReplicationQueuesZKImpl extends ReplicationStateZKBase implements ReplicationQueues {

  /** Znode containing all replication queues for this region server. */
  private String myQueuesZnode;
  /** Name of znode we use to lock during failover */
  private final static String RS_LOCK_ZNODE = "lock";

  private static final Log LOG = LogFactory.getLog(ReplicationQueuesZKImpl.class);

  public ReplicationQueuesZKImpl(final ZooKeeperWatcher zk, Configuration conf,
      Abortable abortable) {
    super(zk, conf, abortable);
  }

  @Override
  public void init(String serverName) throws ReplicationException {
    this.myQueuesZnode = ZKUtil.joinZNode(this.queuesZNode, serverName);
    try {
      ZKUtil.createWithParents(this.zookeeper, this.myQueuesZnode);
    } catch (KeeperException e) {
      throw new ReplicationException("Could not initialize replication queues.", e);
    }
  }

  @Override
  public void removeQueue(String queueId) {
    try {
      ZKUtil.deleteNodeRecursively(this.zookeeper, ZKUtil.joinZNode(this.myQueuesZnode, queueId));
    } catch (KeeperException e) {
      this.abortable.abort("Failed to delete queue (queueId=" + queueId + ")", e);
    }
  }

  @Override
  public void addLog(String queueId, String filename) throws ReplicationException {
    String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
    znode = ZKUtil.joinZNode(znode, filename);
    try {
      ZKUtil.createWithParents(this.zookeeper, znode);
    } catch (KeeperException e) {
      throw new ReplicationException(
          "Could not add log because znode could not be created. queueId=" + queueId
              + ", filename=" + filename, e);
    }
  }

  @Override
  public void removeLog(String queueId, String filename) {
    try {
      String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
      znode = ZKUtil.joinZNode(znode, filename);
      ZKUtil.deleteNode(this.zookeeper, znode);
    } catch (KeeperException e) {
      this.abortable.abort("Failed to remove hlog from queue (queueId=" + queueId + ", filename="
          + filename + ")", e);
    }
  }

  @Override
  public void setLogPosition(String queueId, String filename, long position) {
    try {
      String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
      znode = ZKUtil.joinZNode(znode, filename);
      // Why serialize String of Long and not Long as bytes?
      ZKUtil.setData(this.zookeeper, znode, ZKUtil.positionToByteArray(position));
    } catch (KeeperException e) {
      this.abortable.abort("Failed to write replication hlog position (filename=" + filename
          + ", position=" + position + ")", e);
    }
  }

  @Override
  public long getLogPosition(String queueId, String filename) throws ReplicationException {
    String clusterZnode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
    String znode = ZKUtil.joinZNode(clusterZnode, filename);
    byte[] bytes = null;
    try {
      bytes = ZKUtil.getData(this.zookeeper, znode);
    } catch (KeeperException e) {
      throw new ReplicationException("Internal Error: could not get position in log for queueId="
          + queueId + ", filename=" + filename, e);
    }
    try {
      return ZKUtil.parseHLogPositionFrom(bytes);
    } catch (DeserializationException de) {
      LOG.warn("Failed to parse HLog position from znode content for queueId=" + queueId
          + " and hlog=" + filename + ", continuing.");
    }
    // if we can not parse the position, start at the beginning of the hlog file again
    return 0;
  }

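  /**
   * Take ownership of the replication queues of another (presumably dead) region server. As the
   * implementation below shows, when ZooKeeper multi support is enabled the queues are moved in a
   * single atomic operation; otherwise a lock znode is taken on the other region server's queues
   * before they are copied and deleted.
   * @param regionserverZnode the znode of the region server whose queues are claimed
   * @return the claimed HLog queues sorted per peer cluster (appended with the dead server's
   *         znode); empty if nothing could be claimed
   */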
  @Override
  public SortedMap<String, SortedSet<String>> claimQueues(String regionserverZnode) {
    SortedMap<String, SortedSet<String>> newQueues = new TreeMap<String, SortedSet<String>>();
    if (ZKUtil.joinZNode(this.queuesZNode, regionserverZnode).equals(this.myQueuesZnode)) {
      LOG.warn("An attempt was made to claim our own queues on region server " + regionserverZnode);
      return newQueues;
    }
    // check whether there is multi support. If yes, use it.
    if (conf.getBoolean(HConstants.ZOOKEEPER_USEMULTI, true)) {
      LOG.info("Atomically moving " + regionserverZnode + "'s hlogs to my queue");
      newQueues = copyQueuesFromRSUsingMulti(regionserverZnode);
    } else {
      LOG.info("Moving " + regionserverZnode + "'s hlogs to my queue");
      if (!lockOtherRS(regionserverZnode)) {
        return newQueues;
      }
      newQueues = copyQueuesFromRS(regionserverZnode);
      deleteAnotherRSQueues(regionserverZnode);
    }
    return newQueues;
  }

  @Override
  public void removeAllQueues() {
    try {
      ZKUtil.deleteNodeRecursively(this.zookeeper, this.myQueuesZnode);
    } catch (KeeperException e) {
      // if the session has already expired, don't bother going further
      if (e instanceof KeeperException.SessionExpiredException) {
        return;
      }
      this.abortable.abort("Failed to delete replication queues for region server: "
          + this.myQueuesZnode, e);
    }
  }

  @Override
  public List<String> getLogsInQueue(String queueId) {
    String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
    List<String> result = null;
    try {
      result = ZKUtil.listChildrenNoWatch(this.zookeeper, znode);
    } catch (KeeperException e) {
      this.abortable.abort("Failed to get list of hlogs for queueId=" + queueId, e);
    }
    return result;
  }

  @Override
  public List<String> getAllQueues() {
    List<String> listOfQueues = null;
    try {
      listOfQueues = ZKUtil.listChildrenNoWatch(this.zookeeper, this.myQueuesZnode);
    } catch (KeeperException e) {
      this.abortable.abort("Failed to get a list of queues for region server: "
          + this.myQueuesZnode, e);
    }
    return listOfQueues;
  }

  /**
   * Try to set a lock in another region server's znode.
   * @param znode the server name of the other region server
   * @return true if the lock was acquired, false in all other cases
   */
  private boolean lockOtherRS(String znode) {
    try {
      String parent = ZKUtil.joinZNode(this.queuesZNode, znode);
      if (parent.equals(this.myQueuesZnode)) {
        LOG.warn("Won't lock because this is us, we're dead!");
        return false;
      }
      String p = ZKUtil.joinZNode(parent, RS_LOCK_ZNODE);
      ZKUtil.createAndWatch(this.zookeeper, p, lockToByteArray(this.myQueuesZnode));
    } catch (KeeperException e) {
      // This exception will pop up if the znode under which we're trying to
      // create the lock is already deleted by another region server, meaning
      // that the transfer already occurred.
      // NoNode => transfer is done and znodes are already deleted
      // NodeExists => lock znode already created by another RS
      if (e instanceof KeeperException.NoNodeException
          || e instanceof KeeperException.NodeExistsException) {
        LOG.info("Won't transfer the queue, another RS took care of it because of: "
            + e.getMessage());
      } else {
        LOG.info("Failed to lock the other region server", e);
      }
      return false;
    }
    return true;
  }

  /**
   * Delete all the replication queues for a given region server.
   * @param regionserverZnode the znode of the region server whose queues are deleted
   */
  private void deleteAnotherRSQueues(String regionserverZnode) {
    String fullpath = ZKUtil.joinZNode(this.queuesZNode, regionserverZnode);
    try {
      List<String> clusters = ZKUtil.listChildrenNoWatch(this.zookeeper, fullpath);
      for (String cluster : clusters) {
        // No need to delete the lock znode here, it will be deleted later with its parent.
        if (cluster.equals(RS_LOCK_ZNODE)) {
          continue;
        }
        String fullClusterPath = ZKUtil.joinZNode(fullpath, cluster);
        ZKUtil.deleteNodeRecursively(this.zookeeper, fullClusterPath);
      }
      // Finish cleaning up
      ZKUtil.deleteNodeRecursively(this.zookeeper, fullpath);
    } catch (KeeperException e) {
      if (e instanceof KeeperException.NoNodeException
          || e instanceof KeeperException.NotEmptyException) {
        // This handles a special case where another region server was able to
        // create a lock just after we deleted it, but then was also able to
        // delete the RS znode before us or its lock znode is still there.
        if (e.getPath().equals(fullpath)) {
          return;
        }
      }
      this.abortable.abort("Failed to delete replication queues for region server: "
          + regionserverZnode, e);
    }
  }

  /**
   * It "atomically" copies all the HLog queues from another region server and returns them all
   * sorted per peer cluster (appended with the dead server's znode).
   * @param znode the znode of the region server to copy the queues from
   * @return HLog queues sorted per peer cluster
   */
  private SortedMap<String, SortedSet<String>> copyQueuesFromRSUsingMulti(String znode) {
    SortedMap<String, SortedSet<String>> queues = new TreeMap<String, SortedSet<String>>();
    // hbase/replication/rs/deadrs
    String deadRSZnodePath = ZKUtil.joinZNode(this.queuesZNode, znode);
    List<String> peerIdsToProcess = null;
    List<ZKUtilOp> listOfOps = new ArrayList<ZKUtil.ZKUtilOp>();
    try {
      peerIdsToProcess = ZKUtil.listChildrenNoWatch(this.zookeeper, deadRSZnodePath);
      if (peerIdsToProcess == null) return queues; // node already processed
      for (String peerId : peerIdsToProcess) {
        ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(peerId);
        if (!peerExists(replicationQueueInfo.getPeerId())) {
          LOG.warn("Peer " + peerId + " didn't exist, skipping the replay");
          // Protection against moving orphaned queues
          continue;
        }
        String newPeerId = peerId + "-" + znode;
        String newPeerZnode = ZKUtil.joinZNode(this.myQueuesZnode, newPeerId);
        // check the logs queue for the old peer cluster
        String oldClusterZnode = ZKUtil.joinZNode(deadRSZnodePath, peerId);
        List<String> hlogs = ZKUtil.listChildrenNoWatch(this.zookeeper, oldClusterZnode);
        if (hlogs == null || hlogs.size() == 0) {
          listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
          continue; // empty log queue.
        }
        // create the new cluster znode
        SortedSet<String> logQueue = new TreeSet<String>();
        queues.put(newPeerId, logQueue);
        ZKUtilOp op = ZKUtilOp.createAndFailSilent(newPeerZnode, HConstants.EMPTY_BYTE_ARRAY);
        listOfOps.add(op);
        // get the offset of the logs and set it to new znodes
        for (String hlog : hlogs) {
          String oldHlogZnode = ZKUtil.joinZNode(oldClusterZnode, hlog);
          byte[] logOffset = ZKUtil.getData(this.zookeeper, oldHlogZnode);
          LOG.debug("Creating " + hlog + " with data " + Bytes.toString(logOffset));
          String newLogZnode = ZKUtil.joinZNode(newPeerZnode, hlog);
          listOfOps.add(ZKUtilOp.createAndFailSilent(newLogZnode, logOffset));
          // add ops for deleting
          listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldHlogZnode));
          logQueue.add(hlog);
        }
        // add delete op for peer
        listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
      }
      // add delete op for dead rs
      listOfOps.add(ZKUtilOp.deleteNodeFailSilent(deadRSZnodePath));
      LOG.debug("The multi list size is: " + listOfOps.size());
      ZKUtil.multiOrSequential(this.zookeeper, listOfOps, false);
      LOG.info("Atomically moved the dead regionserver logs.");
    } catch (KeeperException e) {
      // Multi call failed; it looks like some other regionserver took away the logs.
      LOG.warn("Got exception in copyQueuesFromRSUsingMulti: ", e);
      queues.clear();
    }
    return queues;
  }

  /**
   * This method copies all the HLog queues from another region server and returns them all
   * sorted per peer cluster (appended with the dead server's znode).
   * @param znode the znode of the region server to copy the queues from
   * @return all hlogs for all peers of that region server; may be incomplete if an error occurred
   */
  private SortedMap<String, SortedSet<String>> copyQueuesFromRS(String znode) {
    // TODO this method isn't atomic enough, we could start copying and then
    // TODO fail for some reason and we would end up with znodes we don't want.
    SortedMap<String, SortedSet<String>> queues = new TreeMap<String, SortedSet<String>>();
    try {
      String nodePath = ZKUtil.joinZNode(this.queuesZNode, znode);
      List<String> clusters = ZKUtil.listChildrenNoWatch(this.zookeeper, nodePath);
      // We have a lock znode in there, it will count as one.
      if (clusters == null || clusters.size() <= 1) {
        return queues;
      }
      // The lock isn't a peer cluster, remove it
      clusters.remove(RS_LOCK_ZNODE);
      for (String cluster : clusters) {
        ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(cluster);
        if (!peerExists(replicationQueueInfo.getPeerId())) {
          LOG.warn("Peer " + cluster + " didn't exist, skipping the replay");
          // Protection against moving orphaned queues
          continue;
        }
        // We add the name of the recovered RS to the new znode, we can even
        // do that for queues that were recovered 10 times giving a znode like
        // number-startcode-number-otherstartcode-number-anotherstartcode-etc
        String newCluster = cluster + "-" + znode;
        String newClusterZnode = ZKUtil.joinZNode(this.myQueuesZnode, newCluster);
        String clusterPath = ZKUtil.joinZNode(nodePath, cluster);
        List<String> hlogs = ZKUtil.listChildrenNoWatch(this.zookeeper, clusterPath);
        // That region server didn't have anything to replicate for this cluster
        if (hlogs == null || hlogs.size() == 0) {
          continue;
        }
        ZKUtil.createNodeIfNotExistsAndWatch(this.zookeeper, newClusterZnode,
          HConstants.EMPTY_BYTE_ARRAY);
        SortedSet<String> logQueue = new TreeSet<String>();
        queues.put(newCluster, logQueue);
        for (String hlog : hlogs) {
          String z = ZKUtil.joinZNode(clusterPath, hlog);
          byte[] positionBytes = ZKUtil.getData(this.zookeeper, z);
          long position = 0;
          try {
            position = ZKUtil.parseHLogPositionFrom(positionBytes);
          } catch (DeserializationException e) {
            LOG.warn("Failed to parse hlog position from the following znode: " + z
                + ", Exception: " + e);
          }
          LOG.debug("Creating " + hlog + " with data " + position);
          String child = ZKUtil.joinZNode(newClusterZnode, hlog);
          // Position doesn't actually change, we are just deserializing it for
          // logging, so just use the already serialized version
          ZKUtil.createAndWatch(this.zookeeper, child, positionBytes);
          logQueue.add(hlog);
        }
      }
    } catch (KeeperException e) {
      this.abortable.abort("Copy queues from rs", e);
    }
    return queues;
  }

  /**
   * @param lockOwner the znode of the region server taking the lock
   * @return Serialized protobuf of <code>lockOwner</code> with pb magic prefix prepended, suitable
   *         for use as the content of a replication lock during region server failover.
   */
  static byte[] lockToByteArray(final String lockOwner) {
    byte[] bytes =
        ZooKeeperProtos.ReplicationLock.newBuilder().setLockOwner(lockOwner).build().toByteArray();
    return ProtobufUtil.prependPBMagic(bytes);
  }
}