1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.replication.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.net.ConnectException;
25  import java.net.SocketTimeoutException;
26  import java.util.ArrayList;
27  import java.util.Arrays;
28  import java.util.Collections;
29  import java.util.Comparator;
30  import java.util.HashSet;
31  import java.util.List;
32  import java.util.NavigableMap;
33  import java.util.Random;
34  import java.util.Set;
35  import java.util.UUID;
36  import java.util.concurrent.CountDownLatch;
37  import java.util.concurrent.PriorityBlockingQueue;
38  import java.util.concurrent.TimeUnit;
39  import java.util.concurrent.atomic.AtomicBoolean;
40  
41  import org.apache.commons.logging.Log;
42  import org.apache.commons.logging.LogFactory;
43  import org.apache.hadoop.classification.InterfaceAudience;
44  import org.apache.hadoop.conf.Configuration;
45  import org.apache.hadoop.fs.FileStatus;
46  import org.apache.hadoop.fs.FileSystem;
47  import org.apache.hadoop.fs.Path;
48  import org.apache.hadoop.hbase.HConstants;
49  import org.apache.hadoop.hbase.KeyValue;
50  import org.apache.hadoop.hbase.ServerName;
51  import org.apache.hadoop.hbase.Stoppable;
52  import org.apache.hadoop.hbase.client.AdminProtocol;
53  import org.apache.hadoop.hbase.client.HConnection;
54  import org.apache.hadoop.hbase.client.HConnectionManager;
55  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
56  import org.apache.hadoop.hbase.protobuf.ReplicationProtbufUtil;
57  import org.apache.hadoop.hbase.regionserver.wal.HLog;
58  import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
59  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
60  import org.apache.hadoop.hbase.replication.ReplicationZookeeper;
61  import org.apache.hadoop.hbase.util.Bytes;
62  import org.apache.hadoop.hbase.util.Threads;
63  import org.apache.hadoop.ipc.RemoteException;
64  import org.apache.zookeeper.KeeperException;
65  
66  /**
67   * Class that handles the source of a replication stream.
68   * Currently does not handle more than one slave cluster.
69   * For each slave cluster it selects a random subset of region servers
70   * using a replication ratio. For example, if the replication ratio is 0.1
71   * and the slave cluster has 100 region servers, 10 will be selected.
72   * <p/>
73   * A stream is considered down when we cannot contact a region server on the
74   * peer cluster for more than 55 seconds by default.
75   * <p/>
76   *
77   */
78  @InterfaceAudience.Private
79  public class ReplicationSource extends Thread
80      implements ReplicationSourceInterface {
81  
82    private static final Log LOG = LogFactory.getLog(ReplicationSource.class);
83    // Queue of logs to process
84    private PriorityBlockingQueue<Path> queue;
85    // container of entries to replicate
86    private HLog.Entry[] entriesArray;
87    private HConnection conn;
88    // Helper class for zookeeper
89    private ReplicationZookeeper zkHelper;
90    private Configuration conf;
91    // ratio of region servers to choose from a slave cluster
92    private float ratio;
93    private Random random;
94    // should we replicate or not?
95    private AtomicBoolean replicating;
96    // id of the peer cluster this source replicates to
97    private String peerId;
98    // The manager of all sources to which we ping back our progress
99    private ReplicationSourceManager manager;
100   // Should we stop everything?
101   private Stoppable stopper;
102   // List of chosen sinks (region servers)
103   private List<ServerName> currentPeers;
104   // How long should we sleep for each retry
105   private long sleepForRetries;
106   // Max size in bytes of entriesArray
107   private long replicationQueueSizeCapacity;
108   // Max number of entries in entriesArray
109   private int replicationQueueNbCapacity;
110   // Our reader for the current log
111   private HLog.Reader reader;
112   // Last position in the log that we sent to ZooKeeper
113   private long lastLoggedPosition = -1;
114   // Path of the current log
115   private volatile Path currentPath;
116   private FileSystem fs;
117   // id of this cluster
118   private UUID clusterId;
119   // id of the other cluster
120   private UUID peerClusterId;
121   // total number of edits we replicated
122   private long totalReplicatedEdits = 0;
123   // The znode we currently play with
124   private String peerClusterZnode;
125   // Indicates if this queue is recovered (and will be deleted when depleted)
126   private boolean queueRecovered;
127   // List of all the dead region servers that had this queue (if recovered)
128   private List<String> deadRegionServers = new ArrayList<String>();
129   // Maximum number of retries before taking bold actions
130   private int maxRetriesMultiplier;
131   // Socket timeouts require even bolder actions since we don't want to DDOS
132   private int socketTimeoutMultiplier;
133   // Current number of entries that we need to replicate
134   private int currentNbEntries = 0;
135   // Current number of operations (Put/Delete) that we need to replicate
136   private int currentNbOperations = 0;
137   // Current size of data we need to replicate
138   private int currentSize = 0;
139   // Indicates if this particular source is running
140   private volatile boolean running = true;
141   // Metrics for this source
142   private MetricsSource metrics;
143   // Handle on the log reader helper
144   private ReplicationHLogReaderManager repLogReader;
145 
146   /**
147    * Instantiation method used by region servers
148    *
149    * @param conf configuration to use
150    * @param fs file system to use
151    * @param manager replication manager to ping to
152    * @param stopper     the stoppable used to stop the region server
153    * @param replicating the atomic boolean that starts/stops replication
154    * @param peerClusterZnode the name of our znode
155    * @throws IOException
156    */
157   public void init(final Configuration conf,
158                    final FileSystem fs,
159                    final ReplicationSourceManager manager,
160                    final Stoppable stopper,
161                    final AtomicBoolean replicating,
162                    final String peerClusterZnode)
163       throws IOException {
164     this.stopper = stopper;
165     this.conf = conf;
166     this.replicationQueueSizeCapacity =
167         this.conf.getLong("replication.source.size.capacity", 1024*1024*64);
168     this.replicationQueueNbCapacity =
169         this.conf.getInt("replication.source.nb.capacity", 25000);
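        // Pre-allocate the whole batch container up front; these Entry objects are
        // filled in and reused for every batch instead of being allocated per edit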
170     this.entriesArray = new HLog.Entry[this.replicationQueueNbCapacity];
171     for (int i = 0; i < this.replicationQueueNbCapacity; i++) {
172       this.entriesArray[i] = new HLog.Entry();
173     }
174     this.maxRetriesMultiplier =
175         this.conf.getInt("replication.source.maxretriesmultiplier", 10);
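        // With the default multiplier of 10 this is 100, giving socket timeouts a
        // much longer back-off than ordinary retries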
176     this.socketTimeoutMultiplier = maxRetriesMultiplier * maxRetriesMultiplier;
177     this.queue =
178         new PriorityBlockingQueue<Path>(
179             conf.getInt("hbase.regionserver.maxlogs", 32),
180             new LogsComparator());
181     this.conn = HConnectionManager.getConnection(conf);
182     this.zkHelper = manager.getRepZkWrapper();
183     this.ratio = this.conf.getFloat("replication.source.ratio", 0.1f);
184     this.currentPeers = new ArrayList<ServerName>();
185     this.random = new Random();
186     this.replicating = replicating;
187     this.manager = manager;
188     this.sleepForRetries =
189         this.conf.getLong("replication.source.sleepforretries", 1000);
190     this.fs = fs;
191     this.metrics = new MetricsSource(peerClusterZnode);
192     this.repLogReader = new ReplicationHLogReaderManager(this.fs, this.conf);
193     try {
194       this.clusterId = zkHelper.getUUIDForCluster(zkHelper.getZookeeperWatcher());
195     } catch (KeeperException ke) {
196       throw new IOException("Could not read cluster id", ke);
197     }
198 
199     // Finally look if this is a recovered queue
200     this.checkIfQueueRecovered(peerClusterZnode);
201   }
202 
203   // The passed znode will be either the id of the peer cluster or
204   // the handling history of that queue in the form of id-servername-*
205   //
206   // package access for testing
207   void checkIfQueueRecovered(String peerClusterZnode) {
208     String[] parts = peerClusterZnode.split("-", 2);
209     this.queueRecovered = parts.length != 1;
210     this.peerId = this.queueRecovered ?
211         parts[0] : peerClusterZnode;
212     this.peerClusterZnode = peerClusterZnode;
213 
214     if (parts.length < 2) {
215       // not queue recovered situation
216       return;
217     }
218 
219     // extract dead servers
220     extractDeadServersFromZNodeString(parts[1], this.deadRegionServers);
221   }
222   
223   /**
224    * for tests only
225    */
226   List<String> getDeadRegionServers() {
227     return Collections.unmodifiableList(this.deadRegionServers);
228   }
229 
230   /**
231    * Parse dead server names from the znode string. A server name can contain "-", as in
232    * "ip-10-46-221-101.ec2.internal", so we need to skip some "-" characters during parsing, for
233    * example: 2-ip-10-46-221-101.ec2.internal,52170,1364333181125-<server name>-...
234    */
235   private static void
236       extractDeadServersFromZNodeString(String deadServerListStr, List<String> result) {
237     
238     if (deadServerListStr == null || result == null || deadServerListStr.isEmpty()) return;
239 
240     // valid server name delimiter "-" has to be after "," in a server name
241     int seenCommaCnt = 0;
242     int startIndex = 0;
243     int len = deadServerListStr.length();
244 
245     for (int i = 0; i < len; i++) {
246       switch (deadServerListStr.charAt(i)) {
247       case ',':
248         seenCommaCnt += 1;
249         break;
250       case '-':
251         if (seenCommaCnt >= 2) {
252           if (i > startIndex) {
253             String serverName = deadServerListStr.substring(startIndex, i);
254             if (ServerName.isFullServerName(serverName)) {
255               result.add(serverName);
256             } else {
257               LOG.error("Found invalid server name: " + serverName);
258             }
259             startIndex = i + 1;
260           }
261           seenCommaCnt = 0;
262         }
263         break;
264       default:
265         break;
266       }
267     }
268 
269     // add tail
270     if (startIndex < len - 1) {
271       String serverName = deadServerListStr.substring(startIndex, len);
272       if (ServerName.isFullServerName(serverName)) {
273         result.add(serverName);
274       } else {
275         LOG.error("Found invalid server name at the end: " + serverName);
276       }
277     }
278 
279     LOG.debug("Found dead servers: " + result);
280   }
281   
282   /**
283    * Select a number of peers at random using the ratio. Minimum of 1.
284    */
285   private void chooseSinks() {
286     this.currentPeers.clear();
287     List<ServerName> addresses = this.zkHelper.getSlavesAddresses(peerId);
288     Set<ServerName> setOfAddr = new HashSet<ServerName>();
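        // ceil() keeps at least one sink whenever the peer reports any region
        // servers, e.g. a ratio of 0.1 with 5 region servers still yields 1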
289     int nbPeers = (int) (Math.ceil(addresses.size() * ratio));
290     LOG.info("Getting " + nbPeers +
291         " rs from peer cluster # " + peerId);
292     for (int i = 0; i < nbPeers; i++) {
293       ServerName sn;
294       // Make sure we get one address that we don't already have
295       do {
296         sn = addresses.get(this.random.nextInt(addresses.size()));
297       } while (setOfAddr.contains(sn));
298       LOG.info("Choosing peer " + sn);
299       setOfAddr.add(sn);
300     }
301     this.currentPeers.addAll(setOfAddr);
302   }
303 
304   @Override
305   public void enqueueLog(Path log) {
306     this.queue.put(log);
307     this.metrics.setSizeOfLogQueue(queue.size());
308   }
309 
310   @Override
311   public void run() {
312     connectToPeers();
313     // We were stopped while looping to connect to sinks, just abort
314     if (!this.isActive()) {
315       metrics.clear();
316       return;
317     }
318     int sleepMultiplier = 1;
319     // delay this until we are in an asynchronous thread
320     while (this.peerClusterId == null) {
321       this.peerClusterId = zkHelper.getPeerUUID(this.peerId);
322       if (this.peerClusterId == null) {
323         if (sleepForRetries("Cannot contact the peer's zk ensemble", sleepMultiplier)) {
324           sleepMultiplier++;
325         }
326       }
327     }
328     // resetting to 1 to reuse later
329     sleepMultiplier = 1;
330 
331     LOG.info("Replicating " + clusterId + " -> " + peerClusterId);
332 
333     // If this is recovered, the queue is already full and the first log
334     // normally has a position (unless the RS failed between 2 logs)
335     if (this.queueRecovered) {
336       try {
337         this.repLogReader.setPosition(this.zkHelper.getHLogRepPosition(
338             this.peerClusterZnode, this.queue.peek().getName()));
339       } catch (KeeperException e) {
340         this.terminate("Couldn't get the position of this recovered queue " +
341             peerClusterZnode, e);
342       }
343     }
344     // Loop until we close down
345     while (isActive()) {
346       // Sleep until replication is enabled again
347       if (!isPeerEnabled()) {
348         if (sleepForRetries("Replication is disabled", sleepMultiplier)) {
349           sleepMultiplier++;
350         }
351         continue;
352       }
353       Path oldPath = getCurrentPath(); //note that in the current scenario,
354                                        //oldPath will be null when a log roll
355                                        //happens.
356       // Get a new path
357       boolean hasCurrentPath = getNextPath();
358       if (getCurrentPath() != null && oldPath == null) {
359         sleepMultiplier = 1; //reset the sleepMultiplier on a path change
360       }
361       if (!hasCurrentPath) {
362         if (sleepForRetries("No log to process", sleepMultiplier)) {
363           sleepMultiplier++;
364         }
365         continue;
366       }
367       boolean currentWALisBeingWrittenTo = false;
368       //For WAL files we own (rather than recovered), take a snapshot of whether the
369       //current WAL file (this.currentPath) is in use (for writing) NOW!
370       //Since the new WAL paths are enqueued only after the prev WAL file
371       //is 'closed', presence of an element in the queue means that
372       //the previous WAL file was closed, else the file is in use (currentPath)
373       //We take the snapshot now so that we are protected against races
374       //where a new file gets enqueued while the current file is being processed
375       //(and where we just finished reading the current file).
376       if (!this.queueRecovered && queue.size() == 0) {
377         currentWALisBeingWrittenTo = true;
378       }
379       // Open a reader on it
380       if (!openReader(sleepMultiplier)) {
381         // Reset the sleep multiplier, else it'd be reused for the next file
382         sleepMultiplier = 1;
383         continue;
384       }
385 
386       // If we got a null reader but didn't continue, then sleep and continue
387       if (this.reader == null) {
388         if (sleepForRetries("Unable to open a reader", sleepMultiplier)) {
389           sleepMultiplier++;
390         }
391         continue;
392       }
393 
394       boolean gotIOE = false;
395       currentNbOperations = 0;
396       currentNbEntries = 0;
397       currentSize = 0;
398       try {
399         if (readAllEntriesToReplicateOrNextFile(currentWALisBeingWrittenTo)) {
400           continue;
401         }
402       } catch (IOException ioe) {
403         LOG.warn(peerClusterZnode + " Got: ", ioe);
404         gotIOE = true;
405         if (ioe.getCause() instanceof EOFException) {
406 
407           boolean considerDumping = false;
408           if (this.queueRecovered) {
409             try {
410               FileStatus stat = this.fs.getFileStatus(this.currentPath);
411               if (stat.getLen() == 0) {
412                 LOG.warn(peerClusterZnode + " Got EOF and the file was empty");
413               }
414               considerDumping = true;
415             } catch (IOException e) {
416               LOG.warn(peerClusterZnode + " Got exception while getting file size: ", e);
417             }
418           } else if (currentNbEntries != 0) {
419             LOG.warn(peerClusterZnode + " Got EOF while reading, " +
420                 "looks like this file is broken? " + currentPath);
421             considerDumping = true;
422             currentNbEntries = 0;
423           }
424 
425           if (considerDumping &&
426               sleepMultiplier == this.maxRetriesMultiplier &&
427               processEndOfFile()) {
428             continue;
429           }
430         }
431       } finally {
432         try {
433           this.reader = null;
434           this.repLogReader.closeReader();
435         } catch (IOException e) {
436           gotIOE = true;
437           LOG.warn("Unable to finalize the tailing of a file", e);
438         }
439       }
440 
441       // If we didn't get anything to replicate, or if we hit an IOE,
442       // wait a bit and retry.
443       // But if we need to stop, don't bother sleeping
444       if (this.isActive() && (gotIOE || currentNbEntries == 0)) {
445         if (this.lastLoggedPosition != this.repLogReader.getPosition()) {
446           this.manager.logPositionAndCleanOldLogs(this.currentPath,
447               this.peerClusterZnode, this.repLogReader.getPosition(),
448               queueRecovered, currentWALisBeingWrittenTo);
449           this.lastLoggedPosition = this.repLogReader.getPosition();
450         }
451         if (sleepForRetries("Nothing to replicate", sleepMultiplier)) {
452           sleepMultiplier++;
453         }
454         continue;
455       }
456       sleepMultiplier = 1;
457       shipEdits(currentWALisBeingWrittenTo);
458 
459     }
460     if (this.conn != null) {
461       try {
462         this.conn.close();
463       } catch (IOException e) {
464         LOG.debug("Attempt to close connection failed", e);
465       }
466     }
467     LOG.debug("Source exiting " + peerId);
468     metrics.clear();
469   }
470 
471   /**
472    * Read all the entries from the current log file and retain those
473    * that need to be replicated. Otherwise, process the end of the current file.
474    * @param currentWALisBeingWrittenTo is the current WAL being written to
475    * @return true if we got nothing and went to the next file, false if we got
476    * entries
477    * @throws IOException
478    */
479   protected boolean readAllEntriesToReplicateOrNextFile(boolean currentWALisBeingWrittenTo)
480       throws IOException{
481     long seenEntries = 0;
482     this.repLogReader.seek();
483     HLog.Entry entry =
484         this.repLogReader.readNextAndSetPosition(this.entriesArray, this.currentNbEntries);
485     while (entry != null) {
486       WALEdit edit = entry.getEdit();
487       this.metrics.incrLogEditsRead();
488       seenEntries++;
489       // Remove all KVs that should not be replicated
490       HLogKey logKey = entry.getKey();
491       // don't replicate if the log entries originated in the peer
492       if (!logKey.getClusterId().equals(peerClusterId)) {
493         removeNonReplicableEdits(edit);
494         // Don't replicate catalog entries, WALEdits left empty after filtering,
495         // or anything at all while replication is disabled
496         if (!(Bytes.equals(logKey.getTablename(), HConstants.ROOT_TABLE_NAME) ||
497             Bytes.equals(logKey.getTablename(), HConstants.META_TABLE_NAME)) &&
498             edit.size() != 0 && replicating.get()) {
499           // Only set the clusterId if it is a local key.
500           // This ensures that the originator sets the cluster id
501           // and all replicas retain the initial cluster id.
502           // This is *only* place where a cluster id other than the default is set.
503           if (HConstants.DEFAULT_CLUSTER_ID == logKey.getClusterId()) {
504             logKey.setClusterId(this.clusterId);
505           }
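              // Count client operations (one per distinct row), not individual KeyValues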
506           currentNbOperations += countDistinctRowKeys(edit);
507           currentNbEntries++;
508           currentSize += entry.getEdit().size();
509         } else {
510           this.metrics.incrLogEditsFiltered();
511         }
512       }
513       // Stop if too many entries or too big
514       if (currentSize >= this.replicationQueueSizeCapacity ||
515           currentNbEntries >= this.replicationQueueNbCapacity) {
516         break;
517       }
518       try {
519         entry = this.repLogReader.readNextAndSetPosition(this.entriesArray, this.currentNbEntries);
520       } catch (IOException ie) {
521         LOG.debug("Break on IOE: " + ie.getMessage());
522         break;
523       }
524     }
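        // Never switch files while the current WAL is still being written to:
        // hitting the end of a live file only means we caught up with the writer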
525     if (currentWALisBeingWrittenTo) {
526       return false;
527     }
528     // If we didn't get anything and the queue has an object, it means we
529     // hit the end of the file for sure
530     return seenEntries == 0 && processEndOfFile();
531   }
532 
533   private void connectToPeers() {
534     // Connect to peer cluster first, unless we have to stop
535     while (this.isActive() && this.currentPeers.size() == 0) {
536 
537       try {
538         chooseSinks();
539         Thread.sleep(this.sleepForRetries);
540       } catch (InterruptedException e) {
541         LOG.error("Interrupted while trying to connect to sinks", e);
542       }
543     }
544   }
545 
546   /**
547    * Poll for the next path
548    * @return true if a path was obtained, false if not
549    */
550   protected boolean getNextPath() {
551     try {
552       if (this.currentPath == null) {
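          // Block for at most sleepForRetries ms; a null result simply means no new
          // WAL has been enqueued yet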
553         this.currentPath = queue.poll(this.sleepForRetries, TimeUnit.MILLISECONDS);
554         this.metrics.setSizeOfLogQueue(queue.size());
555       }
556     } catch (InterruptedException e) {
557       LOG.warn("Interrupted while reading edits", e);
558     }
559     return this.currentPath != null;
560   }
561 
562   /**
563    * Open a reader on the current path
564    *
565    * @param sleepMultiplier by how many times the default sleeping time is augmented
566    * @return true if we should continue with that file, false if we are done with it
567    */
568   protected boolean openReader(int sleepMultiplier) {
569     try {
570       try {
571         this.reader = repLogReader.openReader(this.currentPath);
572       } catch (FileNotFoundException fnfe) {
573         if (this.queueRecovered) {
574           // We didn't find the log in the archive directory, look if it still
575           // exists in the dead RS folder (there could be a chain of failures
576           // to look at)
577           LOG.info("NB dead servers : " + deadRegionServers.size());
578           for (String curDeadServerName : deadRegionServers) {
579             Path deadRsDirectory =
580                 new Path(manager.getLogDir().getParent(), curDeadServerName);
581             Path[] locs = new Path[] {
582                 new Path(deadRsDirectory, currentPath.getName()),
583                 new Path(deadRsDirectory.suffix(HLog.SPLITTING_EXT),
584                                           currentPath.getName()),
585             };
586             for (Path possibleLogLocation : locs) {
587               LOG.info("Possible location " + possibleLogLocation.toUri().toString());
588               if (this.manager.getFs().exists(possibleLogLocation)) {
589                 // We found the right new location
590                 LOG.info("Log " + this.currentPath + " still exists at " +
591                     possibleLogLocation);
592               // Returning here leaves the reader null, which will make us sleep
593                 return true;
594               }
595             }
596           }
597           // TODO What happens if the log was missing from every single location?
598           // Although we need to check a couple of times as the log could have
599           // been moved by the master between the checks
600           // It can also happen if a recovered queue wasn't properly cleaned,
601           // such that the znode pointing to a log exists but the log was
602           // deleted a long time ago.
603           // For the moment, we'll throw the IO and processEndOfFile
604           throw new IOException("File from recovered queue is " +
605               "nowhere to be found", fnfe);
606         } else {
607           // If the log was archived, continue reading from there
608           Path archivedLogLocation =
609               new Path(manager.getOldLogDir(), currentPath.getName());
610           if (this.manager.getFs().exists(archivedLogLocation)) {
611             currentPath = archivedLogLocation;
612             LOG.info("Log " + this.currentPath + " was moved to " +
613                 archivedLogLocation);
614             // Open the log at the new location
615             this.openReader(sleepMultiplier);
616 
617           }
618           // TODO What happens if the log is missing in both places?
619         }
620       }
621     } catch (IOException ioe) {
622       LOG.warn(peerClusterZnode + " Got: ", ioe);
623       this.reader = null;
624       // TODO Need a better way to determine if a file is really gone but
625       // TODO without scanning all logs dir
626       if (sleepMultiplier == this.maxRetriesMultiplier) {
627         LOG.warn("Waited too long for this file, considering dumping");
628         return !processEndOfFile();
629       }
630     }
631     return true;
632   }
633 
634   /**
635    * Do the sleeping logic
636    * @param msg Why we sleep
637    * @param sleepMultiplier by how many times the default sleeping time is augmented
638    * @return True if <code>sleepMultiplier</code> is &lt; <code>maxRetriesMultiplier</code>
639    */
640   protected boolean sleepForRetries(String msg, int sleepMultiplier) {
641     try {
642       LOG.debug(msg + ", sleeping " + sleepForRetries + " times " + sleepMultiplier);
643       Thread.sleep(this.sleepForRetries * sleepMultiplier);
644     } catch (InterruptedException e) {
645       LOG.debug("Interrupted while sleeping between retries");
646     }
647     return sleepMultiplier < maxRetriesMultiplier;
648   }
649 
650   /**
651    * We only want KVs that are scoped other than local
652    * @param edit The WALEdit to check for replicable KVs
653    */
654   protected void removeNonReplicableEdits(WALEdit edit) {
655     NavigableMap<byte[], Integer> scopes = edit.getScopes();
656     List<KeyValue> kvs = edit.getKeyValues();
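        // Iterate backwards so removals don't shift the indexes we still have to visit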
657     for (int i = edit.size()-1; i >= 0; i--) {
658       KeyValue kv = kvs.get(i);
659       // The scope will be null or empty if
660       // there's nothing to replicate in that WALEdit
661       if (scopes == null || !scopes.containsKey(kv.getFamily())) {
662         kvs.remove(i);
663       }
664     }
665   }
666 
667   /**
668    * Count the number of different row keys in the given edit, since a single
669    * WALEdit can carry a mini-batch of rows. We assume that there's at least one KV in the WALEdit.
670    * @param edit edit to count row keys from
671    * @return number of different row keys
672    */
673   private int countDistinctRowKeys(WALEdit edit) {
674     List<KeyValue> kvs = edit.getKeyValues();
675     int distinctRowKeys = 1;
676     KeyValue lastKV = kvs.get(0);
677     for (int i = 0; i < edit.size(); i++) {
678       if (!kvs.get(i).matchingRow(lastKV)) {
679         distinctRowKeys++;
680       }
          // update the reference KV so consecutive KVs for the same row are not recounted
          lastKV = kvs.get(i);
681     }
682     return distinctRowKeys;
683   }
684 
685   /**
686    * Do the shipping logic
687    * @param currentWALisBeingWrittenTo was the current WAL being (seemingly) 
688    * written to when this method was called
689    */
690   protected void shipEdits(boolean currentWALisBeingWrittenTo) {
691     int sleepMultiplier = 1;
692     if (this.currentNbEntries == 0) {
693       LOG.warn("Was given 0 edits to ship");
694       return;
695     }
696     while (this.isActive()) {
697       if (!isPeerEnabled()) {
698         if (sleepForRetries("Replication is disabled", sleepMultiplier)) {
699           sleepMultiplier++;
700         }
701         continue;
702       }
703       try {
704         AdminProtocol rrs = getRS();
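            // Only the first currentNbEntries slots of entriesArray hold this batch's
            // edits, hence the copyOf() before shipping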
705         ReplicationProtbufUtil.replicateWALEntry(rrs,
706             Arrays.copyOf(this.entriesArray, currentNbEntries));
707         if (this.lastLoggedPosition != this.repLogReader.getPosition()) {
708           this.manager.logPositionAndCleanOldLogs(this.currentPath,
709               this.peerClusterZnode, this.repLogReader.getPosition(),
710               queueRecovered, currentWALisBeingWrittenTo);
711           this.lastLoggedPosition = this.repLogReader.getPosition();
712         }
713         this.totalReplicatedEdits += currentNbEntries;
714         this.metrics.shipBatch(this.currentNbOperations);
715         this.metrics.setAgeOfLastShippedOp(
716             this.entriesArray[currentNbEntries-1].getKey().getWriteTime());
717         break;
718 
719       } catch (IOException ioe) {
720         // Didn't ship anything, but must still age the last time we did
721         this.metrics.refreshAgeOfLastShippedOp();
722         if (ioe instanceof RemoteException) {
723           ioe = ((RemoteException) ioe).unwrapRemoteException();
724           LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);
725         } else {
726           if (ioe instanceof SocketTimeoutException) {
727             // This exception means we waited for more than 60s and nothing
728             // happened, the cluster is alive but calling it again right away,
729             // even just as a probe, would only make things worse.
730             sleepForRetries("Encountered a SocketTimeoutException. The " +
731               "call to the remote cluster timed out, which is usually " +
732               "caused by a machine failure or a massive slowdown",
733               this.socketTimeoutMultiplier);
734           } else if (ioe instanceof ConnectException) {
735             LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);
736             chooseSinks();
737           } else {
738             LOG.warn("Can't replicate because of a local or network error: ", ioe);
739           }
740         }
741 
742         try {
743           boolean down;
744           // Spin while the slave is down and we're not asked to shutdown/close
745           do {
746             down = isSlaveDown();
747             if (down) {
748               if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {
749                 sleepMultiplier++;
750               } else {
751                 chooseSinks();
752               }
753             }
754           } while (this.isActive() && down );
755         } catch (InterruptedException e) {
756           LOG.debug("Interrupted while trying to contact the peer cluster");
757         }
758       }
759     }
760   }
761 
762   /**
763    * check whether the peer is enabled or not
764    *
765    * @return true if the peer is enabled, otherwise false
766    */
767   protected boolean isPeerEnabled() {
768     return this.replicating.get() && this.zkHelper.getPeerEnabled(peerId);
769   }
770 
771   /**
772    * If the queue isn't empty, switch to the next one
773    * Else if this is a recovered queue, it means we're done!
774    * Else we'll just continue to try reading the log file
775    * @return true if we're done with the current file, false if we should
776    * continue trying to read from it
777    */
778   protected boolean processEndOfFile() {
779     if (this.queue.size() != 0) {
780       this.currentPath = null;
781       this.repLogReader.finishCurrentFile();
782       this.reader = null;
783       return true;
784     } else if (this.queueRecovered) {
785       this.manager.closeRecoveredQueue(this);
786       LOG.info("Finished recovering the queue");
787       this.running = false;
788       return true;
789     }
790     return false;
791   }
792 
793   public void startup() {
794     String n = Thread.currentThread().getName();
795     Thread.UncaughtExceptionHandler handler =
796         new Thread.UncaughtExceptionHandler() {
797           public void uncaughtException(final Thread t, final Throwable e) {
798             LOG.error("Unexpected exception in ReplicationSource," +
799               " currentPath=" + currentPath, e);
800           }
801         };
802     Threads.setDaemonThreadRunning(
803         this, n + ".replicationSource," + peerClusterZnode, handler);
804   }
805 
806   public void terminate(String reason) {
807     terminate(reason, null);
808   }
809 
810   public void terminate(String reason, Exception cause) {
811     if (cause == null) {
812       LOG.info("Closing source "
813           + this.peerClusterZnode + " because: " + reason);
814 
815     } else {
816       LOG.error("Closing source " + this.peerClusterZnode
817           + " because an error occurred: " + reason, cause);
818     }
819     this.running = false;
820     Threads.shutdown(this, this.sleepForRetries);
821   }
822 
823   /**
824    * Get a new region server at random from this peer
825    * @return the admin protocol of a randomly chosen region server sink
826    * @throws IOException
827    */
828   private AdminProtocol getRS() throws IOException {
829     if (this.currentPeers.size() == 0) {
830       throw new IOException(this.peerClusterZnode + " has 0 region servers");
831     }
832     ServerName address =
833         currentPeers.get(random.nextInt(this.currentPeers.size()));
834     return this.conn.getAdmin(address);
835   }
836 
837   /**
838    * Check if the slave is down by trying to establish a connection
839    * @return true if down, false if up
840    * @throws InterruptedException
841    */
842   public boolean isSlaveDown() throws InterruptedException {
843     final CountDownLatch latch = new CountDownLatch(1);
844     Thread pingThread = new Thread() {
845       public void run() {
846         try {
847           AdminProtocol rrs = getRS();
848           // Dummy call; it only succeeds (and trips the latch) if the slave is reachable
849           ProtobufUtil.getServerInfo(rrs);
850           latch.countDown();
851         } catch (IOException ex) {
852           if (ex instanceof RemoteException) {
853             ex = ((RemoteException) ex).unwrapRemoteException();
854           }
855           LOG.info("Slave cluster looks down: " + ex.getMessage());
856         }
857       }
858     };
859     pingThread.start();
860     // await returns true if countDown happened
861     boolean down = !latch.await(this.sleepForRetries, TimeUnit.MILLISECONDS);
862     pingThread.interrupt();
863     return down;
864   }
865 
866   public String getPeerClusterZnode() {
867     return this.peerClusterZnode;
868   }
869 
870   public String getPeerClusterId() {
871     return this.peerId;
872   }
873 
874   public Path getCurrentPath() {
875     return this.currentPath;
876   }
877 
878   private boolean isActive() {
879     return !this.stopper.isStopped() && this.running;
880   }
881 
882   /**
883    * Comparator used to compare logs together based on their start time
884    */
885   public static class LogsComparator implements Comparator<Path> {
886 
887     @Override
888     public int compare(Path o1, Path o2) {
889       return Long.valueOf(getTS(o1)).compareTo(getTS(o2));
890     }
891 
892     /**
893      * Split a path to get the start time
894      * For example: 10.20.20.171%3A60020.1277499063250
895      * @param p path to split
896      * @return start time
897      */
898     private long getTS(Path p) {
899       String[] parts = p.getName().split("\\.");
900       return Long.parseLong(parts[parts.length-1]);
901     }
902   }
903 
904   @Override
905   public String getStats() {
906     String position = "N/A";
907     try {
908       if (this.reader != null) {
909         position = this.reader.getPosition()+"";
910       }
911     } catch (IOException ioe) {
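          // Ignore; the reported position simply stays "N/A"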
912     }
913     return "Total replicated edits: " + totalReplicatedEdits +
914       ", currently replicating from: " + this.currentPath +
915       " at position: " + position;
916   }
917 }