/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.client;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.protobuf.generated.Tracing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.cloudera.htrace.Span;
import org.cloudera.htrace.Trace;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

/**
 * This class allows a continuous flow of requests. It's written to be compatible with a
 * synchronous caller such as HTable.
 * <p>
 * The caller sends a buffer of operations by calling submit. This class extracts from this
 * list the operations it can send, i.e. the operations that are on regions that are not
 * considered busy. The process is asynchronous, i.e. it returns immediately once it has
 * finished iterating over the list. If, and only if, the maximum number of concurrent tasks
 * is reached, the call to submit will block.
 * </p>
 * <p>
 * The class manages retries internally.
 * </p>
 * <p>
 * The class includes an error marker: it allows the caller to know whether an operation has
 * failed, and to get the exception details, i.e. the full list of throwables for each
 * attempt. This marker is here to help backward compatibility in HTable. In most (new)
 * cases, errors should be managed by the callbacks.
 * </p>
 * <p>
 * A callback is available, in order to:
 * <ul>
 * <li>Get the result of the operation (failure or success).</li>
 * <li>When an operation fails but could be retried, decide whether to retry.</li>
 * <li>When an operation fails for good (can't be retried or already retried the maximum
 * number of times), decide whether to register the error.</li>
 * </ul>
 * </p>
 * <p>
 * This class is not thread safe externally; only one thread should submit operations at a
 * time. Internally, the class is thread safe enough to manage simultaneously new submissions
 * and results arising from older operations.
 * </p>
 * <p>
 * Internally, this class works with {@link Row}, which means it could theoretically be used
 * for gets as well.
 * </p>
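 *
 * <p>
 * A minimal usage sketch (illustrative, not from the original source; it assumes the
 * connection, pool, callback, configuration and caller-factory objects already exist):
 * <pre>
 *   AsyncProcess&lt;Object&gt; ap = new AsyncProcess&lt;Object&gt;(hConnection, tableName,
 *       pool, callback, conf, rpcCallerFactory);
 *   ap.submit(puts, false);   // returns immediately unless the task limit is reached
 *   ap.waitUntilDone();       // blocks until all in-flight tasks have finished
 *   if (ap.hasError()) {
 *     throw ap.getErrors();   // RetriesExhaustedWithDetailsException
 *   }
 * </pre>
 * </p>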
 */
class AsyncProcess<CResult> {
  private static final Log LOG = LogFactory.getLog(AsyncProcess.class);
  protected final HConnection hConnection;
  protected final TableName tableName;
  protected final ExecutorService pool;
  protected final AsyncProcessCallback<CResult> callback;
  protected final BatchErrors errors = new BatchErrors();
  protected final BatchErrors retriedErrors = new BatchErrors();
  protected final AtomicBoolean hasError = new AtomicBoolean(false);
  protected final AtomicLong tasksSent = new AtomicLong(0);
  protected final AtomicLong tasksDone = new AtomicLong(0);
  protected final ConcurrentMap<String, AtomicInteger> taskCounterPerRegion =
      new ConcurrentHashMap<String, AtomicInteger>();
  protected final int maxTotalConcurrentTasks;
  protected final int maxConcurrentTasksPerRegion;
  protected final long pause;
  protected int numTries;
  protected final boolean useServerTrackerForRetries;
  protected int serverTrackerTimeout;
  protected RpcRetryingCallerFactory rpcCallerFactory;

  /**
   * This interface allows the caller to keep the contract of the previous synchronous
   * interface, which uses an array of objects to return the results.
   * <p/>
   * This interface allows the caller to specify the behavior on errors:
   * <ul>
   * <li>If we have not yet reached the maximum number of retries, the user can nevertheless
   * specify whether this specific operation should be retried.</li>
   * <li>If an operation fails (i.e. is not retried or fails after all retries), the user can
   * specify whether we should mark this AsyncProcess as being in error.</li>
   * </ul>
   */
  interface AsyncProcessCallback<CResult> {

    /**
     * Called on success. originalIndex holds the index in the action list.
     */
    void success(int originalIndex, byte[] region, Row row, CResult result);

    /**
     * Called on failure, if we don't retry (i.e. called once per failed operation).
     *
     * @return true if we should store the error and tag this async process as being in
     *         error. false if the failure of this operation can be safely ignored and does
     *         not require the current process to be stopped; the other operations in the
     *         queue still proceed.
     */
    boolean failure(int originalIndex, byte[] region, Row row, Throwable t);

    /**
     * Called on a failure we plan to retry. This allows the user to stop retrying. Will be
     * called multiple times for a single action if it fails multiple times.
     *
     * @return true if we should retry, false otherwise.
     */
    boolean retriableFailure(int originalIndex, Row row, byte[] region, Throwable exception);
  }
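
  // A minimal callback sketch (illustrative only): collect results into an array indexed
  // by originalIndex, matching the old synchronous batch contract. The 'results' array is
  // an assumption of this example, not part of the original source.
  //
  //   final Object[] results = new Object[rows.size()];
  //   AsyncProcessCallback<Object> cb = new AsyncProcessCallback<Object>() {
  //     public void success(int i, byte[] region, Row row, Object result) { results[i] = result; }
  //     public boolean failure(int i, byte[] region, Row row, Throwable t) { return true; }
  //     public boolean retriableFailure(int i, Row row, byte[] region, Throwable t) { return true; }
  //   };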

  private static class BatchErrors {
    private List<Throwable> throwables = new ArrayList<Throwable>();
    private List<Row> actions = new ArrayList<Row>();
    private List<String> addresses = new ArrayList<String>();

    public void add(Throwable ex, Row row, HRegionLocation location) {
      throwables.add(ex);
      actions.add(row);
      addresses.add(location != null ? location.getHostnamePort() : "null location");
    }

    private RetriesExhaustedWithDetailsException makeException() {
      return new RetriesExhaustedWithDetailsException(
          new ArrayList<Throwable>(throwables),
          new ArrayList<Row>(actions), new ArrayList<String>(addresses));
    }

    public void clear() {
      throwables.clear();
      actions.clear();
      addresses.clear();
    }
  }

  public AsyncProcess(HConnection hc, TableName tableName, ExecutorService pool,
      AsyncProcessCallback<CResult> callback, Configuration conf,
      RpcRetryingCallerFactory rpcCaller) {
    this.hConnection = hc;
    this.tableName = tableName;
    this.pool = pool;
    this.callback = callback;

    this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
        HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
    this.numTries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
        HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);

    this.maxTotalConcurrentTasks = conf.getInt("hbase.client.max.total.tasks", 200);

    // With one, we ensure that the ordering of the queries is respected: we don't start
    //  a set of operations on a region before the previous one is done. As well, this limits
    //  the pressure we put on the region server.
    this.maxConcurrentTasksPerRegion = conf.getInt("hbase.client.max.perregion.tasks", 1);

    this.useServerTrackerForRetries =
        conf.getBoolean(HConnectionManager.RETRIES_BY_SERVER_KEY, true);

    if (this.useServerTrackerForRetries) {
      // The server tracker allows us to do faster, and yet useful (hopefully), retries.
      // However, if we are too useful, we might fail very quickly due to the retry count limit.
      // To avoid this, we are going to cheat for now (see HBASE-7659), and calculate the maximum
      // retry time if normal retries were used. Then we will retry until this time runs out.
      // If we keep hitting one server, the net effect will be the incremental backoff, and
      // essentially the same number of retries as planned. If we have to do faster retries,
      // we will do more retries in aggregate, but the user will be none the wiser.
      this.serverTrackerTimeout = 0;
      for (int i = 0; i < this.numTries; ++i) {
        serverTrackerTimeout += ConnectionUtils.getPauseTime(this.pause, i);
      }
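      // Worked example (illustrative, assuming the stock HBase defaults of pause = 100 ms
      // and backoff multipliers 1, 2, 3, 5, 10, 20, 40, 100, ...): each term above is
      // pause * multiplier[i], so the sum equals the total sleep budget that the normal
      // retry schedule would have been allowed.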
    }

    this.rpcCallerFactory = rpcCaller;
  }

  /**
   * Extract from the rows list what we can submit. The rows we cannot submit are kept in
   * the list.
   *
   * @param rows - the submitted rows. Modified by the method: we remove the rows we took.
   * @param atLeastOne true if we should submit at least a subset.
   */
  public void submit(List<? extends Row> rows, boolean atLeastOne) throws InterruptedIOException {
    if (rows.isEmpty()) {
      return;
    }

    Map<HRegionLocation, MultiAction<Row>> actionsByServer =
        new HashMap<HRegionLocation, MultiAction<Row>>();
    List<Action<Row>> retainedActions = new ArrayList<Action<Row>>(rows.size());

    do {
      Map<String, Boolean> regionIncluded = new HashMap<String, Boolean>();
      long currentTaskNumber = waitForMaximumCurrentTasks(maxTotalConcurrentTasks);
      int posInList = -1;
      Iterator<? extends Row> it = rows.iterator();
      while (it.hasNext()) {
        Row r = it.next();
        HRegionLocation loc = findDestLocation(r, 1, posInList, false, regionIncluded);

        if (loc != null) {   // loc is null if the dest is too busy or there is an error
          Action<Row> action = new Action<Row>(r, ++posInList);
          retainedActions.add(action);
          addAction(loc, action, actionsByServer);
          it.remove();
        }
      }

      if (retainedActions.isEmpty() && atLeastOne && !hasError()) {
        waitForNextTaskDone(currentTaskNumber);
      }

    } while (retainedActions.isEmpty() && atLeastOne && !hasError());

    HConnectionManager.ServerErrorTracker errorsByServer = createServerErrorTracker();
    sendMultiAction(retainedActions, actionsByServer, 1, errorsByServer);
  }
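
  // Usage note (illustrative): submit(rows, false) only sends what fits under the
  // per-region and total task limits and leaves the rest in 'rows'; submitAll(rows),
  // below, bypasses the load checks and queues everything, relying on retries.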

  /**
   * Group the actions per region server.
   *
   * @param loc - the destination. Must not be null.
   * @param action - the action to add to the multiaction
   * @param actionsByServer the multiaction per server
   */
  private void addAction(HRegionLocation loc, Action<Row> action, Map<HRegionLocation,
      MultiAction<Row>> actionsByServer) {
    final byte[] regionName = loc.getRegionInfo().getRegionName();
    MultiAction<Row> multiAction = actionsByServer.get(loc);
    if (multiAction == null) {
      multiAction = new MultiAction<Row>();
      actionsByServer.put(loc, multiAction);
    }

    multiAction.add(regionName, action);
  }

  /**
   * Find the destination, if this destination is not considered as busy.
   *
   * @param row          the row
   * @param numAttempt   the attempt number
   * @param posInList    the position in the list
   * @param force        if we must submit whatever the server load
   * @param regionStatus a per-call cache of whether each region can accept new operations
   * @return null if we should not submit, the destination otherwise.
   */
  private HRegionLocation findDestLocation(Row row, int numAttempt,
                                           int posInList, boolean force,
                                           Map<String, Boolean> regionStatus) {
    HRegionLocation loc = null;
    IOException locationException = null;
    try {
      loc = hConnection.locateRegion(this.tableName, row.getRow());
      if (loc == null) {
        locationException = new IOException("No location found, aborting submit for" +
            " tableName=" + tableName +
            " rowkey=" + Arrays.toString(row.getRow()));
      }
    } catch (IOException e) {
      locationException = e;
    }
    if (locationException != null) {
      // There are multiple retries in locateRegion already. No need to add new.
      // We can't continue with this row, hence it's the last retry.
      manageError(numAttempt, posInList, row, false, locationException, null);
      return null;
    }

    if (force) {
      return loc;
    }

    String regionName = loc.getRegionInfo().getEncodedName();
    Boolean addIt = regionStatus.get(regionName);
    if (addIt == null) {
      addIt = canTakeNewOperations(regionName);
      regionStatus.put(regionName, addIt);
    }

    return addIt ? loc : null;
  }

  /**
   * Check if we can send new operations to this region.
   *
   * @param encodedRegionName region name
   * @return true if this region can accept new operations, i.e. it is not considered busy.
   */
  protected boolean canTakeNewOperations(String encodedRegionName) {
    AtomicInteger ct = taskCounterPerRegion.get(encodedRegionName);
    return ct == null || ct.get() < maxConcurrentTasksPerRegion;
  }

  /**
   * Submit immediately the list of rows, whatever the server status. Kept for backward
   * compatibility: it allows the batch interface, which returns an array of objects, to be
   * used.
   *
   * @param rows the list of rows.
   */
  public void submitAll(List<? extends Row> rows) {
    List<Action<Row>> actions = new ArrayList<Action<Row>>(rows.size());

    // The position will be used by the processBatch to match the object array returned.
    int posInList = -1;
    for (Row r : rows) {
      posInList++;
      Action<Row> action = new Action<Row>(r, posInList);
      actions.add(action);
    }
    HConnectionManager.ServerErrorTracker errorsByServer = createServerErrorTracker();
    submit(actions, actions, 1, true, errorsByServer);
  }

  /**
   * Group a list of actions per region server, and send them. The created MultiActions are
   * added to the inProgress list.
   *
   * @param initialActions - the full list of the actions in progress
   * @param currentActions - the list of rows to submit
   * @param numAttempt - the current numAttempt (first attempt is 1)
   * @param force - true if we submit the rowList without taking into account the server load
   */
  private void submit(List<Action<Row>> initialActions,
                      List<Action<Row>> currentActions, int numAttempt, boolean force,
                      final HConnectionManager.ServerErrorTracker errorsByServer) {
    // group per location => region server
    final Map<HRegionLocation, MultiAction<Row>> actionsByServer =
        new HashMap<HRegionLocation, MultiAction<Row>>();

    // We have the same policy for a single region per call to submit: we don't want
    //  to send half of the actions because the status changed in the middle. So we keep
    //  the status in a map.
    Map<String, Boolean> regionIncluded = new HashMap<String, Boolean>();

    for (Action<Row> action : currentActions) {
      HRegionLocation loc = findDestLocation(
          action.getAction(), 1, action.getOriginalIndex(), force, regionIncluded);

      if (loc != null) {
        addAction(loc, action, actionsByServer);
      }
    }

    if (!actionsByServer.isEmpty()) {
      sendMultiAction(initialActions, actionsByServer, numAttempt, errorsByServer);
    }
  }

  /**
   * Send a multi action structure to the servers, after a delay depending on the attempt
   * number. Asynchronous.
   *
   * @param initialActions  the list of the actions, flat.
   * @param actionsByServer the actions grouped by region server
   * @param numAttempt      the attempt number.
   */
  public void sendMultiAction(final List<Action<Row>> initialActions,
                              Map<HRegionLocation, MultiAction<Row>> actionsByServer,
                              final int numAttempt,
                              final HConnectionManager.ServerErrorTracker errorsByServer) {

    // Send the queries and add them to the inProgress list
    for (Map.Entry<HRegionLocation, MultiAction<Row>> e : actionsByServer.entrySet()) {
      final HRegionLocation loc = e.getKey();
      final MultiAction<Row> multi = e.getValue();
      final String regionName = loc.getRegionInfo().getEncodedName();

      incTaskCounters(regionName);

      Runnable runnable = Trace.wrap("AsyncProcess.sendMultiAction", new Runnable() {
        @Override
        public void run() {
          MultiResponse res;
          try {
            MultiServerCallable<Row> callable = createCallable(loc, multi);
            try {
              res = createCaller(callable).callWithoutRetries(callable);
            } catch (IOException e) {
              LOG.warn("The call to the RS failed, we don't know where we stand. location="
                  + loc, e);
              resubmitAll(initialActions, multi, loc, numAttempt + 1, e, errorsByServer);
              return;
            }

            receiveMultiAction(initialActions, multi, loc, res, numAttempt, errorsByServer);
          } finally {
            decTaskCounters(regionName);
          }
        }
      });

      try {
        this.pool.submit(runnable);
      } catch (RejectedExecutionException ree) {
        // This should never happen. But as the pool is provided by the end user, let's secure
        //  this a little.
        decTaskCounters(regionName);
        LOG.warn("The task was rejected by the pool. This is unexpected. " +
            "location=" + loc, ree);
        // We're likely to fail again, but this will increment the attempt counter, so it will
        //  finish.
        resubmitAll(initialActions, multi, loc, numAttempt + 1, ree, errorsByServer);
      }
    }
  }

  /**
   * Create a callable. Isolated to be easily overridden in the tests.
   */
  protected MultiServerCallable<Row> createCallable(final HRegionLocation location,
      final MultiAction<Row> multi) {
    return new MultiServerCallable<Row>(hConnection, tableName, location, multi);
  }

  /**
   * Create the RPC caller. Isolated to be easily overridden in the tests.
   *
   * @param callable the callable; unused here, but kept in the signature so tests can
   *                 override this method.
   * @return a caller.
   */
  protected RpcRetryingCaller<MultiResponse> createCaller(MultiServerCallable<Row> callable) {
    // callable is unused.
    return rpcCallerFactory.<MultiResponse> newCaller();
  }

  /**
   * Check whether we can retry, and act accordingly: log, set the error status, call the
   * callbacks.
   *
   * @param numAttempt    the number of this attempt
   * @param originalIndex the position in the list sent
   * @param row           the row
   * @param canRetry      if false, we won't retry whatever the settings.
   * @param throwable     the throwable, if any (can be null)
   * @param location      the location, if any (can be null)
   * @return true if the action can be retried, false otherwise.
   */
  private boolean manageError(int numAttempt, int originalIndex, Row row, boolean canRetry,
                              Throwable throwable, HRegionLocation location) {
    if (canRetry) {
      if (numAttempt >= numTries || throwable instanceof DoNotRetryIOException) {
        canRetry = false;
      }
    }
    byte[] region = location == null ? null : location.getRegionInfo().getEncodedNameAsBytes();

    if (canRetry && callback != null) {
      canRetry = callback.retriableFailure(originalIndex, row, region, throwable);
    }

    if (canRetry) {
      if (LOG.isTraceEnabled()) {
        retriedErrors.add(throwable, row, location);
      }
    } else {
      if (callback != null) {
        callback.failure(originalIndex, region, row, throwable);
      }
      this.hasError.set(true);
      errors.add(throwable, row, location);
    }

    return canRetry;
  }

  /**
   * Resubmit all the actions from this multiaction after a failure.
   *
   * @param initialActions the full initial action list
   * @param rsActions  the actions still to do from the initial list
   * @param location   the destination
   * @param numAttempt the number of attempts so far
   * @param t the throwable (if any) that caused the resubmit
   */
  private void resubmitAll(List<Action<Row>> initialActions, MultiAction<Row> rsActions,
                           HRegionLocation location, int numAttempt, Throwable t,
                           HConnectionManager.ServerErrorTracker errorsByServer) {
    // Do not use the exception for updating the cache because it might be coming from
    // any of the regions in the MultiAction.
    hConnection.updateCachedLocations(tableName,
        rsActions.actions.values().iterator().next().get(0).getAction().getRow(), null, location);
    if (errorsByServer != null) { // the tracker is null when disabled in the configuration
      errorsByServer.reportServerError(location);
    }

    List<Action<Row>> toReplay = new ArrayList<Action<Row>>();
    for (List<Action<Row>> actions : rsActions.actions.values()) {
      for (Action<Row> action : actions) {
        if (manageError(numAttempt, action.getOriginalIndex(), action.getAction(),
            true, t, location)) {
          toReplay.add(action);
        }
      }
    }

    if (toReplay.isEmpty()) {
      LOG.warn("Attempt #" + numAttempt + "/" + numTries + " failed for all (" +
          initialActions.size() + ") operations on server " + location.getServerName() +
          " NOT resubmitting, tableName=" + tableName + ", location=" + location);
    } else {
      submit(initialActions, toReplay, numAttempt, true, errorsByServer);
    }
  }

  /**
   * Called when we receive the result of a server query.
   *
   * @param initialActions - the whole action list
   * @param rsActions      - the actions for this location
   * @param location       - the location
   * @param responses      - the response, if any
   * @param numAttempt     - the attempt
   */
  private void receiveMultiAction(List<Action<Row>> initialActions,
                                  MultiAction<Row> rsActions, HRegionLocation location,
                                  MultiResponse responses, int numAttempt,
                                  HConnectionManager.ServerErrorTracker errorsByServer) {

    if (responses == null) {
      LOG.info("Attempt #" + numAttempt + "/" + numTries + " failed for all operations" +
          " on server " + location.getServerName() + ", trying to resubmit," +
          " tableName=" + tableName + ", location=" + location);
      resubmitAll(initialActions, rsActions, location, numAttempt + 1, null, errorsByServer);
      return;
    }

    // Success or partial success.
    // Analyze the detailed results. We can still have individual failures to be redone.
    // Two specific throwables are managed:
    //  - DoNotRetryIOException: we continue to retry for other actions
    //  - RegionMovedException: we update the cache with the new region location

    List<Action<Row>> toReplay = new ArrayList<Action<Row>>();
    Throwable throwable = null;

    int failureCount = 0;
    boolean canRetry = true;
    for (Map.Entry<byte[], List<Pair<Integer, Object>>> resultsForRS :
        responses.getResults().entrySet()) {

      for (Pair<Integer, Object> regionResult : resultsForRS.getValue()) {
        Object result = regionResult.getSecond();

        // Failure: retry if it makes sense, otherwise update the error lists
        if (result == null || result instanceof Throwable) {
          throwable = (Throwable) result;
          Action<Row> correspondingAction = initialActions.get(regionResult.getFirst());
          Row row = correspondingAction.getAction();

          if (failureCount++ == 0) { // We're doing this once per location.
            hConnection.updateCachedLocations(this.tableName, row.getRow(), result, location);
            if (errorsByServer != null) {
              errorsByServer.reportServerError(location);
              canRetry = errorsByServer.canRetryMore();
            }
          }

          if (manageError(numAttempt, correspondingAction.getOriginalIndex(), row, canRetry,
              throwable, location)) {
            toReplay.add(correspondingAction);
          }
        } else { // success
          if (callback != null) {
            Action<Row> correspondingAction = initialActions.get(regionResult.getFirst());
            Row row = correspondingAction.getAction();
            //noinspection unchecked
            this.callback.success(correspondingAction.getOriginalIndex(),
                resultsForRS.getKey(), row, (CResult) result);
          }
        }
      }
    }

    if (!toReplay.isEmpty()) {
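      // Back off before resubmitting: with a server tracker the delay grows with the
      // per-server error count, otherwise with the attempt number.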
      long backOffTime = (errorsByServer != null ?
          errorsByServer.calculateBackoffTime(location, pause) :
          ConnectionUtils.getPauseTime(pause, numAttempt));
      if (numAttempt > 3 && LOG.isDebugEnabled()) {
        // We use this value to have some logs when we have multiple failures, but not too many
        //  logs, as errors are to be expected when regions move, split, and so on.
        LOG.debug("Attempt #" + numAttempt + "/" + numTries + " failed for " + failureCount +
            " operations on server " + location.getServerName() + ", resubmitting " +
            toReplay.size() + ", tableName=" + tableName + ", location=" +
            location + ", last exception was: " + throwable +
            " - sleeping " + backOffTime + " ms.");
      }
      try {
        Thread.sleep(backOffTime);
      } catch (InterruptedException e) {
        LOG.warn("Not sent: " + toReplay.size() +
            " operations, tableName=" + tableName + ", location=" + location, e);
        Thread.currentThread().interrupt(); // restore the interrupt status for the caller
        return;
      }

      submit(initialActions, toReplay, numAttempt + 1, true, errorsByServer);
    } else if (failureCount != 0) {
      LOG.warn("Attempt #" + numAttempt + "/" + numTries + " failed for " + failureCount +
          " operations on server " + location.getServerName() + "; NOT resubmitting," +
          " tableName=" + tableName + ", location=" + location);
    }
  }

  /**
   * Waits for another task to finish.
   * @param currentNumberOfTask - the number of tasks finished when calling the method.
   */
  protected void waitForNextTaskDone(long currentNumberOfTask) throws InterruptedIOException {
    while (currentNumberOfTask == tasksDone.get()) {
      try {
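        // Bounded wait: wake up at least every 100 ms, so a notifyAll() arriving between
        // the loop check above and the wait below cannot park this thread forever.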
        synchronized (this.tasksDone) {
          this.tasksDone.wait(100);
        }
      } catch (InterruptedException e) {
        throw new InterruptedIOException("Interrupted." +
            " currentNumberOfTask=" + currentNumberOfTask +
            ", tableName=" + tableName + ", tasksDone=" + tasksDone.get());
      }
    }
  }

  /**
   * Wait until this AsyncProcess has no more than max tasks in progress.
   */
  private long waitForMaximumCurrentTasks(int max) throws InterruptedIOException {
    long lastLog = EnvironmentEdgeManager.currentTimeMillis();
    long currentTasksDone = this.tasksDone.get();

    while ((tasksSent.get() - currentTasksDone) > max) {
      long now = EnvironmentEdgeManager.currentTimeMillis();
      if (now > lastLog + 10000) {
        lastLog = now;
        LOG.info("Waiting for the global number of running tasks to be equal to or less than "
            + max + ", tasksSent=" + tasksSent.get() + ", tasksDone=" + tasksDone.get() +
            ", currentTasksDone=" + currentTasksDone + ", tableName=" + tableName);
      }
      waitForNextTaskDone(currentTasksDone);
      currentTasksDone = this.tasksDone.get();
    }

    return currentTasksDone;
  }

  /**
   * Wait until all tasks are executed, successfully or not.
   */
  public void waitUntilDone() throws InterruptedIOException {
    waitForMaximumCurrentTasks(0);
  }

  public boolean hasError() {
    return hasError.get();
  }

  public List<? extends Row> getFailedOperations() {
    return errors.actions;
  }

  /**
   * Clean the error stacks. Should be called only when there are no actions in progress.
   */
  public void clearErrors() {
    errors.clear();
    retriedErrors.clear();
    hasError.set(false);
  }

  public RetriesExhaustedWithDetailsException getErrors() {
    return errors.makeException();
  }

  /**
   * Increment the task counters for a given region. MT safe.
   */
  protected void incTaskCounters(String encodedRegionName) {
    tasksSent.incrementAndGet();

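    // Race-safe lazy init: putIfAbsent arbitrates concurrent creators, and the second
    // get() returns whichever AtomicInteger actually ended up in the map.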
    AtomicInteger counterPerServer = taskCounterPerRegion.get(encodedRegionName);
    if (counterPerServer == null) {
      taskCounterPerRegion.putIfAbsent(encodedRegionName, new AtomicInteger());
      counterPerServer = taskCounterPerRegion.get(encodedRegionName);
    }
    counterPerServer.incrementAndGet();
  }

  /**
   * Decrements the counters for a given region.
   */
  protected void decTaskCounters(String encodedRegionName) {
    AtomicInteger counterPerServer = taskCounterPerRegion.get(encodedRegionName);
    counterPerServer.decrementAndGet();

    tasksDone.incrementAndGet();
    synchronized (tasksDone) {
      tasksDone.notifyAll();
    }
  }

  /**
   * Creates the server error tracker to use inside process.
   * Currently, to preserve the main assumption about current retries, and to work well with
   * the retry-limit-based calculation, the calculation is local per Process object.
   * We may benefit from connection-wide tracking of server errors.
   * @return the ServerErrorTracker to use, or null if server tracking is disabled.
   */
  protected HConnectionManager.ServerErrorTracker createServerErrorTracker() {
    if (useServerTrackerForRetries) {
      return new HConnectionManager.ServerErrorTracker(this.serverTrackerTimeout);
    } else {
      return null;
    }
  }
}